From 03f318ca652889a1aa407e7088b9a2f6a14ae374 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 8 Jun 2018 12:28:47 -0400 Subject: [PATCH 01/27] nfsd4: extend reclaim period for reclaiming clients If the client is only renewing state a little sooner than once a lease period, then it might not discover the server has restarted till close to the end of the grace period, and might run out of time to do the actual reclaim. Extend the grace period by a second each time we notice there are clients still trying to reclaim, up to a limit of another whole lease period. Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 1 + fs/nfsd/nfs4proc.c | 4 ++++ fs/nfsd/nfs4state.c | 31 ++++++++++++++++++++++++++++++- fs/nfsd/nfsctl.c | 1 + 4 files changed, 36 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 36358d435cb0..426f55005697 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -102,6 +102,7 @@ struct nfsd_net { time_t nfsd4_lease; time_t nfsd4_grace; + bool somebody_reclaimed; bool nfsd_net_up; bool lockd_up; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 5d99e8810b85..1929f85b8269 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -354,6 +354,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct svc_fh *resfh = NULL; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); + bool reclaim = false; dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", (int)open->op_fname.len, open->op_fname.data, @@ -424,6 +425,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + reclaim = true; case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: status = do_open_fhandle(rqstp, cstate, open); @@ -452,6 +454,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, WARN(status && open->op_created, "nfsd4_process_open2 failed to open newly-created file! status=%u\n", be32_to_cpu(status)); + if (reclaim && !status) + nn->somebody_reclaimed = true; out: if (resfh && resfh != &cstate->current_fh) { fh_dup2(&cstate->current_fh, resfh); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 857141446d6b..9b2ce80abee0 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4693,6 +4693,28 @@ nfsd4_end_grace(struct nfsd_net *nn) */ } +/* + * If we've waited a lease period but there are still clients trying to + * reclaim, wait a little longer to give them a chance to finish. + */ +static bool clients_still_reclaiming(struct nfsd_net *nn) +{ + unsigned long now = get_seconds(); + unsigned long double_grace_period_end = nn->boot_time + + 2 * nn->nfsd4_lease; + + if (!nn->somebody_reclaimed) + return false; + nn->somebody_reclaimed = false; + /* + * If we've given them *two* lease times to reclaim, and they're + * still not done, give up: + */ + if (time_after(now, double_grace_period_end)) + return false; + return true; +} + static time_t nfs4_laundromat(struct nfsd_net *nn) { @@ -4706,6 +4728,11 @@ nfs4_laundromat(struct nfsd_net *nn) time_t t, new_timeo = nn->nfsd4_lease; dprintk("NFSD: laundromat service - starting\n"); + + if (clients_still_reclaiming(nn)) { + new_timeo = 0; + goto out; + } nfsd4_end_grace(nn); INIT_LIST_HEAD(&reaplist); spin_lock(&nn->client_lock); @@ -4803,7 +4830,7 @@ nfs4_laundromat(struct nfsd_net *nn) posix_unblock_lock(&nbl->nbl_lock); free_blocked_lock(nbl); } - +out: new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); return new_timeo; } @@ -6053,6 +6080,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, case 0: /* success! */ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid); status = 0; + if (lock->lk_reclaim) + nn->somebody_reclaimed = true; break; case FILE_LOCK_DEFERRED: nbl = NULL; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index d107b4426f7e..5f22476cf371 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1239,6 +1239,7 @@ static __net_init int nfsd_init_net(struct net *net) goto out_idmap_error; nn->nfsd4_lease = 90; /* default lease time */ nn->nfsd4_grace = 90; + nn->somebody_reclaimed = false; nn->clverifier_counter = prandom_u32(); nn->clientid_counter = prandom_u32(); From d6ebf5088f09472c1136cd506bdc27034a6763f8 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 8 Jun 2018 12:33:17 -0400 Subject: [PATCH 02/27] nfsd4: return default lease period I don't have a good rationale for the lease period, but 90 seconds seems long, and as long as we're allowing the server to extend the grace period up to double the lease period, let's half the default to 45. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 5f22476cf371..ec8a214a5089 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1237,8 +1237,8 @@ static __net_init int nfsd_init_net(struct net *net) retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; - nn->nfsd4_lease = 90; /* default lease time */ - nn->nfsd4_grace = 90; + nn->nfsd4_lease = 45; /* default lease time */ + nn->nfsd4_grace = 45; nn->somebody_reclaimed = false; nn->clverifier_counter = prandom_u32(); nn->clientid_counter = prandom_u32(); From 16945141c3567bb8561de3677de1ec657675bb15 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 25 Apr 2018 14:34:11 -0400 Subject: [PATCH 03/27] nfsd: fix NFSv4 time_delta attribute Currently we return the worst-case value of 1 second in the time delta attribute. That's not terribly useful. Instead, return a value calculated from the time granularity supported by the filesystem and the system clock. Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index a96843c59fc1..4161031ae14e 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2006,6 +2006,31 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode, return p; } +/* + * ctime (in NFSv4, time_metadata) is not writeable, and the client + * doesn't really care what resolution could theoretically be stored by + * the filesystem. + * + * The client cares how close together changes can be while still + * guaranteeing ctime changes. For most filesystems (which have + * timestamps with nanosecond fields) that is limited by the resolution + * of the time returned from current_time() (which I'm assuming to be + * 1/HZ). + */ +static __be32 *encode_time_delta(__be32 *p, struct inode *inode) +{ + struct timespec ts; + u32 ns; + + ns = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran); + ts = ns_to_timespec(ns); + + p = xdr_encode_hyper(p, ts.tv_sec); + *p++ = cpu_to_be32(ts.tv_nsec); + + return p; +} + static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c) { *p++ = cpu_to_be32(c->atomic); @@ -2797,9 +2822,7 @@ out_acl: p = xdr_reserve_space(xdr, 12); if (!p) goto out_resource; - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(1); - *p++ = cpu_to_be32(0); + p = encode_time_delta(p, d_inode(dentry)); } if (bmval1 & FATTR4_WORD1_TIME_METADATA) { p = xdr_reserve_space(xdr, 12); From a85857633b04d57f4524cca0a2bfaf87b2543f9f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 25 Apr 2018 13:26:23 -0400 Subject: [PATCH 04/27] nfsd4: support change_attr_type attribute The change attribute is what is used by clients to revalidate their caches. Our server may use i_version or ctime for that purpose. Those choices behave slightly differently, and it may be useful to the client to know which we're using. This attribute tells the client that. The Linux client doesn't yet use this attribute yet, though. Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 10 ++++++++++ fs/nfsd/nfsd.h | 1 + include/linux/nfs4.h | 8 ++++++++ 3 files changed, 19 insertions(+) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 4161031ae14e..fb4991889f89 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2891,6 +2891,16 @@ out_acl: goto out; } + if (bmval2 & FATTR4_WORD2_CHANGE_ATTR_TYPE) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + if (IS_I_VERSION(d_inode(dentry))) + *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR); + else + *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_TIME_METADATA); + } + if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { status = nfsd4_encode_security_label(xdr, rqstp, context, contextlen); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 3fce905d0365..066899929863 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -360,6 +360,7 @@ void nfsd_lockd_shutdown(void); #define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ + FATTR4_WORD2_CHANGE_ATTR_TYPE | \ FATTR4_WORD2_MODE_UMASK | \ NFSD4_2_SECURITY_ATTRS) diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 57ffaa20d564..0877ed333733 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -374,6 +374,13 @@ enum lock_type4 { NFS4_WRITEW_LT = 4 }; +enum change_attr_type4 { + NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR = 0, + NFS4_CHANGE_TYPE_IS_VERSION_COUNTER = 1, + NFS4_CHANGE_TYPE_IS_VERSION_COUNTER_NOPNFS = 2, + NFS4_CHANGE_TYPE_IS_TIME_METADATA = 3, + NFS4_CHANGE_TYPE_IS_UNDEFINED = 4 +}; /* Mandatory Attributes */ #define FATTR4_WORD0_SUPPORTED_ATTRS (1UL << 0) @@ -441,6 +448,7 @@ enum lock_type4 { #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) #define FATTR4_WORD2_CLONE_BLKSIZE (1UL << 13) +#define FATTR4_WORD2_CHANGE_ATTR_TYPE (1UL << 15) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) #define FATTR4_WORD2_MODE_UMASK (1UL << 17) From 665d507276004823e07596997d49d2705b4598a3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 11 Apr 2018 16:53:36 -0400 Subject: [PATCH 05/27] nfsd4: less confusing nfsd4_compound_in_session Make the function prototype match the name a little better. Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9b2ce80abee0..cfd0ac5c1c5b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2956,11 +2956,11 @@ out_no_session: return status; } -static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) +static bool nfsd4_compound_in_session(struct nfsd4_compound_state *cstate, struct nfs4_sessionid *sid) { - if (!session) + if (!cstate->session) return false; - return !memcmp(sid, &session->se_sessionid, sizeof(*sid)); + return !memcmp(sid, &cstate->session->se_sessionid, sizeof(*sid)); } __be32 @@ -2975,7 +2975,7 @@ nfsd4_destroy_session(struct svc_rqst *r, struct nfsd4_compound_state *cstate, struct nfsd_net *nn = net_generic(net, nfsd_net_id); status = nfserr_not_only_op; - if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { + if (nfsd4_compound_in_session(cstate, &sessionid->sessionid)) { if (!nfsd4_last_compound_op(r)) goto out; ref_held_by_me++; From ca0552f4641436a1dd5c49906690ee39ab5d43a3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 11 Apr 2018 17:01:53 -0400 Subject: [PATCH 06/27] nfsd4: cleanup sessionid in nfsd4_destroy_session The name of this variable doesn't fit the type. And we only ever use one field of it. Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index cfd0ac5c1c5b..4f13dcb42ab1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2967,7 +2967,7 @@ __be32 nfsd4_destroy_session(struct svc_rqst *r, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { - struct nfsd4_destroy_session *sessionid = &u->destroy_session; + struct nfs4_sessionid *sessionid = &u->destroy_session.sessionid; struct nfsd4_session *ses; __be32 status; int ref_held_by_me = 0; @@ -2975,14 +2975,14 @@ nfsd4_destroy_session(struct svc_rqst *r, struct nfsd4_compound_state *cstate, struct nfsd_net *nn = net_generic(net, nfsd_net_id); status = nfserr_not_only_op; - if (nfsd4_compound_in_session(cstate, &sessionid->sessionid)) { + if (nfsd4_compound_in_session(cstate, sessionid)) { if (!nfsd4_last_compound_op(r)) goto out; ref_held_by_me++; } - dump_sessionid(__func__, &sessionid->sessionid); + dump_sessionid(__func__, sessionid); spin_lock(&nn->client_lock); - ses = find_in_sessionid_hashtbl(&sessionid->sessionid, net, &status); + ses = find_in_sessionid_hashtbl(sessionid, net, &status); if (!ses) goto out_client_lock; status = nfserr_wrong_cred; From 4a269efb6f8199b58aa77a1a40404877974feb26 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 13 Apr 2018 17:34:35 -0400 Subject: [PATCH 07/27] nfsd: update obselete comment referencing the BKL It's inode->i_lock that's now taken in setlease and break_lease, instead of the big kernel lock. Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4f13dcb42ab1..19c4d29917ee 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3945,9 +3945,9 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) /* * We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease - * callback (and since the lease code is serialized by the kernel - * lock) we know the server hasn't removed the lease yet, we know - * it's safe to take a reference. + * callback (and since the lease code is serialized by the + * i_lock) we know the server hasn't removed the lease yet, and + * we know it's safe to take a reference. */ refcount_inc(&dp->dl_stid.sc_count); nfsd4_run_cb(&dp->dl_recall); From 7a04cfda7dfa00173ab165577254ede83f81bf01 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 13 Jun 2018 15:04:59 -0400 Subject: [PATCH 08/27] nfsd: clarify check_op_ordering Document a couple things that confused me on a recent reading. Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 1929f85b8269..140b05c8a4be 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1603,7 +1603,7 @@ static const char *nfsd4_op_name(unsigned opnum); */ static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args) { - struct nfsd4_op *op = &args->ops[0]; + struct nfsd4_op *first_op = &args->ops[0]; /* These ordering requirements don't apply to NFSv4.0: */ if (args->minorversion == 0) @@ -1611,12 +1611,17 @@ static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args) /* This is weird, but OK, not our problem: */ if (args->opcnt == 0) return nfs_ok; - if (op->status == nfserr_op_illegal) + if (first_op->status == nfserr_op_illegal) return nfs_ok; - if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP)) + if (!(nfsd4_ops[first_op->opnum].op_flags & ALLOWED_AS_FIRST_OP)) return nfserr_op_not_in_session; - if (op->opnum == OP_SEQUENCE) + if (first_op->opnum == OP_SEQUENCE) return nfs_ok; + /* + * So first_op is something allowed outside a session, like + * EXCHANGE_ID; but then it has to be the only op in the + * compound: + */ if (args->opcnt != 1) return nfserr_not_only_op; return nfs_ok; From 5b7b15aee641904ae269be9846610a3950cbd64c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 13 Jun 2018 15:21:35 -0400 Subject: [PATCH 09/27] nfsd: fix corrupted reply to badly ordered compound We're encoding a single op in the reply but leaving the number of ops zero, so the reply makes no sense. Somewhat academic as this isn't a case any real client will hit, though in theory perhaps that could change in a future protocol extension. Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 140b05c8a4be..3652f9b1fb68 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1735,6 +1735,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) if (status) { op = &args->ops[0]; op->status = status; + resp->opcnt = 1; goto encode_op; } From 8163496e78db100a6b5cfbdaece385686ae50129 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 19 Jun 2018 04:57:24 -0400 Subject: [PATCH 10/27] nfsd: don't advertise a SCSI layout for an unsupported request_queue Commit 30181faae37f ("nfsd: Check queue type before submitting a SCSI request") did the work of ensuring that we don't send SCSI requests to a request queue that won't support them, but that check is in the GETDEVICEINFO path. Let's not set the SCSI layout in fs_layout_type in the first place, and then we'll have less clients sending GETDEVICEINFO for non-SCSI request queues and less unnecessary WARN_ONs. While we're in here, remove some outdated comments that refer to "overwriting" layout seletion because commit 8a4c3926889e ("nfsd: allow nfsd to advertise multiple layout types") changed things to no longer overwrite the layout type. Signed-off-by: Benjamin Coddington Reviewed-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4layouts.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 228faf00a594..2b36aa037ce0 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -133,27 +133,20 @@ void nfsd4_setup_layout_type(struct svc_export *exp) if (!(exp->ex_flags & NFSEXP_PNFS)) return; - /* - * If flex file is configured, use it by default. Otherwise - * check if the file system supports exporting a block-like layout. - * If the block device supports reservations prefer the SCSI layout, - * otherwise advertise the block layout. - */ #ifdef CONFIG_NFSD_FLEXFILELAYOUT exp->ex_layout_types |= 1 << LAYOUT_FLEX_FILES; #endif #ifdef CONFIG_NFSD_BLOCKLAYOUT - /* overwrite flex file layout selection if needed */ if (sb->s_export_op->get_uuid && sb->s_export_op->map_blocks && sb->s_export_op->commit_blocks) exp->ex_layout_types |= 1 << LAYOUT_BLOCK_VOLUME; #endif #ifdef CONFIG_NFSD_SCSILAYOUT - /* overwrite block layout selection if needed */ if (sb->s_export_op->map_blocks && sb->s_export_op->commit_blocks && - sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops) + sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops && + blk_queue_scsi_passthrough(sb->s_bdev->bd_disk->queue)) exp->ex_layout_types |= 1 << LAYOUT_SCSI; #endif } From 64bed6cbe38bc95689fb9399872d9ce250192f90 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 13 Jul 2018 17:22:24 +0300 Subject: [PATCH 11/27] nfsd: fix leaked file lock with nfs exported overlayfs nfsd and lockd call vfs_lock_file() to lock/unlock the inode returned by locks_inode(file). Many places in nfsd/lockd code use the inode returned by file_inode(file) for lock manipulation. With Overlayfs, file_inode() (the underlying inode) is not the same object as locks_inode() (the overlay inode). This can result in "Leaked POSIX lock" messages and eventually to a kernel crash as reported by Eddie Horng: https://marc.info/?l=linux-unionfs&m=153086643202072&w=2 Fix all the call sites in nfsd/lockd that should use locks_inode(). This is a correctness bug that manifested when overlayfs gained NFS export support in v4.16. Reported-by: Eddie Horng Tested-by: Eddie Horng Cc: Jeff Layton Fixes: 8383f1748829 ("ovl: wire up NFS export operations") Cc: stable@vger.kernel.org Signed-off-by: Amir Goldstein Signed-off-by: J. Bruce Fields --- fs/lockd/clntlock.c | 2 +- fs/lockd/clntproc.c | 2 +- fs/lockd/svclock.c | 16 ++++++++-------- fs/lockd/svcsubs.c | 4 ++-- fs/nfsd/nfs4state.c | 2 +- include/linux/lockd/lockd.h | 4 ++-- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 96c1d14c18f1..c2a128678e6e 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -187,7 +187,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) continue; if (!rpc_cmp_addr(nlm_addr(block->b_host), addr)) continue; - if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)) ,fh) != 0) + if (nfs_compare_fh(NFS_FH(locks_inode(fl_blocked->fl_file)), fh) != 0) continue; /* Alright, we found a lock. Set the return status * and wake up the caller diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index a2c0dfc6fdc0..d20b92f271c2 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -128,7 +128,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) char *nodename = req->a_host->h_rpcclnt->cl_nodename; nlmclnt_next_cookie(&argp->cookie); - memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh)); + memcpy(&lock->fh, NFS_FH(locks_inode(fl->fl_file)), sizeof(struct nfs_fh)); lock->caller = nodename; lock->oh.data = req->a_owner; lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 3701bccab478..74330daeab71 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -405,8 +405,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, __be32 ret; dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", - file_inode(file->f_file)->i_sb->s_id, - file_inode(file->f_file)->i_ino, + locks_inode(file->f_file)->i_sb->s_id, + locks_inode(file->f_file)->i_ino, lock->fl.fl_type, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end, @@ -511,8 +511,8 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, __be32 ret; dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", - file_inode(file->f_file)->i_sb->s_id, - file_inode(file->f_file)->i_ino, + locks_inode(file->f_file)->i_sb->s_id, + locks_inode(file->f_file)->i_ino, lock->fl.fl_type, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -566,8 +566,8 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) int error; dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n", - file_inode(file->f_file)->i_sb->s_id, - file_inode(file->f_file)->i_ino, + locks_inode(file->f_file)->i_sb->s_id, + locks_inode(file->f_file)->i_ino, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -595,8 +595,8 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l int status = 0; dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", - file_inode(file->f_file)->i_sb->s_id, - file_inode(file->f_file)->i_ino, + locks_inode(file->f_file)->i_sb->s_id, + locks_inode(file->f_file)->i_ino, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 4ec3d6e03e76..899360ba3b84 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -44,7 +44,7 @@ static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f) static inline void nlm_debug_print_file(char *msg, struct nlm_file *file) { - struct inode *inode = file_inode(file->f_file); + struct inode *inode = locks_inode(file->f_file); dprintk("lockd: %s %s/%ld\n", msg, inode->i_sb->s_id, inode->i_ino); @@ -414,7 +414,7 @@ nlmsvc_match_sb(void *datap, struct nlm_file *file) { struct super_block *sb = datap; - return sb == file_inode(file->f_file)->i_sb; + return sb == locks_inode(file->f_file)->i_sb; } /** diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 19c4d29917ee..c8e4a8f42bbe 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6322,7 +6322,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) return status; } - inode = file_inode(filp); + inode = locks_inode(filp); flctx = inode->i_flctx; if (flctx && !list_empty_careful(&flctx->flc_posix)) { diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 4fd95dbeb52f..b065ef406770 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -299,7 +299,7 @@ int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) { - return file_inode(file->f_file); + return locks_inode(file->f_file); } static inline int __nlm_privileged_request4(const struct sockaddr *sap) @@ -359,7 +359,7 @@ static inline int nlm_privileged_requester(const struct svc_rqst *rqstp) static inline int nlm_compare_locks(const struct file_lock *fl1, const struct file_lock *fl2) { - return file_inode(fl1->fl_file) == file_inode(fl2->fl_file) + return locks_inode(fl1->fl_file) == locks_inode(fl2->fl_file) && fl1->fl_pid == fl2->fl_pid && fl1->fl_owner == fl2->fl_owner && fl1->fl_start == fl2->fl_start From 9cc3b98d1fdb0d279c6ba9264cac1ae2210497d2 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 1 Aug 2018 14:28:10 +0800 Subject: [PATCH 12/27] sunrpc: remove redundant variables 'checksumlen','blocksize' and 'data' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variables 'checksumlen','blocksize' and 'data' are being assigned, but are never used, hence they are redundant and can be removed. Fix the following warning: net/sunrpc/auth_gss/gss_krb5_wrap.c:443:7: warning: variable ‘blocksize’ set but not used [-Wunused-but-set-variable] net/sunrpc/auth_gss/gss_krb5_crypto.c:376:15: warning: variable ‘checksumlen’ set but not used [-Wunused-but-set-variable] net/sunrpc/xprtrdma/svc_rdma.c:97:9: warning: variable ‘data’ set but not used [-Wunused-but-set-variable] Signed-off-by: YueHaibing Reviewed-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/gss_krb5_crypto.c | 2 -- net/sunrpc/auth_gss/gss_krb5_wrap.c | 2 -- net/sunrpc/xprtrdma/svc_rdma.c | 2 -- 3 files changed, 6 deletions(-) diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 8654494b4d0a..84a16fd03431 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -373,7 +373,6 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, struct scatterlist sg[1]; int err = -1; u8 *checksumdata; - unsigned int checksumlen; if (kctx->gk5e->keyed_cksum == 0) { dprintk("%s: expected keyed hash for %s\n", @@ -393,7 +392,6 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) goto out_free_cksum; - checksumlen = crypto_ahash_digestsize(tfm); req = ahash_request_alloc(tfm, GFP_NOFS); if (!req) diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index a737c2da0837..9a1347fec6f4 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -440,7 +440,6 @@ static u32 gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, struct page **pages) { - int blocksize; u8 *ptr, *plainhdr; s32 now; u8 flags = 0x00; @@ -473,7 +472,6 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, *ptr++ = 0xff; be16ptr = (__be16 *)ptr; - blocksize = crypto_skcipher_blocksize(kctx->acceptor_enc); *be16ptr++ = 0; /* "inner" token header always uses 0 for RRC */ *be16ptr++ = 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 357ba90c382d..134bef6a451e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -94,7 +94,6 @@ static int read_reset_stat(struct ctl_table *table, int write, atomic_set(stat, 0); else { char str_buf[32]; - char *data; int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat)); if (len >= 32) return -EFAULT; @@ -103,7 +102,6 @@ static int read_reset_stat(struct ctl_table *table, int write, *lenp = 0; return 0; } - data = &str_buf[*ppos]; len -= *ppos; if (len > *lenp) len = *lenp; From 7b4d6da4bb6d0ece17ca27f0b4612df87f873d83 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 31 Jul 2018 21:24:10 -0500 Subject: [PATCH 13/27] nfsd: Mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Warning level 2 was used: -Wimplicit-fallthrough=2 Signed-off-by: Gustavo A. R. Silva Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 1f04d2a70d25..144be2dc18d7 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -980,6 +980,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback break; case -ESERVERFAULT: ++session->se_cb_seq_nr; + /* Fall through */ case 1: case -NFS4ERR_BADSESSION: nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); From a53d5cb0646a12586ae45c892c7a411d47ee1a1d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Jul 2018 11:18:54 -0400 Subject: [PATCH 14/27] svcrdma: Avoid releasing a page in svc_xprt_release() svc_xprt_release() invokes svc_free_res_pages(), which releases pages between rq_respages and rq_next_page. Historically, the RPC/RDMA transport has set these two pointers to be different by one, which means: - one page gets released when svc_recv returns 0. This normally happens whenever one or more RDMA Reads need to be dispatched to complete construction of an RPC Call. - one page gets released after every call to svc_send. In both cases, this released page is immediately refilled by svc_alloc_arg. There does not seem to be a reason for releasing this page. To avoid this unnecessary memory allocator traffic, set rq_next_page more carefully. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 9 ++++++--- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 4 +++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 841fca143804..ddb7def48b12 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -366,9 +366,6 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, arg->page_base = 0; arg->buflen = ctxt->rc_byte_len; arg->len = ctxt->rc_byte_len; - - rqstp->rq_respages = &rqstp->rq_pages[0]; - rqstp->rq_next_page = rqstp->rq_respages + 1; } /* This accommodates the largest possible Write chunk, @@ -730,6 +727,12 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) svc_rdma_build_arg_xdr(rqstp, ctxt); + /* Prevent svc_xprt_release from releasing pages in rq_pages + * if we return 0 or an error. + */ + rqstp->rq_respages = rqstp->rq_pages; + rqstp->rq_next_page = rqstp->rq_respages; + p = (__be32 *)rqstp->rq_arg.head[0].iov_base; ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg); if (ret < 0) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 4a3efaea277c..b958fb65f4e3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -657,7 +657,9 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, ctxt->sc_pages[i] = rqstp->rq_respages[i]; rqstp->rq_respages[i] = NULL; } - rqstp->rq_next_page = rqstp->rq_respages + 1; + + /* Prevent svc_xprt_release from releasing pages in rq_pages */ + rqstp->rq_next_page = rqstp->rq_respages; } /* Prepare the portion of the RPC Reply that will be transmitted From 07d0ff3b0cd23a4ba7078fb5cc3aeb25e38b3557 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Jul 2018 11:18:59 -0400 Subject: [PATCH 15/27] svcrdma: Clean up Read chunk path Simplify the error handling at the tail of recv_read_chunk() by re-arranging rq_pages[] housekeeping and documenting it properly. NB: In this path, svc_rdma_recvfrom returns zero. Therefore no subsequent reply processing is done on the svc_rqstp, and thus the rq_respages field does not need to be updated. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 32 +++++++++++++------------------ 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index ce3ea8419704..a1ac9997da98 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -679,6 +679,7 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, struct svc_rdma_read_info *info, __be32 *p) { + unsigned int i; int ret; ret = -EINVAL; @@ -701,6 +702,12 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, info->ri_chunklen += rs_length; } + /* Pages under I/O have been copied to head->rc_pages. + * Prevent their premature release by svc_xprt_release() . + */ + for (i = 0; i < info->ri_readctxt->rc_page_count; i++) + rqstp->rq_pages[i] = NULL; + return ret; } @@ -816,7 +823,6 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *head, __be32 *p) { struct svc_rdma_read_info *info; - struct page **page; int ret; /* The request (with page list) is constructed in @@ -843,27 +849,15 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, ret = svc_rdma_build_normal_read_chunk(rqstp, info, p); else ret = svc_rdma_build_pz_read_chunk(rqstp, info, p); - - /* Mark the start of the pages that can be used for the reply */ - if (info->ri_pageoff > 0) - info->ri_pageno++; - rqstp->rq_respages = &rqstp->rq_pages[info->ri_pageno]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - if (ret < 0) - goto out; + goto out_err; ret = svc_rdma_post_chunk_ctxt(&info->ri_cc); - -out: - /* Read sink pages have been moved from rqstp->rq_pages to - * head->rc_arg.pages. Force svc_recv to refill those slots - * in rq_pages. - */ - for (page = rqstp->rq_pages; page < rqstp->rq_respages; page++) - *page = NULL; - if (ret < 0) - svc_rdma_read_info_free(info); + goto out_err; + return 0; + +out_err: + svc_rdma_read_info_free(info); return ret; } From 3fd9557aec919e2db99365ad5a2c00d04ae8893c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Jul 2018 11:19:05 -0400 Subject: [PATCH 16/27] NFSD: Refactor the generic write vector fill helper fill_in_write_vector() is nearly the same logic as svc_fill_write_vector(), but there are a few differences so that the former can handle multiple WRITE payloads in a single COMPOUND. svc_fill_write_vector() can be adjusted so that it can be used in the NFSv4 WRITE code path too. Instead of assuming the pages are coming from rq_args.pages, have the caller pass in the page list. The immediate benefit is a reduction of code duplication. It also prevents the NFSv4 WRITE decoder from passing an empty vector element when the transport has provided the payload in the xdr_buf's page array. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3proc.c | 3 ++- fs/nfsd/nfs4proc.c | 23 ++++------------------- fs/nfsd/nfsproc.c | 3 ++- include/linux/sunrpc/svc.h | 1 + net/sunrpc/svc.c | 11 ++++------- 5 files changed, 13 insertions(+), 28 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 6259a4b8579f..8d1c2d1a159b 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -202,7 +202,8 @@ nfsd3_proc_write(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; - nvecs = svc_fill_write_vector(rqstp, &argp->first, cnt); + nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages, + &argp->first, cnt); if (!nvecs) RETURN_STATUS(nfserr_io); nfserr = nfsd_write(rqstp, &resp->fh, argp->offset, diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 3652f9b1fb68..b7bc6e1a85ac 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -986,24 +986,6 @@ out: return status; } -static int fill_in_write_vector(struct kvec *vec, struct nfsd4_write *write) -{ - int i = 1; - int buflen = write->wr_buflen; - - vec[0].iov_base = write->wr_head.iov_base; - vec[0].iov_len = min_t(int, buflen, write->wr_head.iov_len); - buflen -= vec[0].iov_len; - - while (buflen) { - vec[i].iov_base = page_address(write->wr_pagelist[i - 1]); - vec[i].iov_len = min_t(int, PAGE_SIZE, buflen); - buflen -= vec[i].iov_len; - i++; - } - return i; -} - static __be32 nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -1031,7 +1013,10 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_how_written = write->wr_stable_how; gen_boot_verifier(&write->wr_verifier, SVC_NET(rqstp)); - nvecs = fill_in_write_vector(rqstp->rq_vec, write); + nvecs = svc_fill_write_vector(rqstp, write->wr_pagelist, + &write->wr_head, write->wr_buflen); + if (!nvecs) + return nfserr_io; WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp, diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index f107f9fa8e15..a6faee562b31 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -218,7 +218,8 @@ nfsd_proc_write(struct svc_rqst *rqstp) SVCFH_fmt(&argp->fh), argp->len, argp->offset); - nvecs = svc_fill_write_vector(rqstp, &argp->first, cnt); + nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages, + &argp->first, cnt); if (!nvecs) return nfserr_io; nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 574368e8a16f..43f88bd7b601 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -496,6 +496,7 @@ void svc_reserve(struct svc_rqst *rqstp, int space); struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); char * svc_print_addr(struct svc_rqst *, char *, size_t); unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, + struct page **pages, struct kvec *first, size_t total); char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, size_t total); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 30a4226baf03..2194ed507991 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1537,16 +1537,16 @@ EXPORT_SYMBOL_GPL(svc_max_payload); /** * svc_fill_write_vector - Construct data argument for VFS write call * @rqstp: svc_rqst to operate on + * @pages: list of pages containing data payload * @first: buffer containing first section of write payload * @total: total number of bytes of write payload * - * Returns the number of elements populated in the data argument array. + * Fills in rqstp::rq_vec, and returns the number of elements. */ -unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct kvec *first, - size_t total) +unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct page **pages, + struct kvec *first, size_t total) { struct kvec *vec = rqstp->rq_vec; - struct page **pages; unsigned int i; /* Some types of transport can present the write payload @@ -1560,14 +1560,11 @@ unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct kvec *first, ++i; } - WARN_ON_ONCE(rqstp->rq_arg.page_base != 0); - pages = rqstp->rq_arg.pages; while (total) { vec[i].iov_base = page_address(*pages); vec[i].iov_len = min_t(size_t, total, PAGE_SIZE); total -= vec[i].iov_len; ++i; - ++pages; } From 11b4d66ea3313d9b03a83b80458ddee64990e3c3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Jul 2018 11:19:10 -0400 Subject: [PATCH 17/27] NFSD: Handle full-length symlinks I've given up on the idea of zero-copy handling of SYMLINK on the server side. This is because the Linux VFS symlink API requires the symlink pathname to be in a NUL-terminated kmalloc'd buffer. The NUL-termination is going to be problematic (watching out for landing on a page boundary and dealing with a 4096-byte pathname). I don't believe that SYMLINK creation is on a performance path or is requested frequently enough that it will cause noticeable CPU cache pollution due to data copies. There will be two places where a transport callout will be necessary to fill in the rqstp: one will be in the svc_fill_symlink_pathname() helper that is used by NFSv2 and NFSv3, and the other will be in nfsd4_decode_create(). Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3proc.c | 2 ++ fs/nfsd/nfsproc.c | 2 ++ include/linux/sunrpc/svc.h | 3 +- net/sunrpc/svc.c | 69 ++++++++++++++------------------------ 4 files changed, 32 insertions(+), 44 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 8d1c2d1a159b..9eb8086ea841 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -290,6 +290,7 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp) RETURN_STATUS(nfserr_nametoolong); argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first, + page_address(rqstp->rq_arg.pages[0]), argp->tlen); if (IS_ERR(argp->tname)) RETURN_STATUS(nfserrno(PTR_ERR(argp->tname))); @@ -303,6 +304,7 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp) fh_init(&resp->fh, NFS3_FHSIZE); nfserr = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, argp->flen, argp->tname, &resp->fh); + kfree(argp->tname); RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index a6faee562b31..0d20fd161225 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -454,6 +454,7 @@ nfsd_proc_symlink(struct svc_rqst *rqstp) return nfserr_nametoolong; argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first, + page_address(rqstp->rq_arg.pages[0]), argp->tlen); if (IS_ERR(argp->tname)) return nfserrno(PTR_ERR(argp->tname)); @@ -466,6 +467,7 @@ nfsd_proc_symlink(struct svc_rqst *rqstp) nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, argp->tname, &newfh); + kfree(argp->tname); fh_put(&argp->ffh); fh_put(&newfh); return nfserr; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 43f88bd7b601..73e130a840ce 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -499,7 +499,8 @@ unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct page **pages, struct kvec *first, size_t total); char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, - struct kvec *first, size_t total); + struct kvec *first, void *p, + size_t total); #define RPC_MAX_ADDRBUFLEN (63U) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 2194ed507991..d13e05f1a990 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1577,65 +1577,48 @@ EXPORT_SYMBOL_GPL(svc_fill_write_vector); * svc_fill_symlink_pathname - Construct pathname argument for VFS symlink call * @rqstp: svc_rqst to operate on * @first: buffer containing first section of pathname + * @p: buffer containing remaining section of pathname * @total: total length of the pathname argument * - * Returns pointer to a NUL-terminated string, or an ERR_PTR. The buffer is - * released automatically when @rqstp is recycled. + * The VFS symlink API demands a NUL-terminated pathname in mapped memory. + * Returns pointer to a NUL-terminated string, or an ERR_PTR. Caller must free + * the returned string. */ char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, - size_t total) + void *p, size_t total) { - struct xdr_buf *arg = &rqstp->rq_arg; - struct page **pages; - char *result; + size_t len, remaining; + char *result, *dst; - /* VFS API demands a NUL-terminated pathname. This function - * uses a page from @rqstp as the pathname buffer, to enable - * direct placement. Thus the total buffer size is PAGE_SIZE. - * Space in this buffer for NUL-termination requires that we - * cap the size of the returned symlink pathname just a - * little early. - */ - if (total > PAGE_SIZE - 1) - return ERR_PTR(-ENAMETOOLONG); + result = kmalloc(total + 1, GFP_KERNEL); + if (!result) + return ERR_PTR(-ESERVERFAULT); - /* Some types of transport can present the pathname entirely - * in rq_arg.pages. If not, then copy the pathname into one - * page. - */ - pages = arg->pages; - WARN_ON_ONCE(arg->page_base != 0); - if (first->iov_base == 0) { - result = page_address(*pages); - result[total] = '\0'; - } else { - size_t len, remaining; - char *dst; + dst = result; + remaining = total; - result = page_address(*(rqstp->rq_next_page++)); - dst = result; - remaining = total; - - len = min_t(size_t, total, first->iov_len); + len = min_t(size_t, total, first->iov_len); + if (len) { memcpy(dst, first->iov_base, len); dst += len; remaining -= len; - - /* No more than one page left */ - if (remaining) { - len = min_t(size_t, remaining, PAGE_SIZE); - memcpy(dst, page_address(*pages), len); - dst += len; - } - - *dst = '\0'; } - /* Sanity check: we don't allow the pathname argument to + if (remaining) { + len = min_t(size_t, remaining, PAGE_SIZE); + memcpy(dst, p, len); + dst += len; + } + + *dst = '\0'; + + /* Sanity check: Linux doesn't allow the pathname argument to * contain a NUL byte. */ - if (strlen(result) != total) + if (strlen(result) != total) { + kfree(result); return ERR_PTR(-EINVAL); + } return result; } EXPORT_SYMBOL_GPL(svc_fill_symlink_pathname); From 5ed96bc5451bb736a0b7d5d72a0f444316dc6559 Mon Sep 17 00:00:00 2001 From: nixiaoming Date: Mon, 23 Jul 2018 09:57:11 +0800 Subject: [PATCH 18/27] fs/nfsd: Delete invalid assignment statements in nfsd4_decode_exchange_id READ_BUF(8); dummy = be32_to_cpup(p++); dummy = be32_to_cpup(p++); ... READ_BUF(4); dummy = be32_to_cpup(p++); Assigning value to "dummy" here, but that stored value is overwritten before it can be used. At the same time READ_BUF() will re-update the pointer p. delete invalid assignment statements Signed-off-by: nixiaoming Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index fb4991889f89..418fa9c78186 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1390,10 +1390,8 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, p += XDR_QUADLEN(dummy); } - /* ssp_window and ssp_num_gss_handles */ + /* ignore ssp_window and ssp_num_gss_handles: */ READ_BUF(8); - dummy = be32_to_cpup(p++); - dummy = be32_to_cpup(p++); break; default: goto xdr_error; From c2cdc2ab24873289db8588a0a7ccec508d7ad075 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 17 Jul 2018 11:01:25 -0700 Subject: [PATCH 19/27] nfsd: constify write_op[] write_op[] is never modified, so make it 'const'. Signed-off-by: Eric Biggers Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index ec8a214a5089..7fb9f7c667b1 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -73,7 +73,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size); #endif -static ssize_t (*write_op[])(struct file *, char *, size_t) = { +static ssize_t (*const write_op[])(struct file *, char *, size_t) = { [NFSD_Fh] = write_filehandle, [NFSD_FO_UnlockIP] = write_unlock_ip, [NFSD_FO_UnlockFS] = write_unlock_fs, From a677a78325b3057ce03b6129f549ece85b0c8dd2 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 1 Aug 2018 19:44:05 -0500 Subject: [PATCH 20/27] nfsd: use true and false for boolean values Return statements in functions returning bool should use true or false instead of an integer value. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsfh.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index a008e7634181..b319080288c3 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -451,7 +451,7 @@ static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) switch (fsid_type) { case FSID_DEV: if (!old_valid_dev(exp_sb(exp)->s_dev)) - return 0; + return false; /* FALL THROUGH */ case FSID_MAJOR_MINOR: case FSID_ENCODE_DEV: @@ -461,13 +461,13 @@ static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) case FSID_UUID8: case FSID_UUID16: if (!is_root_export(exp)) - return 0; + return false; /* fall through */ case FSID_UUID4_INUM: case FSID_UUID16_INUM: return exp->ex_uuid != NULL; } - return 1; + return true; } From ac5bb5b3b0502b90b489e6b19c46b8c3dbe2e640 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Tue, 7 Aug 2018 07:20:00 -0400 Subject: [PATCH 21/27] rpc: remove unneeded variable 'ret' in rdma_listen_handler The ret is not modified after initalization, So just remove the variable and return 0. Signed-off-by: zhong jiang Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index e9535a66bab0..2317cf79a1ed 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -296,7 +296,6 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.src_addr; - int ret = 0; trace_svcrdma_cm_event(event, sap); @@ -315,7 +314,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, break; } - return ret; + return 0; } static int rdma_cma_handler(struct rdma_cm_id *cma_id, From 44090cc876926277329e1608bafc01b9f6da627f Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Fri, 17 Aug 2018 14:43:54 -0700 Subject: [PATCH 22/27] sunrpc: Don't use stack buffer with scatterlist Fedora got a bug report from NFS: kernel BUG at include/linux/scatterlist.h:143! ... RIP: 0010:sg_init_one+0x7d/0x90 .. make_checksum+0x4e7/0x760 [rpcsec_gss_krb5] gss_get_mic_kerberos+0x26e/0x310 [rpcsec_gss_krb5] gss_marshal+0x126/0x1a0 [auth_rpcgss] ? __local_bh_enable_ip+0x80/0xe0 ? call_transmit_status+0x1d0/0x1d0 [sunrpc] call_transmit+0x137/0x230 [sunrpc] __rpc_execute+0x9b/0x490 [sunrpc] rpc_run_task+0x119/0x150 [sunrpc] nfs4_run_exchange_id+0x1bd/0x250 [nfsv4] _nfs4_proc_exchange_id+0x2d/0x490 [nfsv4] nfs41_discover_server_trunking+0x1c/0xa0 [nfsv4] nfs4_discover_server_trunking+0x80/0x270 [nfsv4] nfs4_init_client+0x16e/0x240 [nfsv4] ? nfs_get_client+0x4c9/0x5d0 [nfs] ? _raw_spin_unlock+0x24/0x30 ? nfs_get_client+0x4c9/0x5d0 [nfs] nfs4_set_client+0xb2/0x100 [nfsv4] nfs4_create_server+0xff/0x290 [nfsv4] nfs4_remote_mount+0x28/0x50 [nfsv4] mount_fs+0x3b/0x16a vfs_kern_mount.part.35+0x54/0x160 nfs_do_root_mount+0x7f/0xc0 [nfsv4] nfs4_try_mount+0x43/0x70 [nfsv4] ? get_nfs_version+0x21/0x80 [nfs] nfs_fs_mount+0x789/0xbf0 [nfs] ? pcpu_alloc+0x6ca/0x7e0 ? nfs_clone_super+0x70/0x70 [nfs] ? nfs_parse_mount_options+0xb40/0xb40 [nfs] mount_fs+0x3b/0x16a vfs_kern_mount.part.35+0x54/0x160 do_mount+0x1fd/0xd50 ksys_mount+0xba/0xd0 __x64_sys_mount+0x21/0x30 do_syscall_64+0x60/0x1f0 entry_SYSCALL_64_after_hwframe+0x49/0xbe This is BUG_ON(!virt_addr_valid(buf)) triggered by using a stack allocated buffer with a scatterlist. Convert the buffer for rc4salt to be dynamically allocated instead. Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1615258 Signed-off-by: Laura Abbott Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/gss_krb5_crypto.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 84a16fd03431..d31f868b643c 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -169,7 +169,7 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, struct scatterlist sg[1]; int err = -1; u8 *checksumdata; - u8 rc4salt[4]; + u8 *rc4salt; struct crypto_ahash *md5; struct crypto_ahash *hmac_md5; struct ahash_request *req; @@ -183,14 +183,18 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, return GSS_S_FAILURE; } + rc4salt = kmalloc_array(4, sizeof(*rc4salt), GFP_NOFS); + if (!rc4salt) + return GSS_S_FAILURE; + if (arcfour_hmac_md5_usage_to_salt(usage, rc4salt)) { dprintk("%s: invalid usage value %u\n", __func__, usage); - return GSS_S_FAILURE; + goto out_free_rc4salt; } checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS); if (!checksumdata) - return GSS_S_FAILURE; + goto out_free_rc4salt; md5 = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(md5)) @@ -258,6 +262,8 @@ out_free_md5: crypto_free_ahash(md5); out_free_cksum: kfree(checksumdata); +out_free_rc4salt: + kfree(rc4salt); return err ? GSS_S_FAILURE : 0; } From a1a237775ec8fd2694b9f8e451821fd168e730d8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 16 Aug 2018 12:05:54 -0400 Subject: [PATCH 23/27] sunrpc: Enable the kernel to specify the hostname part of service principals A multi-homed NFS server may have more than one "nfs" key in its keytab. Enable the kernel to pick the key it wants as a machine credential when establishing a GSS context. This is useful for GSS-protected NFSv4.0 callbacks, which are required by RFC 7530 S3.3.3 to use the same principal as the service principal the client used when establishing its lease. A complementary modification to rpc.gssd is required to fully enable this feature. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/auth_gss.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index be8f103d22fd..1943e1198804 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -284,7 +284,12 @@ err: return p; } -#define UPCALL_BUF_LEN 128 +/* XXX: Need some documentation about why UPCALL_BUF_LEN is so small. + * Is user space expecting no more than UPCALL_BUF_LEN bytes? + * Note that there are now _two_ NI_MAXHOST sized data items + * being passed in this string. + */ +#define UPCALL_BUF_LEN 256 struct gss_upcall_msg { refcount_t count; @@ -462,8 +467,17 @@ static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, p += len; gss_msg->msg.len += len; } - if (service_name != NULL) { - len = scnprintf(p, buflen, "service=%s ", service_name); + if (service_name) { + char *c = strchr(service_name, '@'); + + if (!c) + len = scnprintf(p, buflen, "service=%s ", + service_name); + else + len = scnprintf(p, buflen, + "service=%.*s srchost=%s ", + (int)(c - service_name), + service_name, c + 1); buflen -= len; p += len; gss_msg->msg.len += len; From 9abdda5ddab8a899ca8c4b859ef0a7710f40e0dd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 16 Aug 2018 12:05:59 -0400 Subject: [PATCH 24/27] sunrpc: Extract target name into svc_cred NFSv4.0 callback needs to know the GSS target name the client used when it established its lease. That information is available from the GSS context created by gssproxy. Make it available in each svc_cred. Note this will also give us access to the real target service principal name (which is typically "nfs", but spec does not require that). Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 7 ++- include/linux/sunrpc/svcauth.h | 3 ++ net/sunrpc/auth_gss/gss_rpc_upcall.c | 70 ++++++++++++++++++---------- 3 files changed, 53 insertions(+), 27 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c8e4a8f42bbe..3322fe35a8fe 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1979,8 +1979,10 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source) target->cr_principal = kstrdup(source->cr_principal, GFP_KERNEL); target->cr_raw_principal = kstrdup(source->cr_raw_principal, GFP_KERNEL); - if ((source->cr_principal && ! target->cr_principal) || - (source->cr_raw_principal && ! target->cr_raw_principal)) + target->cr_targ_princ = kstrdup(source->cr_targ_princ, GFP_KERNEL); + if ((source->cr_principal && !target->cr_principal) || + (source->cr_raw_principal && !target->cr_raw_principal) || + (source->cr_targ_princ && !target->cr_targ_princ)) return -ENOMEM; target->cr_flavor = source->cr_flavor; @@ -2057,6 +2059,7 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2) || (!gid_eq(cr1->cr_gid, cr2->cr_gid)) || !groups_equal(cr1->cr_group_info, cr2->cr_group_info)) return false; + /* XXX: check that cr_targ_princ fields match ? */ if (cr1->cr_principal == cr2->cr_principal) return true; if (!cr1->cr_principal || !cr2->cr_principal) diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h index 7c3656505847..04e404a07882 100644 --- a/include/linux/sunrpc/svcauth.h +++ b/include/linux/sunrpc/svcauth.h @@ -31,6 +31,7 @@ struct svc_cred { /* name of form servicetype@hostname, passed down by * rpc.svcgssd, or computed from the above: */ char *cr_principal; + char *cr_targ_princ; struct gss_api_mech *cr_gss_mech; }; @@ -39,6 +40,7 @@ static inline void init_svc_cred(struct svc_cred *cred) cred->cr_group_info = NULL; cred->cr_raw_principal = NULL; cred->cr_principal = NULL; + cred->cr_targ_princ = NULL; cred->cr_gss_mech = NULL; } @@ -48,6 +50,7 @@ static inline void free_svc_cred(struct svc_cred *cred) put_group_info(cred->cr_group_info); kfree(cred->cr_raw_principal); kfree(cred->cr_principal); + kfree(cred->cr_targ_princ); gss_mech_put(cred->cr_gss_mech); init_svc_cred(cred); } diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 1c7c49dbf8ba..73dcda060335 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -234,6 +234,35 @@ static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg) return 0; } +static char *gssp_stringify(struct xdr_netobj *netobj) +{ + return kstrndup(netobj->data, netobj->len, GFP_KERNEL); +} + +static void gssp_hostbased_service(char **principal) +{ + char *c; + + if (!*principal) + return; + + /* terminate and remove realm part */ + c = strchr(*principal, '@'); + if (c) { + *c = '\0'; + + /* change service-hostname delimiter */ + c = strchr(*principal, '/'); + if (c) + *c = '@'; + } + if (!c) { + /* not a service principal */ + kfree(*principal); + *principal = NULL; + } +} + /* * Public functions */ @@ -262,6 +291,7 @@ int gssp_accept_sec_context_upcall(struct net *net, */ .exported_context_token.len = GSSX_max_output_handle_sz, .mech.len = GSS_OID_MAX_LEN, + .targ_name.display_name.len = GSSX_max_princ_sz, .src_name.display_name.len = GSSX_max_princ_sz }; struct gssx_res_accept_sec_context res = { @@ -275,6 +305,7 @@ int gssp_accept_sec_context_upcall(struct net *net, .rpc_cred = NULL, /* FIXME ? */ }; struct xdr_netobj client_name = { 0 , NULL }; + struct xdr_netobj target_name = { 0, NULL }; int ret; if (data->in_handle.len != 0) @@ -285,8 +316,6 @@ int gssp_accept_sec_context_upcall(struct net *net, if (ret) return ret; - /* use nfs/ for targ_name ? */ - ret = gssp_call(net, &msg); gssp_free_receive_pages(&arg); @@ -304,6 +333,7 @@ int gssp_accept_sec_context_upcall(struct net *net, kfree(rctxh.mech.data); } client_name = rctxh.src_name.display_name; + target_name = rctxh.targ_name.display_name; } if (res.options.count == 1) { @@ -325,32 +355,22 @@ int gssp_accept_sec_context_upcall(struct net *net, } /* convert to GSS_NT_HOSTBASED_SERVICE form and set into creds */ - if (data->found_creds && client_name.data != NULL) { - char *c; - - data->creds.cr_raw_principal = kstrndup(client_name.data, - client_name.len, GFP_KERNEL); - - data->creds.cr_principal = kstrndup(client_name.data, - client_name.len, GFP_KERNEL); - if (data->creds.cr_principal) { - /* terminate and remove realm part */ - c = strchr(data->creds.cr_principal, '@'); - if (c) { - *c = '\0'; - - /* change service-hostname delimiter */ - c = strchr(data->creds.cr_principal, '/'); - if (c) *c = '@'; - } - if (!c) { - /* not a service principal */ - kfree(data->creds.cr_principal); - data->creds.cr_principal = NULL; - } + if (data->found_creds) { + if (client_name.data) { + data->creds.cr_raw_principal = + gssp_stringify(&client_name); + data->creds.cr_principal = + gssp_stringify(&client_name); + gssp_hostbased_service(&data->creds.cr_principal); + } + if (target_name.data) { + data->creds.cr_targ_princ = + gssp_stringify(&target_name); + gssp_hostbased_service(&data->creds.cr_targ_princ); } } kfree(client_name.data); + kfree(target_name.data); return ret; } From cb25e7b293ec5cd82bc958a6c89dadeb198504b4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 16 Aug 2018 12:06:04 -0400 Subject: [PATCH 25/27] nfsd: Use correct credential for NFSv4.0 callback with GSS I've had trouble when operating a multi-homed Linux NFS server with Kerberos using NFSv4.0. Lately, I've seen my clients reporting this (and then hanging): May 9 11:43:26 manet kernel: NFS: NFSv4 callback contains invalid cred The client-side commit f11b2a1cfbf5 ("nfs4: copy acceptor name from context to nfs_client") appears to be related, but I suspect this problem has been going on for some time before that. RFC 7530 Section 3.3.3 says: > For Kerberos V5, nfs/hostname would be a server principal in the > Kerberos Key Distribution Center database. This is the same > principal the client acquired a GSS-API context for when it issued > the SETCLIENTID operation ... In other words, an NFSv4.0 client expects that the server will use the same GSS principal for callback that the client used to establish its lease. For example, if the client used the service principal "nfs@server.domain" to establish its lease, the server is required to use "nfs@server.domain" when performing NFSv4.0 callback operations. The Linux NFS server currently does not. It uses a common service principal for all callback connections. Sometimes this works as expected, and other times -- for example, when the server is accessible via multiple hostnames -- it won't work at all. This patch scrapes the target name from the client credential, and uses that for the NFSv4.0 callback credential. That should be correct much more often. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 144be2dc18d7..ba8e902ec4ad 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -769,7 +769,14 @@ void cleanup_callback_cred(void) static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses) { if (clp->cl_minorversion == 0) { - return get_rpccred(callback_cred); + char *principal = clp->cl_cred.cr_targ_princ ? + clp->cl_cred.cr_targ_princ : "nfs"; + struct rpc_cred *cred; + + cred = rpc_lookup_machine_cred(principal); + if (!IS_ERR(cred)) + get_rpccred(cred); + return cred; } else { struct rpc_auth *auth = client->cl_auth; struct auth_cred acred = {}; From a26dd64f5477968d730cf92868b4092314b8e45e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 16 Aug 2018 12:06:09 -0400 Subject: [PATCH 26/27] nfsd: Remove callback_cred Clean up: The global callback_cred is no longer used, so it can be removed. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 20 -------------------- fs/nfsd/nfs4state.c | 10 ++-------- fs/nfsd/state.h | 2 -- 3 files changed, 2 insertions(+), 30 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index ba8e902ec4ad..601bf33c26a0 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -746,26 +746,6 @@ static int max_cb_time(struct net *net) return max(nn->nfsd4_lease/10, (time_t)1) * HZ; } -static struct rpc_cred *callback_cred; - -int set_callback_cred(void) -{ - if (callback_cred) - return 0; - callback_cred = rpc_lookup_machine_cred("nfs"); - if (!callback_cred) - return -ENOMEM; - return 0; -} - -void cleanup_callback_cred(void) -{ - if (callback_cred) { - put_rpccred(callback_cred); - callback_cred = NULL; - } -} - static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses) { if (clp->cl_minorversion == 0) { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3322fe35a8fe..b0ca0efd2875 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -7231,14 +7231,10 @@ nfs4_state_start(void) { int ret; - ret = set_callback_cred(); - if (ret) - return ret; - laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); if (laundry_wq == NULL) { ret = -ENOMEM; - goto out_cleanup_cred; + goto out; } ret = nfsd4_create_callback_queue(); if (ret) @@ -7249,8 +7245,7 @@ nfs4_state_start(void) out_free_laundry: destroy_workqueue(laundry_wq); -out_cleanup_cred: - cleanup_callback_cred(); +out: return ret; } @@ -7287,7 +7282,6 @@ nfs4_state_shutdown(void) { destroy_workqueue(laundry_wq); nfsd4_destroy_callback_queue(); - cleanup_callback_cred(); } static void diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index f3772ea8ba0d..0b15dac7e609 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -617,8 +617,6 @@ extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn); extern __be32 nfs4_check_open_reclaim(clientid_t *clid, struct nfsd4_compound_state *cstate, struct nfsd_net *nn); -extern int set_callback_cred(void); -extern void cleanup_callback_cred(void); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); From 108b833cde9c9b93204e6a4e455829a67f9785c3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 20 Aug 2018 10:39:16 -0400 Subject: [PATCH 27/27] sunrpc: Add comment defining gssd upcall API keywords During review, it was found that the target, service, and srchost keywords are easily conflated. Add an explainer. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/auth_gss.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 1943e1198804..246075960712 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -461,12 +461,28 @@ static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, buflen -= len; p += len; gss_msg->msg.len = len; + + /* + * target= is a full service principal that names the remote + * identity that we are authenticating to. + */ if (target_name) { len = scnprintf(p, buflen, "target=%s ", target_name); buflen -= len; p += len; gss_msg->msg.len += len; } + + /* + * gssd uses service= and srchost= to select a matching key from + * the system's keytab to use as the source principal. + * + * service= is the service name part of the source principal, + * or "*" (meaning choose any). + * + * srchost= is the hostname part of the source principal. When + * not provided, gssd uses the local hostname. + */ if (service_name) { char *c = strchr(service_name, '@'); @@ -482,6 +498,7 @@ static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, p += len; gss_msg->msg.len += len; } + if (mech->gm_upcall_enctypes) { len = scnprintf(p, buflen, "enctypes=%s ", mech->gm_upcall_enctypes);