From b09b9417d074e01a4e4ab5c19358f1b3dc76c1b2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Oct 2007 13:56:10 -0400 Subject: [PATCH 01/15] NFS: Fix the ustat() regression Since 2.6.18, the superblock sb->s_root has been a dummy dentry with a dummy inode. This breaks ustat(), which actually uses sb->s_root in a vfstat() call. Fix this by making the s_root a dummy alias to the directory inode that was used when creating the superblock. Signed-off-by: Trond Myklebust --- fs/nfs/getroot.c | 81 ++++++++++++++++-------------------------------- 1 file changed, 27 insertions(+), 54 deletions(-) diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 522e5ad4d8ad..0ee43843f4ec 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -42,6 +42,25 @@ #define NFSDBG_FACILITY NFSDBG_CLIENT +/* + * Set the superblock root dentry. + * Note that this function frees the inode in case of error. + */ +static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *inode) +{ + /* The mntroot acts as the dummy root dentry for this superblock */ + if (sb->s_root == NULL) { + sb->s_root = d_alloc_root(inode); + if (sb->s_root == NULL) { + iput(inode); + return -ENOMEM; + } + /* Circumvent igrab(): we know the inode is not being freed */ + atomic_inc(&inode->i_count); + } + return 0; +} + /* * get an NFS2/NFS3 root dentry from the root filehandle */ @@ -54,33 +73,6 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh) struct inode *inode; int error; - /* create a dummy root dentry with dummy inode for this superblock */ - if (!sb->s_root) { - struct nfs_fh dummyfh; - struct dentry *root; - struct inode *iroot; - - memset(&dummyfh, 0, sizeof(dummyfh)); - memset(&fattr, 0, sizeof(fattr)); - nfs_fattr_init(&fattr); - fattr.valid = NFS_ATTR_FATTR; - fattr.type = NFDIR; - fattr.mode = S_IFDIR | S_IRUSR | S_IWUSR; - fattr.nlink = 2; - - iroot = nfs_fhget(sb, &dummyfh, &fattr); - if (IS_ERR(iroot)) - return ERR_PTR(PTR_ERR(iroot)); - - root = d_alloc_root(iroot); - if (!root) { - iput(iroot); - return ERR_PTR(-ENOMEM); - } - - sb->s_root = root; - } - /* get the actual root for this mount */ fsinfo.fattr = &fattr; @@ -96,6 +88,10 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh) return ERR_PTR(PTR_ERR(inode)); } + error = nfs_superblock_set_dummy_root(sb, inode); + if (error != 0) + return ERR_PTR(error); + /* root dentries normally start off anonymous and get spliced in later * if the dentry tree reaches them; however if the dentry already * exists, we'll pick it up at this point and use it as the root @@ -241,33 +237,6 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) dprintk("--> nfs4_get_root()\n"); - /* create a dummy root dentry with dummy inode for this superblock */ - if (!sb->s_root) { - struct nfs_fh dummyfh; - struct dentry *root; - struct inode *iroot; - - memset(&dummyfh, 0, sizeof(dummyfh)); - memset(&fattr, 0, sizeof(fattr)); - nfs_fattr_init(&fattr); - fattr.valid = NFS_ATTR_FATTR; - fattr.type = NFDIR; - fattr.mode = S_IFDIR | S_IRUSR | S_IWUSR; - fattr.nlink = 2; - - iroot = nfs_fhget(sb, &dummyfh, &fattr); - if (IS_ERR(iroot)) - return ERR_PTR(PTR_ERR(iroot)); - - root = d_alloc_root(iroot); - if (!root) { - iput(iroot); - return ERR_PTR(-ENOMEM); - } - - sb->s_root = root; - } - /* get the info about the server and filesystem */ error = nfs4_server_capabilities(server, mntfh); if (error < 0) { @@ -289,6 +258,10 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) return ERR_PTR(PTR_ERR(inode)); } + error = nfs_superblock_set_dummy_root(sb, inode); + if (error != 0) + return ERR_PTR(error); + /* root dentries normally start off anonymous and get spliced in later * if the dentry tree reaches them; however if the dentry already * exists, we'll pick it up at this point and use it as the root From ffc40f569272b6be60c66441aeae79a686ff54d9 Mon Sep 17 00:00:00 2001 From: Kevin Coffman Date: Fri, 9 Nov 2007 18:42:04 -0500 Subject: [PATCH 02/15] sunrpc: gss_pipe_downcall(), don't assume all errors are transient Instead of mapping all errors except EACCES to EAGAIN, map all errors except EAGAIN to EACCES. An example is user-land negotiating a Kerberos context with an encryption type that is not supported by the kernel code. (This can happen due to mis-configuration or a bug in the Kerberos code that does not honor our request to limit the encryption types negotiated.) This failure is not transient, and returning EAGAIN causes mount to continuously retry rather than giving up. Signed-off-by: Kevin Coffman Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 53995af9ca4b..c42362c33944 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -540,7 +540,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) p = gss_fill_context(p, end, ctx, gss_msg->auth->mech); if (IS_ERR(p)) { err = PTR_ERR(p); - gss_msg->msg.errno = (err == -EACCES) ? -EACCES : -EAGAIN; + gss_msg->msg.errno = (err == -EAGAIN) ? -EAGAIN : -EACCES; goto err_release_msg; } gss_msg->ctx = gss_get_ctx(ctx); From ef338bee3f4f509e82066e100f76fecbbbbc4cca Mon Sep 17 00:00:00 2001 From: Kevin Coffman Date: Fri, 9 Nov 2007 18:42:09 -0500 Subject: [PATCH 03/15] sunrpc: return error if unsupported enctype or cksumtype is encountered Return an error from gss_import_sec_context_kerberos if the negotiated context contains encryption or checksum types not supported by the kernel code. This fixes an Oops because success was assumed and later code found no internal_ctx_id. Signed-off-by: Kevin Coffman Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/gss_krb5_mech.c | 8 ++++++-- net/sunrpc/auth_gss/gss_krb5_seal.c | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 9843eacef11d..60c3dba545d7 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -147,13 +147,17 @@ gss_import_sec_context_kerberos(const void *p, p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); if (IS_ERR(p)) goto out_err_free_ctx; - if (tmp != SGN_ALG_DES_MAC_MD5) + if (tmp != SGN_ALG_DES_MAC_MD5) { + p = ERR_PTR(-ENOSYS); goto out_err_free_ctx; + } p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); if (IS_ERR(p)) goto out_err_free_ctx; - if (tmp != SEAL_ALG_DES) + if (tmp != SEAL_ALG_DES) { + p = ERR_PTR(-ENOSYS); goto out_err_free_ctx; + } p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); if (IS_ERR(p)) goto out_err_free_ctx; diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 1c6eda5077c1..dedcbd6108f4 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -83,6 +83,7 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, u32 seq_send; dprintk("RPC: gss_krb5_seal\n"); + BUG_ON(ctx == NULL); now = get_seconds(); From eda4f9b7996e5520934ca2a7310b363463a4e3b0 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 6 Nov 2007 13:05:36 -0500 Subject: [PATCH 04/15] sunrpc: rpc_pipe_poll may miss available data in some cases Pipe messages start out life on a queue on the inode, but when first read they're moved to the filp's private pointer. So it's possible for a poll here to return null even though there's a partially read message available. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/rpc_pipe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 18f0a8dcc095..c59f3ca2b41b 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -280,7 +280,7 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait) mask = POLLOUT | POLLWRNORM; if (rpci->ops == NULL) mask |= POLLERR | POLLHUP; - if (!list_empty(&rpci->pipe)) + if (filp->private_data || !list_empty(&rpci->pipe)) mask |= POLLIN | POLLRDNORM; return mask; } From 4c1fe2f78a08e2c514a39c91a0eb7b55bbd3c0d2 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Thu, 1 Nov 2007 16:50:20 +1100 Subject: [PATCH 05/15] kernel BUG at fs/nfs/namespace.c:108! - can be triggered by bad server Hi Trond, I have discovered that the BUG_ON in nfs_follow_mountpoint: BUG_ON(IS_ROOT(dentry)); can be triggered by a misbehaving server. What happens is the client does a lookup and discoveres that the named directory has a different fsid, so it initiates a mount. It then performs a GETATTR on the mounted directory and gets a different fsid again (due to a bug in the NFS server). This causes nfs_follow_mountpoint to be called on the newly mounted root, which triggers the BUG_ON. To duplicate this, have a directory which contains some mountpoints, and export that directory with the "crossmnt" flag using nfs-utils 1.1.1 (or 1.1.0 I think) The GETATTR on the root of the mounted filesystem will return the information for the top exportpoint, while a lookup will return the correct information. This difference causes the NFS client to BUG. I think the best way to fix this is to trap this possibility early, so just before completing the mount in the NFS client, check that it isn't going to use nfs_mountpoint_inode_operations. As long as i_op will never change once set (is that true?), this should be adequately safe. The following patch shows a possible approach, and it works for me. i.e. when the NFS server is misbehaving, I get ESTALE on those mountpoints, while when the NFS server is working correctly, I get correct behaviour on the client. NeilBrown Signed-off-by: Neil Brown Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index fa517ae9207f..71067d1ac9d9 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1474,6 +1474,11 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, error = PTR_ERR(mntroot); goto error_splat_super; } + if (mntroot->d_inode->i_op != &nfs_dir_inode_operations) { + dput(mntroot); + error = -ESTALE; + goto error_splat_super; + } s->s_flags |= MS_ACTIVE; mnt->mnt_sb = s; From cfcb43ff7ce28f347a39e1a7519e01850b588718 Mon Sep 17 00:00:00 2001 From: James Lentini Date: Mon, 26 Nov 2007 12:42:44 -0500 Subject: [PATCH 06/15] SUNRPC: remove NFS/RDMA client's binary sysctls Support for binary sysctls is being deprecated in 2.6.24. Since there are no applications using the NFS/RDMA client's binary sysctls, it makes sense to remove them. The patch below does this while leaving the /proc/sys interface unchanged. Please consider this for 2.6.24. Signed-off-by: James Lentini Signed-off-by: Trond Myklebust --- include/linux/sunrpc/debug.h | 5 ----- net/sunrpc/xprtrdma/transport.c | 10 +++++----- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 3347c72b848a..3912cf16361e 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -88,11 +88,6 @@ enum { CTL_SLOTTABLE_TCP, CTL_MIN_RESVPORT, CTL_MAX_RESVPORT, - CTL_SLOTTABLE_RDMA, - CTL_RDMA_MAXINLINEREAD, - CTL_RDMA_MAXINLINEWRITE, - CTL_RDMA_WRITEPADDING, - CTL_RDMA_MEMREG, }; #endif /* _LINUX_SUNRPC_DEBUG_H_ */ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 1afeb3eb8e4c..6f2112dd9f78 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -89,7 +89,7 @@ static struct ctl_table_header *sunrpc_table_header; static ctl_table xr_tunables_table[] = { { - .ctl_name = CTL_SLOTTABLE_RDMA, + .ctl_name = CTL_UNNUMBERED, .procname = "rdma_slot_table_entries", .data = &xprt_rdma_slot_table_entries, .maxlen = sizeof(unsigned int), @@ -100,7 +100,7 @@ static ctl_table xr_tunables_table[] = { .extra2 = &max_slot_table_size }, { - .ctl_name = CTL_RDMA_MAXINLINEREAD, + .ctl_name = CTL_UNNUMBERED, .procname = "rdma_max_inline_read", .data = &xprt_rdma_max_inline_read, .maxlen = sizeof(unsigned int), @@ -109,7 +109,7 @@ static ctl_table xr_tunables_table[] = { .strategy = &sysctl_intvec, }, { - .ctl_name = CTL_RDMA_MAXINLINEWRITE, + .ctl_name = CTL_UNNUMBERED, .procname = "rdma_max_inline_write", .data = &xprt_rdma_max_inline_write, .maxlen = sizeof(unsigned int), @@ -118,7 +118,7 @@ static ctl_table xr_tunables_table[] = { .strategy = &sysctl_intvec, }, { - .ctl_name = CTL_RDMA_WRITEPADDING, + .ctl_name = CTL_UNNUMBERED, .procname = "rdma_inline_write_padding", .data = &xprt_rdma_inline_write_padding, .maxlen = sizeof(unsigned int), @@ -129,7 +129,7 @@ static ctl_table xr_tunables_table[] = { .extra2 = &max_padding, }, { - .ctl_name = CTL_RDMA_MEMREG, + .ctl_name = CTL_UNNUMBERED, .procname = "rdma_memreg_strategy", .data = &xprt_rdma_memreg_strategy, .maxlen = sizeof(unsigned int), From f16c960332b125491178fc2da7ea7893b0d65d05 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 16 Nov 2007 22:13:24 +0000 Subject: [PATCH 07/15] NFS: mount failure causes bad page state While testing a kernel based upon ecd744eec3aa8bbc949ec04ed3fbf7ecb2958a0e (with wrong boot arguments), I got the following bad page state entry while NFS was trying to mount it's rootfs: IP-Config: Complete: device=eth0, addr=192.168.1.101, mask=255.255.255.0, gw=255.255.255.255, host=192.168.1.101, domain=, nis-domain=(none), bootserver=192.168.1.100, rootserver=192.168.1.100, rootpath= Looking up port of RPC 100003/2 on 192.168.1.100 rpcbind: server 192.168.1.100 not responding, timed out Root-NFS: Unable to get nfsd port number from server, using default Looking up port of RPC 100005/1 on 192.168.1.100 rpcbind: server 192.168.1.100 not responding, timed out Root-NFS: Unable to get mountd port number from server, using default mount: server 192.168.1.100 not responding, timed out Root-NFS: Server returned error -5 while mounting /nfs/rootfs/ VFS: Unable to mount root fs via NFS, trying floppy. Bad page state in process 'swapper' page:c02b1260 flags:0x00000400 mapping:00000000 mapcount:0 count:0 Trying to fix it up, but a reboot is needed Backtrace: [] (dump_stack+0x0/0x14) from [] (bad_page+0x70/0xac) [] (bad_page+0x0/0xac) from [] (free_hot_cold_page+0x80/0x178) [] (free_hot_cold_page+0x0/0x178) from [] (free_hot_page+0x14/0x18) [] (free_hot_page+0x0/0x18) from [] (put_page+0xf8/0x154) [] (put_page+0x0/0x154) from [] (kfree+0xc8/0xd0) [] (kfree+0x0/0xd0) from [] (nfs_get_sb+0x230/0x710) [] (nfs_get_sb+0x0/0x710) from [] (vfs_kern_mount+0x58/0xac)[] (vfs_kern_mount+0x0/0xac) from [] (do_kern_mount+0x38/0xf4) [] (do_kern_mount+0x0/0xf4) from [] (do_mount+0x1e8/0x614) ... This seems to be caused by use of an uninitialised structure due to NULL options being passed to nfs_validate_mount_data(). Ensure that the parsed mount data is always initialised. Signed-off-by: Russell King (Trond: added fix for the same bug in nfs4_validate_mount_data()). Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 71067d1ac9d9..2426e713b77f 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1054,10 +1054,11 @@ static int nfs_validate_mount_data(void *options, { struct nfs_mount_data *data = (struct nfs_mount_data *)options; + memset(args, 0, sizeof(*args)); + if (data == NULL) goto out_no_data; - memset(args, 0, sizeof(*args)); args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP); args->rsize = NFS_MAX_FILE_IO_SIZE; args->wsize = NFS_MAX_FILE_IO_SIZE; @@ -1536,10 +1537,11 @@ static int nfs4_validate_mount_data(void *options, struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; char *c; + memset(args, 0, sizeof(*args)); + if (data == NULL) goto out_no_data; - memset(args, 0, sizeof(*args)); args->rsize = NFS_MAX_FILE_IO_SIZE; args->wsize = NFS_MAX_FILE_IO_SIZE; args->timeo = 600; From 5334eb13d455dd26b7064980b118e3c957929701 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 21 Nov 2007 15:04:31 -0800 Subject: [PATCH 08/15] NFS: make nfs_wb_page_priority() static nfs_wb_page_priority() can now become static. Signed-off-by: Adrian Bunk Cc: Trond Myklebust Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 3 ++- include/linux/nfs_fs.h | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 89527a487ed7..51cc1bd6a116 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1436,7 +1436,8 @@ out: return ret; } -int nfs_wb_page_priority(struct inode *inode, struct page *page, int how) +static int nfs_wb_page_priority(struct inode *inode, struct page *page, + int how) { loff_t range_start = page_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index e82a6ebc725d..2d15d4aac094 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -422,7 +422,6 @@ extern long nfs_sync_mapping_wait(struct address_space *, struct writeback_contr extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_nocommit(struct inode *inode); extern int nfs_wb_page(struct inode *inode, struct page* page); -extern int nfs_wb_page_priority(struct inode *inode, struct page* page, int how); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) extern int nfs_commit_inode(struct inode *, int); From 4c30d56edcaaa0446370189e8ab5c5393dc20ca3 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 21 Nov 2007 15:04:31 -0800 Subject: [PATCH 09/15] NFS: fs/nfs/dir.c should #include "internal.h" Every file should include the headers containing the prototypes for its global functions (in this case nfs_access_cache_shrinker()). Signed-off-by: Adrian Bunk Cc: Trond Myklebust Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 35334539d947..f697b5c74b7c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -38,6 +38,7 @@ #include "nfs4_fs.h" #include "delegation.h" #include "iostat.h" +#include "internal.h" /* #define NFS_DEBUG_VERBOSE 1 */ From 483066d62ec583fb6379377a9bfa8d5645b91c75 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 24 Oct 2007 18:24:02 +0200 Subject: [PATCH 10/15] SUNRPC: make sunrpc/xprtsock.c:xs_setup_{udp,tcp}() static xs_setup_{udp,tcp}() can now become static. Signed-off-by: Adrian Bunk Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtsock.h | 6 ------ net/sunrpc/xprtsock.c | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 2c6c2c2783d8..c2a46c45c8f7 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -9,12 +9,6 @@ #ifdef __KERNEL__ -/* - * Socket transport setup operations - */ -struct rpc_xprt *xs_setup_udp(struct xprt_create *args); -struct rpc_xprt *xs_setup_tcp(struct xprt_create *args); - int init_socket_xprt(void); void cleanup_socket_xprt(void); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 02298f529dad..2f630a512ab7 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1828,7 +1828,7 @@ static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args, * @args: rpc transport creation arguments * */ -struct rpc_xprt *xs_setup_udp(struct xprt_create *args) +static struct rpc_xprt *xs_setup_udp(struct xprt_create *args) { struct sockaddr *addr = args->dstaddr; struct rpc_xprt *xprt; @@ -1894,7 +1894,7 @@ struct rpc_xprt *xs_setup_udp(struct xprt_create *args) * @args: rpc transport creation arguments * */ -struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) +static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) { struct sockaddr *addr = args->dstaddr; struct rpc_xprt *xprt; From 014313a9d66272ed37b9ebd64c3f30b596a4c8e1 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 19 Nov 2007 17:53:43 -0800 Subject: [PATCH 11/15] SUNRPC: Add missing "space" to net/sunrpc/auth_gss.c Signed-off-by: Joe Perches Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index c42362c33944..a6e57d1c2eb6 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -967,7 +967,7 @@ gss_validate(struct rpc_task *task, __be32 *p) if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat) { - dprintk("RPC: %5u gss_validate: gss_verify_mic returned" + dprintk("RPC: %5u gss_validate: gss_verify_mic returned " "error 0x%08x\n", task->tk_pid, maj_stat); goto out_bad; } From 19f737879cc623c3aa73e655465faa3bff121768 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 12 Nov 2007 12:16:47 -0500 Subject: [PATCH 12/15] NFS: Introduce iovec I/O helpers to fs/nfs/direct.c Add helpers that iterate over multi-segment iovecs. These will be used to support multi-segment scatter/gather direct I/O in a later patch. Signed-off-by: Chuck Lever Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index afcab007a22b..e30d9285a566 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -355,6 +355,41 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo return result < 0 ? (ssize_t) result : -EFAULT; } +static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, + const struct iovec *iov, + unsigned long nr_segs, + loff_t pos) +{ + ssize_t result = -EINVAL; + size_t requested_bytes = 0; + unsigned long seg; + + get_dreq(dreq); + + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *vec = &iov[seg]; + result = nfs_direct_read_schedule(dreq, + (unsigned long)vec->iov_base, + vec->iov_len, pos); + if (result < 0) + break; + requested_bytes += result; + if ((size_t)result < vec->iov_len) + break; + pos += vec->iov_len; + } + + if (put_dreq(dreq)) + nfs_direct_complete(dreq); + + if (requested_bytes != 0) + return 0; + + if (result < 0) + return result; + return -EIO; +} + static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos) { ssize_t result = 0; @@ -697,6 +732,42 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l return result < 0 ? (ssize_t) result : -EFAULT; } +static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, + const struct iovec *iov, + unsigned long nr_segs, + loff_t pos, int sync) +{ + ssize_t result = 0; + size_t requested_bytes = 0; + unsigned long seg; + + get_dreq(dreq); + + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *vec = &iov[seg]; + result = nfs_direct_write_schedule(dreq, + (unsigned long)vec->iov_base, + vec->iov_len, + pos, sync); + if (result < 0) + break; + requested_bytes += result; + if ((size_t)result < vec->iov_len) + break; + pos += vec->iov_len; + } + + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, dreq->inode); + + if (requested_bytes != 0) + return 0; + + if (result < 0) + return result; + return -EIO; +} + static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos) { ssize_t result = 0; From c216fd708e1a97431925ecffd6d1896cff61df0a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 12 Nov 2007 12:16:52 -0500 Subject: [PATCH 13/15] NFS: Support multiple segment iovecs in the NFS direct I/O path Allow applications to perform asynchronous scatter-gather direct I/O to NFS files. Signed-off-by: Chuck Lever Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 67 +++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 44 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e30d9285a566..88d5d1c7f987 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -272,8 +272,6 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo int result; ssize_t started = 0; - get_dreq(dreq); - do { struct nfs_read_data *data; size_t bytes; @@ -347,11 +345,8 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo count -= bytes; } while (count != 0); - if (put_dreq(dreq)) - nfs_direct_complete(dreq); - if (started) - return 0; + return started; return result < 0 ? (ssize_t) result : -EFAULT; } @@ -390,7 +385,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, return -EIO; } -static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos) +static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { ssize_t result = 0; sigset_t oldset; @@ -407,9 +403,8 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; - nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count); rpc_clnt_sigmask(clnt, &oldset); - result = nfs_direct_read_schedule(dreq, user_addr, count, pos); + result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); if (!result) result = nfs_direct_wait(dreq); rpc_clnt_sigunmask(clnt, &oldset); @@ -645,8 +640,6 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l int result; ssize_t started = 0; - get_dreq(dreq); - do { struct nfs_write_data *data; size_t bytes; @@ -724,11 +717,8 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l count -= bytes; } while (count != 0); - if (put_dreq(dreq)) - nfs_direct_write_complete(dreq, inode); - if (started) - return 0; + return started; return result < 0 ? (ssize_t) result : -EFAULT; } @@ -768,7 +758,9 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, return -EIO; } -static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos) +static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, + size_t count) { ssize_t result = 0; sigset_t oldset; @@ -791,10 +783,8 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; - nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count); - rpc_clnt_sigmask(clnt, &oldset); - result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync); + result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync); if (!result) result = nfs_direct_wait(dreq); rpc_clnt_sigunmask(clnt, &oldset); @@ -830,21 +820,16 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, ssize_t retval = -EINVAL; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - /* XXX: temporary */ - const char __user *buf = iov[0].iov_base; - size_t count = iov[0].iov_len; + size_t count; - dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n", + count = iov_length(iov, nr_segs); + nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); + + dprintk("nfs: direct read(%s/%s, %zd@%Ld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, - (unsigned long) count, (long long) pos); + count, (long long) pos); - if (nr_segs != 1) - goto out; - - retval = -EFAULT; - if (!access_ok(VERIFY_WRITE, buf, count)) - goto out; retval = 0; if (!count) goto out; @@ -853,7 +838,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, if (retval) goto out; - retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos); + retval = nfs_direct_read(iocb, iov, nr_segs, pos); if (retval > 0) iocb->ki_pos = pos + retval; @@ -892,17 +877,15 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, ssize_t retval = -EINVAL; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - /* XXX: temporary */ - const char __user *buf = iov[0].iov_base; - size_t count = iov[0].iov_len; + size_t count; - dprintk("nfs: direct write(%s/%s, %lu@%Ld)\n", + count = iov_length(iov, nr_segs); + nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); + + dfprintk(VFS, "nfs: direct write(%s/%s, %zd@%Ld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, - (unsigned long) count, (long long) pos); - - if (nr_segs != 1) - goto out; + count, (long long) pos); retval = generic_write_checks(file, &pos, &count, 0); if (retval) @@ -915,15 +898,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, if (!count) goto out; - retval = -EFAULT; - if (!access_ok(VERIFY_READ, buf, count)) - goto out; - retval = nfs_sync_mapping(mapping); if (retval) goto out; - retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos); + retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); if (retval > 0) iocb->ki_pos = pos + retval; From b9148c6b80d802dbc2a7530b29915a80432e50c7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 12 Nov 2007 12:16:58 -0500 Subject: [PATCH 14/15] NFS: Ensure we return zero if applications attempt to write zero bytes A zero byte count direct write request should be a successful no-op, not an error. Signed-off-by: Chuck Lever Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 88d5d1c7f987..4d726e9db295 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -890,6 +890,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, retval = generic_write_checks(file, &pos, &count, 0); if (retval) goto out; + if (!count) + goto out; /* return 0 */ retval = -EINVAL; if ((ssize_t) count < 0) From 02fe494619d525ea803ab1f4f671186dc8a52f7a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 12 Nov 2007 12:17:03 -0500 Subject: [PATCH 15/15] NFS: Clean up new multi-segment direct I/O changes Simplify calling sequence of nfs_direct_{read,write}_schedule(), and rename them to reflect their new role. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 4d726e9db295..5e8d82f6666b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -263,10 +263,14 @@ static const struct rpc_call_ops nfs_read_direct_ops = { * handled automatically by nfs_direct_read_result(). Otherwise, if * no requests have been sent, just return an error. */ -static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos) +static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, + const struct iovec *iov, + loff_t pos) { struct nfs_open_context *ctx = dreq->ctx; struct inode *inode = ctx->path.dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; size_t rsize = NFS_SERVER(inode)->rsize; unsigned int pgbase; int result; @@ -363,9 +367,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, for (seg = 0; seg < nr_segs; seg++) { const struct iovec *vec = &iov[seg]; - result = nfs_direct_read_schedule(dreq, - (unsigned long)vec->iov_base, - vec->iov_len, pos); + result = nfs_direct_read_schedule_segment(dreq, vec, pos); if (result < 0) break; requested_bytes += result; @@ -631,10 +633,14 @@ static const struct rpc_call_ops nfs_write_direct_ops = { * handled automatically by nfs_direct_write_result(). Otherwise, if * no requests have been sent, just return an error. */ -static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync) +static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, + const struct iovec *iov, + loff_t pos, int sync) { struct nfs_open_context *ctx = dreq->ctx; struct inode *inode = ctx->path.dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; size_t wsize = NFS_SERVER(inode)->wsize; unsigned int pgbase; int result; @@ -735,10 +741,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, for (seg = 0; seg < nr_segs; seg++) { const struct iovec *vec = &iov[seg]; - result = nfs_direct_write_schedule(dreq, - (unsigned long)vec->iov_base, - vec->iov_len, - pos, sync); + result = nfs_direct_write_schedule_segment(dreq, vec, + pos, sync); if (result < 0) break; requested_bytes += result;