From 70b666c3b4cb2b96098d80e6f515e4bc6d37db5a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 27 May 2011 09:24:26 -0700 Subject: [PATCH 1/5] ceph: use ihold when we already have an inode ref We should use ihold whenever we already have a stable inode ref, even when we aren't holding i_lock. This avoids adding new and unnecessary locking dependencies. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 2 +- fs/ceph/caps.c | 10 ++++------ fs/ceph/dir.c | 11 +++++++---- fs/ceph/export.c | 4 ++-- fs/ceph/file.c | 3 ++- fs/ceph/inode.c | 18 ++++++++++-------- fs/ceph/ioctl.c | 6 ++++-- fs/ceph/locks.c | 3 ++- fs/ceph/snap.c | 2 +- fs/ceph/xattr.c | 6 ++++-- 10 files changed, 37 insertions(+), 28 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 33da49dc3cc6..5a3953db8118 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -453,7 +453,7 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) int err; struct inode *inode = page->mapping->host; BUG_ON(!inode); - igrab(inode); + ihold(inode); err = writepage_nounlock(page, wbc); unlock_page(page); iput(inode); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 1f72b00447c4..f605753c8fe9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2940,14 +2940,12 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) while (!list_empty(&mdsc->cap_dirty)) { ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, i_dirty_item); - inode = igrab(&ci->vfs_inode); + inode = &ci->vfs_inode; + ihold(inode); dout("flush_dirty_caps %p\n", inode); spin_unlock(&mdsc->cap_dirty_lock); - if (inode) { - ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, - NULL); - iput(inode); - } + ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); + iput(inode); spin_lock(&mdsc->cap_dirty_lock); } spin_unlock(&mdsc->cap_dirty_lock); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 33729e822bb9..ef8f08c343e8 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -308,7 +308,8 @@ more: req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_dentry = dget(filp->f_dentry); /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; @@ -787,10 +788,12 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; err = ceph_mdsc_do_request(mdsc, dir, req); - if (err) + if (err) { d_drop(dentry); - else if (!req->r_reply_info.head->is_dentry) - d_instantiate(dentry, igrab(old_dentry->d_inode)); + } else if (!req->r_reply_info.head->is_dentry) { + ihold(old_dentry->d_inode); + d_instantiate(dentry, old_dentry->d_inode); + } ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/export.c b/fs/ceph/export.c index a610d3d67488..f67b687550de 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -109,7 +109,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, err = ceph_mdsc_do_request(mdsc, NULL, req); inode = req->r_target_inode; if (inode) - igrab(inode); + ihold(inode); ceph_mdsc_put_request(req); if (!inode) return ERR_PTR(-ESTALE); @@ -167,7 +167,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb, err = ceph_mdsc_do_request(mdsc, NULL, req); inode = req->r_target_inode; if (inode) - igrab(inode); + ihold(inode); ceph_mdsc_put_request(req); if (!inode) return ERR_PTR(err ? err : -ESTALE); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 203252d88d9f..8c5ac4e72832 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -191,7 +191,8 @@ int ceph_open(struct inode *inode, struct file *file) err = PTR_ERR(req); goto out; } - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_num_caps = 1; err = ceph_mdsc_do_request(mdsc, parent_inode, req); if (!err) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 70b6a4839c38..d8858e96ab18 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1101,10 +1101,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, goto done; } req->r_dentry = dn; /* may have spliced */ - igrab(in); + ihold(in); } else if (ceph_ino(in) == vino.ino && ceph_snap(in) == vino.snap) { - igrab(in); + ihold(in); } else { dout(" %p links to %p %llx.%llx, not %llx.%llx\n", dn, in, ceph_ino(in), ceph_snap(in), @@ -1144,7 +1144,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, goto done; } req->r_dentry = dn; /* may have spliced */ - igrab(in); + ihold(in); rinfo->head->is_dentry = 1; /* fool notrace handlers */ } @@ -1328,7 +1328,7 @@ void ceph_queue_writeback(struct inode *inode) if (queue_work(ceph_inode_to_client(inode)->wb_wq, &ceph_inode(inode)->i_wb_work)) { dout("ceph_queue_writeback %p\n", inode); - igrab(inode); + ihold(inode); } else { dout("ceph_queue_writeback %p failed\n", inode); } @@ -1353,7 +1353,7 @@ void ceph_queue_invalidate(struct inode *inode) if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, &ceph_inode(inode)->i_pg_inv_work)) { dout("ceph_queue_invalidate %p\n", inode); - igrab(inode); + ihold(inode); } else { dout("ceph_queue_invalidate %p failed\n", inode); } @@ -1477,7 +1477,7 @@ void ceph_queue_vmtruncate(struct inode *inode) if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, &ci->i_vmtruncate_work)) { dout("ceph_queue_vmtruncate %p\n", inode); - igrab(inode); + ihold(inode); } else { dout("ceph_queue_vmtruncate %p failed, pending=%d\n", inode, ci->i_truncate_pending); @@ -1738,7 +1738,8 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) __mark_inode_dirty(inode, inode_dirty_flags); if (mask) { - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; @@ -1779,7 +1780,8 @@ int ceph_do_getattr(struct inode *inode, int mask) req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_num_caps = 1; req->r_args.getattr.mask = cpu_to_le32(mask); err = ceph_mdsc_do_request(mdsc, NULL, req); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 8888c9ba68db..ef0b5f48e13a 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -73,7 +73,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; req->r_args.setlayout.layout.fl_stripe_unit = @@ -135,7 +136,8 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_args.setlayout.layout.fl_stripe_unit = cpu_to_le32(l.stripe_unit); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 476b329867d4..7f0f72c53f7e 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -23,7 +23,8 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); /* mds requires start and length rather than start and end */ if (LLONG_MAX == fl->fl_end) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 24067d68a554..54b14de2e729 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -722,7 +722,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) ci = list_first_entry(&mdsc->snap_flush_list, struct ceph_inode_info, i_snap_flush_item); inode = &ci->vfs_inode; - igrab(inode); + ihold(inode); spin_unlock(&mdsc->snap_flush_lock); spin_lock(&inode->i_lock); __ceph_flush_snaps(ci, &session, 0); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f2b628696180..f42d730f1b66 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -665,7 +665,8 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, err = PTR_ERR(req); goto out; } - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = CEPH_CAP_XATTR_SHARED; req->r_num_caps = 1; req->r_args.setxattr.flags = cpu_to_le32(flags); @@ -795,7 +796,8 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = CEPH_CAP_XATTR_SHARED; req->r_num_caps = 1; req->r_path2 = kstrdup(name, GFP_NOFS); From 2584547230ae49b8de91ab3bb5e0a81898905b45 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 3 Jun 2011 09:37:09 -0700 Subject: [PATCH 2/5] ceph: fix sync vs canceled write If we cancel a write, trigger the safe completions to prevent a sync from blocking indefinitely in ceph_osdc_sync(). Signed-off-by: Sage Weil --- net/ceph/osd_client.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 6ea2b892f44b..9cb627a4073a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1144,6 +1144,13 @@ static void handle_osds_timeout(struct work_struct *work) round_jiffies_relative(delay)); } +static void complete_request(struct ceph_osd_request *req) +{ + if (req->r_safe_callback) + req->r_safe_callback(req, NULL); + complete_all(&req->r_safe_completion); /* fsync waiter */ +} + /* * handle osd op reply. either call the callback if it is specified, * or do the completion to wake up the waiting thread. @@ -1226,11 +1233,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, else complete_all(&req->r_completion); - if (flags & CEPH_OSD_FLAG_ONDISK) { - if (req->r_safe_callback) - req->r_safe_callback(req, msg); - complete_all(&req->r_safe_completion); /* fsync waiter */ - } + if (flags & CEPH_OSD_FLAG_ONDISK) + complete_request(req); done: dout("req=%p req->r_linger=%d\n", req, req->r_linger); @@ -1732,6 +1736,7 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc, __cancel_request(req); __unregister_request(osdc, req); mutex_unlock(&osdc->request_mutex); + complete_request(req); dout("wait_request tid %llu canceled/timed out\n", req->r_tid); return rc; } From c3cd62839aaa2cdb2b99687c9e44f1b300a4aece Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 1 Jun 2011 16:08:44 -0700 Subject: [PATCH 3/5] ceph: fix short sync reads from the OSD If we get a short read from the OSD because the object is small, we need to zero the remainder of the buffer. For O_DIRECT reads, the attempted range is not trimmed to i_size by the VFS, so we were actually looping indefinitely. Fix by trimming by i_size, and the unconditionally zeroing the trailing range. Reported-by: Jeff Wu Signed-off-by: Sage Weil --- fs/ceph/file.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8c5ac4e72832..b654f403139e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -283,7 +283,7 @@ int ceph_release(struct inode *inode, struct file *file) static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof, bool align_to_pages, + int *checkeof, bool o_direct, unsigned long buf_align) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); @@ -308,7 +308,7 @@ static int striped_read(struct inode *inode, io_align = off & ~PAGE_MASK; more: - if (align_to_pages) + if (o_direct) page_align = (pos - io_align + buf_align) & ~PAGE_MASK; else page_align = pos & ~PAGE_MASK; @@ -346,20 +346,22 @@ more: } if (was_short) { - /* was original extent fully inside i_size? */ - if (pos + left <= inode->i_size) { - dout("zero tail\n"); - ceph_zero_page_vector_range(page_off + read, len - read, - pages); - read = len; - goto out; - } + /* did we bounce off eof? */ + if (pos + left > inode->i_size) + *checkeof = 1; - /* check i_size */ - *checkeof = 1; + /* zero trailing bytes (inside i_size) */ + if (left > 0 && pos < inode->i_size) { + if (pos + left > inode->i_size) + left = inode->i_size - pos; + + dout("zero tail %d\n", left); + ceph_zero_page_vector_range(page_off + read, left, + pages); + read += left; + } } -out: if (ret >= 0) ret = read; dout("striped_read returns %d\n", ret); @@ -659,7 +661,7 @@ out: /* hit EOF or hole? */ if (statret == 0 && *ppos < inode->i_size) { - dout("aio_read sync_read hit hole, reading more\n"); + dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); read += ret; base += ret; len -= ret; From 0e98728fa32d338907631349a8cc2afa07c0cb9a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 7 Jun 2011 20:40:35 -0700 Subject: [PATCH 4/5] ceph: fix ENOENT logic in striped_read Getting ENOENT is equivalent to reading 0 bytes. Make that correction before setting up the hit_stripe and was_short flags. Fixes the following case: dd if=/dev/zero of=/mnt/fs_depot/dd3 bs=1 seek=1048576 count=0 dd if=/mnt/fs_depot/dd3 of=/root/ddout1 skip=8 bs=500 count=2 iflag=direct Reported-by: Henry C Chang Signed-off-by: Sage Weil --- fs/ceph/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index b654f403139e..9542f07d0b93 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -318,10 +318,10 @@ more: ci->i_truncate_seq, ci->i_truncate_size, page_pos, pages_left, page_align); - hit_stripe = this_len < left; - was_short = ret >= 0 && ret < this_len; if (ret == -ENOENT) ret = 0; + hit_stripe = this_len < left; + was_short = ret >= 0 && ret < this_len; dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); From 0c1f91f27140cf3b6e38dc4e892adac241c73a20 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 25 May 2011 14:56:12 -0700 Subject: [PATCH 5/5] ceph: unwind canceled flock state If we request a lock and then abort (e.g., ^C), we need to send a matching unlock request to the MDS to unwind our lock attempt to avoid indefinitely blocking other clients. Reported-by: Brian Chrisman Signed-off-by: Sage Weil --- fs/ceph/locks.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 7f0f72c53f7e..80576d05d687 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -33,11 +33,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, length = fl->fl_end - fl->fl_start + 1; dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type`: %d", (int)lock_type, + "length: %llu, wait: %d, type: %d", (int)lock_type, (int)operation, (u64)fl->fl_pid, fl->fl_start, length, wait, fl->fl_type); - req->r_args.filelock_change.rule = lock_type; req->r_args.filelock_change.type = cmd; req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); @@ -71,7 +70,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, } ceph_mdsc_put_request(req); dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type, + "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type, (int)operation, (u64)fl->fl_pid, fl->fl_start, length, wait, fl->fl_type, err); return err; @@ -110,16 +109,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) dout("mds locked, locking locally"); err = posix_lock_file(file, fl, NULL); if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { - /* undo! This should only happen if the kernel detects - * local deadlock. */ + /* undo! This should only happen if + * the kernel detects local + * deadlock. */ ceph_lock_message(CEPH_LOCK_FCNTL, op, file, CEPH_LOCK_UNLOCK, 0, fl); - dout("got %d on posix_lock_file, undid lock", err); + dout("got %d on posix_lock_file, undid lock", + err); } } - } else { - dout("mds returned error code %d", err); + } else if (err == -ERESTARTSYS) { + dout("undoing lock\n"); + ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + CEPH_LOCK_UNLOCK, 0, fl); } return err; } @@ -156,8 +159,11 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) file, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on flock_lock_file_wait, undid lock", err); } - } else { - dout("mds error code %d", err); + } else if (err == -ERESTARTSYS) { + dout("undoing lock\n"); + ceph_lock_message(CEPH_LOCK_FLOCK, + CEPH_MDS_OP_SETFILELOCK, + file, CEPH_LOCK_UNLOCK, 0, fl); } return err; }