From b7ec35b304b64af2830027350cc99d31e6e537c2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:25 +0200 Subject: [PATCH 01/10] libceph: change ceph_osdmap_flag() to take osdc For the benefit of every single caller, take osdc instead of map. Also, now that osdc->osdmap can't ever be NULL, drop the check. Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 8 +++---- include/linux/ceph/osd_client.h | 5 +++++ include/linux/ceph/osdmap.h | 5 ----- net/ceph/osd_client.c | 39 ++++++++++++++++----------------- 4 files changed, 28 insertions(+), 29 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a888df6f2d71..8eeb9f579db5 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1349,7 +1349,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) } retry_snap: - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) { err = -ENOSPC; goto out; } @@ -1440,7 +1440,7 @@ retry_snap: ceph_put_cap_refs(ci, got); if (written >= 0) { - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL)) + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL)) iocb->ki_flags |= IOCB_DSYNC; written = generic_write_sync(iocb, written); @@ -1672,8 +1672,8 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) && - !(mode & FALLOC_FL_PUNCH_HOLE)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && + !(mode & FALLOC_FL_PUNCH_HOLE)) { ret = -ENOSPC; goto unlock; } diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 19b14862d3e0..1b3b6e155392 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -279,6 +279,11 @@ struct ceph_osd_client { struct workqueue_struct *notify_wq; }; +static inline bool ceph_osdmap_flag(struct ceph_osd_client *osdc, int flag) +{ + return osdc->osdmap->flags & flag; +} + extern int ceph_osdc_setup(void); extern void ceph_osdc_cleanup(void); diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index ddc426b22d81..9ccf4dbe55f8 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -189,11 +189,6 @@ static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd) return !ceph_osd_is_up(map, osd); } -static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag) -{ - return map && (map->flags & flag); -} - extern char *ceph_osdmap_state_str(char *str, int len, int state); extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0160d7d09a1e..79c3bad87e62 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1276,9 +1276,9 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, const struct ceph_osd_request_target *t, struct ceph_pg_pool_info *pi) { - bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); - bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + bool pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD); + bool pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || __pool_full(pi); WARN_ON(pi->id != t->base_oloc.pool); @@ -1303,8 +1303,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, bool force_resend = false; bool need_check_tiering = false; bool need_resend = false; - bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap, - CEPH_OSDMAP_SORTBITWISE); + bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE); enum calc_target_result ct_res; int ret; @@ -1590,9 +1589,9 @@ static void maybe_request_map(struct ceph_osd_client *osdc) verify_osdc_locked(osdc); WARN_ON(!osdc->osdmap->epoch); - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { dout("%s osdc %p continuous\n", __func__, osdc); continuous = true; } else { @@ -1629,19 +1628,19 @@ again: } if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) { + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { dout("req %p pausewr\n", req); req->r_t.paused = true; maybe_request_map(osdc); } else if ((req->r_flags & CEPH_OSD_FLAG_READ) && - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) { + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) { dout("req %p pauserd\n", req); req->r_t.paused = true; maybe_request_map(osdc); } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY | CEPH_OSD_FLAG_FULL_FORCE)) && - (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || pool_full(osdc, req->r_t.base_oloc.pool))) { dout("req %p full/pool_full\n", req); pr_warn_ratelimited("FULL or reached pool quota\n"); @@ -2280,7 +2279,7 @@ static void send_linger_ping(struct ceph_osd_linger_request *lreq) struct ceph_osd_request *req = lreq->ping_req; struct ceph_osd_req_op *op = &req->r_ops[0]; - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) { dout("%s PAUSERD\n", __func__); return; } @@ -3050,7 +3049,7 @@ static int handle_one_map(struct ceph_osd_client *osdc, bool skipped_map = false; bool was_full; - was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + was_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL); set_pool_was_full(osdc); if (incremental) @@ -3088,7 +3087,7 @@ static int handle_one_map(struct ceph_osd_client *osdc, osdc->osdmap = newmap; } - was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL); scan_requests(&osdc->homeless_osd, skipped_map, was_full, true, need_resend, need_resend_linger); @@ -3174,9 +3173,9 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) if (ceph_check_fsid(osdc->client, &fsid) < 0) goto bad; - was_pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); - was_pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD); + was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc); /* incremental maps */ @@ -3238,9 +3237,9 @@ done: * we find out when we are no longer full and stop returning * ENOSPC. */ - pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); - pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD); + pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc); if (was_pauserd || was_pausewr || pauserd || pausewr) maybe_request_map(osdc); From dc045a9168c83b2dc590930a0565e066346de382 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 27 May 2016 15:18:34 +0200 Subject: [PATCH 02/10] libceph: put request only if it's done in handle_reply() handle_reply() may be called twice on the same request: on ack and then on commit. This occurs on btrfs-formatted OSDs or if cephfs sync write path is triggered - CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK. handle_reply() handles this with the help of done_request(). Fixes: 5aea3dcd5021 ("libceph: a major OSD client update") Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 79c3bad87e62..cfe0be8bf2b7 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2892,6 +2892,9 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) dout("req %p tid %llu cb\n", req, req->r_tid); __complete_request(req); } + if (m.flags & CEPH_OSD_FLAG_ONDISK) + complete_all(&req->r_safe_completion); + ceph_osdc_put_request(req); } else { if (req->r_unsafe_callback) { dout("req %p tid %llu unsafe-cb\n", req, req->r_tid); @@ -2900,10 +2903,7 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) WARN_ON(1); } } - if (m.flags & CEPH_OSD_FLAG_ONDISK) - complete_all(&req->r_safe_completion); - ceph_osdc_put_request(req); return; fail_request: From 4a3262b17c96b6ff332134c9e57f193a20226eb2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 30 May 2016 18:33:32 +0200 Subject: [PATCH 03/10] libceph: use %s instead of %pE in dout()s Commit d30291b985d1 ("libceph: variable-sized ceph_object_id") changed dout()s in what is now encode_request() and ceph_object_locator_to_pg() to use %pE, mostly to document that, although all rbd and cephfs object names are NULL-terminated strings, ceph_object_id will handle any RADOS object name, including the one containing NULs, just fine. However, it turns out that vbin_printf() can't handle anything but ints and %s - all %p suffixes are ignored. The buffer %p** points to isn't recorded, resulting in trash in the messages if the buffer had been reused by the time bstr_printf() got to it. Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 6 +++--- net/ceph/osdmap.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index cfe0be8bf2b7..89469592076c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1539,9 +1539,9 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) */ msg->hdr.data_off = cpu_to_le16(req->r_data_offset); - dout("%s req %p oid %*pE oid_len %d front %zu data %u\n", __func__, - req, req->r_t.target_oid.name_len, req->r_t.target_oid.name, - req->r_t.target_oid.name_len, msg->front.iov_len, data_len); + dout("%s req %p oid %s oid_len %d front %zu data %u\n", __func__, + req, req->r_t.target_oid.name, req->r_t.target_oid.name_len, + msg->front.iov_len, data_len); } /* diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index cde52e94732f..03062bb763b3 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1778,8 +1778,8 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, oid->name_len); - dout("%s %*pE -> raw_pgid %llu.%x\n", __func__, oid->name_len, - oid->name, raw_pgid->pool, raw_pgid->seed); + dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name, + raw_pgid->pool, raw_pgid->seed); return 0; } EXPORT_SYMBOL(ceph_object_locator_to_pg); From d2138455286f9db8fbff9b50b53b51766f6c1f92 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 17 May 2016 11:52:48 +0800 Subject: [PATCH 04/10] FS-Cache: wake write waiter after invalidating writes Signed-off-by: Yan, Zheng Acked-by: David Howells --- fs/fscache/page.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 3078b679fcd1..c8c4f79c7ce1 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -887,6 +887,8 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) put_page(results[i]); } + wake_up_bit(&cookie->flags, 0); + _leave(""); } From 480ce08a70e4179f34808a3bdbfe6627f624cf54 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 20 May 2016 18:32:31 +0800 Subject: [PATCH 05/10] FS-Cache: make check_consistency callback return int __fscache_check_consistency() calls check_consistency() callback and return the callback's return value. But the return type of check_consistency() is bool. So __fscache_check_consistency() return 1 if the cache is inconsistent. This is inconsistent with the document. Signed-off-by: Yan, Zheng Acked-by: David Howells --- fs/cachefiles/interface.c | 2 +- include/linux/fscache-cache.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 861d611b8c05..ce5f345d70f5 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -380,7 +380,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) * check if the backing cache is updated to FS-Cache * - called by FS-Cache when evaluates if need to invalidate the cache */ -static bool cachefiles_check_consistency(struct fscache_operation *op) +static int cachefiles_check_consistency(struct fscache_operation *op) { struct cachefiles_object *object; struct cachefiles_cache *cache; diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 604e1526cd00..13ba552e6c09 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -241,7 +241,7 @@ struct fscache_cache_ops { /* check the consistency between the backing cache and the FS-Cache * cookie */ - bool (*check_consistency)(struct fscache_operation *op); + int (*check_consistency)(struct fscache_operation *op); /* store the updated auxiliary data on an object */ void (*update_object)(struct fscache_object *object); From 368e35857dfab264f512d040c4486c9b13297988 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 17 May 2016 11:58:02 +0800 Subject: [PATCH 06/10] ceph: call __fscache_uncache_page() if readpages fails If readpages fails, fscache needs to cleanup its internal state. Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index eeb71e5de27a..4ff62fc98fb5 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -276,8 +276,10 @@ static void finish_read(struct ceph_osd_request *req) for (i = 0; i < num_pages; i++) { struct page *page = osd_data->pages[i]; - if (rc < 0 && rc != -ENOENT) + if (rc < 0 && rc != -ENOENT) { + ceph_fscache_readpage_cancel(inode, page); goto unlock; + } if (bytes < (int)PAGE_SIZE) { /* zero (remainder of) page */ int s = bytes < 0 ? 0 : bytes; From 1464975816c79a7cd28dc314384f060a122a9d55 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 20 May 2016 15:41:20 +0800 Subject: [PATCH 07/10] ceph: avoid unnecessary fscache invalidation/revlidation ceph_fill_file_size() has already called ceph_fscache_invalidate() if it return true. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c17b5d76d75e..7bdf7d59a36d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2993,10 +2993,9 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (fill_inline) ceph_fill_inline_data(inode, NULL, inline_data, inline_len); - if (queue_trunc) { + if (queue_trunc) ceph_queue_vmtruncate(inode); - ceph_queue_revalidate(inode); - } else if (queue_revalidate) + else if (queue_revalidate) ceph_queue_revalidate(inode); if (writeback) @@ -3199,10 +3198,8 @@ static void handle_cap_trunc(struct inode *inode, truncate_seq, truncate_size, size); spin_unlock(&ci->i_ceph_lock); - if (queue_trunc) { + if (queue_trunc) ceph_queue_vmtruncate(inode); - ceph_fscache_invalidate(inode); - } } /* From 46b59b2be05a71d80d76883d2f495f182d768f47 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 18 May 2016 15:25:03 +0800 Subject: [PATCH 08/10] ceph: disable fscache when inode is opened for write All other filesystems do not add dirty pages to fscache. They all disable fscache when inode is opened for write. Only ceph adds dirty pages to fscache, but the code is buggy. Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 2 -- fs/ceph/cache.c | 54 ++++++++++++++++++++++++++++++++++--------------- fs/ceph/cache.h | 30 +++++++++++---------------- fs/ceph/file.c | 19 ++--------------- 4 files changed, 52 insertions(+), 53 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4ff62fc98fb5..26a9d10d75e9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -537,8 +537,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); - ceph_readpage_to_fscache(inode, page); - set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), &ci->i_layout, snapc, diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index c052b5bf219b..c19db6afd0cc 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -181,32 +181,26 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .now_uncached = ceph_fscache_inode_now_uncached, }; -void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, - struct ceph_inode_info* ci) +void ceph_fscache_register_inode_cookie(struct inode *inode) { - struct inode* inode = &ci->vfs_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); /* No caching for filesystem */ if (fsc->fscache == NULL) return; /* Only cache for regular files that are read only */ - if ((ci->vfs_inode.i_mode & S_IFREG) == 0) + if (!S_ISREG(inode->i_mode)) return; - /* Avoid multiple racing open requests */ - inode_lock(inode); - - if (ci->fscache) - goto done; - - ci->fscache = fscache_acquire_cookie(fsc->fscache, - &ceph_fscache_inode_object_def, - ci, true); - fscache_check_consistency(ci->fscache); -done: + inode_lock_nested(inode, I_MUTEX_CHILD); + if (!ci->fscache) { + ci->fscache = fscache_acquire_cookie(fsc->fscache, + &ceph_fscache_inode_object_def, + ci, false); + } inode_unlock(inode); - } void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) @@ -222,6 +216,34 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) fscache_relinquish_cookie(cookie, 0); } +static bool ceph_fscache_can_enable(void *data) +{ + struct inode *inode = data; + return !inode_is_open_for_write(inode); +} + +void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (!fscache_cookie_valid(ci->fscache)) + return; + + if (inode_is_open_for_write(inode)) { + dout("fscache_file_set_cookie %p %p disabling cache\n", + inode, filp); + fscache_disable_cookie(ci->fscache, false); + fscache_uncache_all_inode_pages(ci->fscache, inode); + } else { + fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable, + inode); + if (fscache_cookie_enabled(ci->fscache)) { + dout("fscache_file_set_cookie %p %p enabing cache\n", + inode, filp); + } + } +} + static void ceph_vfs_readpage_complete(struct page *page, void *data, int error) { if (!error) diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 5ac591bd012b..dfe9b5823bd3 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -35,9 +35,9 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc); void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); void ceph_fscache_inode_init(struct ceph_inode_info *ci); -void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, - struct ceph_inode_info* ci); +void ceph_fscache_register_inode_cookie(struct inode *inode); void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); +void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp); int ceph_readpage_from_fscache(struct inode *inode, struct page *page); int ceph_readpages_from_fscache(struct inode *inode, @@ -48,12 +48,6 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page); void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); void ceph_queue_revalidate(struct inode *inode); -static inline void ceph_fscache_update_objectsize(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - fscache_attr_changed(ci->fscache); -} - static inline void ceph_fscache_invalidate(struct inode *inode) { fscache_invalidate(ceph_inode(inode)->fscache); @@ -112,8 +106,16 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { } -static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc, - struct ceph_inode_info* ci) +static inline void ceph_fscache_register_inode_cookie(struct inode *inode) +{ +} + +static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ +} + +static inline void ceph_fscache_file_set_cookie(struct inode *inode, + struct file *filp) { } @@ -141,10 +143,6 @@ static inline void ceph_readpage_to_fscache(struct inode *inode, { } -static inline void ceph_fscache_update_objectsize(struct inode *inode) -{ -} - static inline void ceph_fscache_invalidate(struct inode *inode) { } @@ -154,10 +152,6 @@ static inline void ceph_invalidate_fscache_page(struct inode *inode, { } -static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) -{ -} - static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) { return 1; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8eeb9f579db5..ce2f5795e44b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -137,23 +137,11 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) { struct ceph_file_info *cf; int ret = 0; - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; switch (inode->i_mode & S_IFMT) { case S_IFREG: - /* First file open request creates the cookie, we want to keep - * this cookie around for the filetime of the inode as not to - * have to worry about fscache register / revoke / operation - * races. - * - * Also, if we know the operation is going to invalidate data - * (non readonly) just nuke the cache right away. - */ - ceph_fscache_register_inode_cookie(mdsc->fsc, ci); - if ((fmode & CEPH_FILE_MODE_WR)) - ceph_fscache_invalidate(inode); + ceph_fscache_register_inode_cookie(inode); + ceph_fscache_file_set_cookie(inode, file); case S_IFDIR: dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); @@ -1407,7 +1395,6 @@ retry_snap: iov_iter_advance(from, written); ceph_put_snap_context(snapc); } else { - loff_t old_size = i_size_read(inode); /* * No need to acquire the i_truncate_mutex. Because * the MDS revokes Fwb caps before sending truncate @@ -1418,8 +1405,6 @@ retry_snap: written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; - if (i_size_read(inode) > old_size) - ceph_fscache_update_objectsize(inode); inode_unlock(inode); } From f7f7e7a0635dedd5064fba255cb3facfa87b06d6 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 18 May 2016 20:31:55 +0800 Subject: [PATCH 09/10] ceph: improve fscache revalidation There are several issues in fscache revalidation code. - In ceph_revalidate_work(), fscache_invalidate() is called when fscache_check_consistency() return 0. This is complete wrong because 0 means cache is valid. - Handle_cap_grant() calls ceph_queue_revalidate() if client already has CAP_FILE_CACHE. This code is confusing. Client should revalidate the cache each time it got CAP_FILE_CACHE anew. - In Handle_cap_grant(), fscache_invalidate() is called if MDS revokes CAP_FILE_CACHE. This is inconsistency with the case that inode get evicted. In the later case, the cache is not discarded. Client may use the cache when inode is reloaded. This patch moves the fscache revalidation into ceph_get_caps(). Client revalidates the cache after it gets CAP_FILE_CACHE. i_rdcache_gen should keep constance while CAP_FILE_CACHE is used. If i_fscache_gen is not equal to i_rdcache_gen, client needs to check cache's consistency. Signed-off-by: Yan, Zheng --- fs/ceph/cache.c | 84 ++++++++++--------------------------------------- fs/ceph/cache.h | 20 ++++++++++-- fs/ceph/caps.c | 16 ++++------ fs/ceph/super.h | 4 +-- 4 files changed, 41 insertions(+), 83 deletions(-) diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index c19db6afd0cc..5b3f4828f214 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -69,15 +69,8 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, &ceph_fscache_fsid_object_def, fsc, true); - - if (fsc->fscache == NULL) { + if (!fsc->fscache) pr_err("Unable to resgister fsid: %p fscache cookie", fsc); - return 0; - } - - fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1); - if (fsc->revalidate_wq == NULL) - return -ENOMEM; return 0; } @@ -260,8 +253,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int static inline bool cache_valid(struct ceph_inode_info *ci) { - return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && - (ci->i_fscache_gen == ci->i_rdcache_gen)); + return ci->i_fscache_gen == ci->i_rdcache_gen; } @@ -354,69 +346,27 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) { - if (fsc->revalidate_wq) - destroy_workqueue(fsc->revalidate_wq); - fscache_relinquish_cookie(fsc->fscache, 0); fsc->fscache = NULL; } -static void ceph_revalidate_work(struct work_struct *work) +/* + * caller should hold CEPH_CAP_FILE_{RD,CACHE} + */ +void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) { - int issued; - u32 orig_gen; - struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, - i_revalidate_work); - struct inode *inode = &ci->vfs_inode; - - spin_lock(&ci->i_ceph_lock); - issued = __ceph_caps_issued(ci, NULL); - orig_gen = ci->i_rdcache_gen; - spin_unlock(&ci->i_ceph_lock); - - if (!(issued & CEPH_CAP_FILE_CACHE)) { - dout("revalidate_work lost cache before validation %p\n", - inode); - goto out; - } - - if (!fscache_check_consistency(ci->fscache)) - fscache_invalidate(ci->fscache); - - spin_lock(&ci->i_ceph_lock); - /* Update the new valid generation (backwards sanity check too) */ - if (orig_gen > ci->i_fscache_gen) { - ci->i_fscache_gen = orig_gen; - } - spin_unlock(&ci->i_ceph_lock); - -out: - iput(&ci->vfs_inode); -} - -void ceph_queue_revalidate(struct inode *inode) -{ - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); - struct ceph_inode_info *ci = ceph_inode(inode); - - if (fsc->revalidate_wq == NULL || ci->fscache == NULL) + if (cache_valid(ci)) return; - ihold(inode); - - if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq, - &ci->i_revalidate_work)) { - dout("ceph_queue_revalidate %p\n", inode); - } else { - dout("ceph_queue_revalidate %p failed\n)", inode); - iput(inode); + /* resue i_truncate_mutex. There should be no pending + * truncate while the caller holds CEPH_CAP_FILE_RD */ + mutex_lock(&ci->i_truncate_mutex); + if (!cache_valid(ci)) { + if (fscache_check_consistency(ci->fscache)) + fscache_invalidate(ci->fscache); + spin_lock(&ci->i_ceph_lock); + ci->i_fscache_gen = ci->i_rdcache_gen; + spin_unlock(&ci->i_ceph_lock); } -} - -void ceph_fscache_inode_init(struct ceph_inode_info *ci) -{ - ci->fscache = NULL; - /* The first load is verifed cookie open time */ - ci->i_fscache_gen = 1; - INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work); + mutex_unlock(&ci->i_truncate_mutex); } diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index dfe9b5823bd3..7e72c7594f0c 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -34,10 +34,10 @@ void ceph_fscache_unregister(void); int ceph_fscache_register_fs(struct ceph_fs_client* fsc); void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); -void ceph_fscache_inode_init(struct ceph_inode_info *ci); void ceph_fscache_register_inode_cookie(struct inode *inode); void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp); +void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci); int ceph_readpage_from_fscache(struct inode *inode, struct page *page); int ceph_readpages_from_fscache(struct inode *inode, @@ -46,7 +46,12 @@ int ceph_readpages_from_fscache(struct inode *inode, unsigned *nr_pages); void ceph_readpage_to_fscache(struct inode *inode, struct page *page); void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); -void ceph_queue_revalidate(struct inode *inode); + +static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) +{ + ci->fscache = NULL; + ci->i_fscache_gen = 0; +} static inline void ceph_fscache_invalidate(struct inode *inode) { @@ -82,6 +87,11 @@ static inline void ceph_fscache_readpages_cancel(struct inode *inode, return fscache_readpages_cancel(ci->fscache, pages); } +static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) +{ + ci->i_fscache_gen = ci->i_rdcache_gen - 1; +} + #else static inline int ceph_fscache_register(void) @@ -119,6 +129,10 @@ static inline void ceph_fscache_file_set_cookie(struct inode *inode, { } +static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) +{ +} + static inline void ceph_fscache_uncache_page(struct inode *inode, struct page *pages) { @@ -167,7 +181,7 @@ static inline void ceph_fscache_readpages_cancel(struct inode *inode, { } -static inline void ceph_queue_revalidate(struct inode *inode) +static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) { } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7bdf7d59a36d..6f60d0a3d0f9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2393,6 +2393,9 @@ again: snap_rwsem_locked = true; } *got = need | (have & want); + if ((need & CEPH_CAP_FILE_RD) && + !(*got & CEPH_CAP_FILE_CACHE)) + ceph_disable_fscache_readpage(ci); __take_cap_refs(ci, *got, true); ret = 1; } @@ -2554,6 +2557,9 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, break; } + if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) + ceph_fscache_revalidate_cookie(ci); + *got = _got; return 0; } @@ -2795,7 +2801,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, bool writeback = false; bool queue_trunc = false; bool queue_invalidate = false; - bool queue_revalidate = false; bool deleted_inode = false; bool fill_inline = false; @@ -2837,8 +2842,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, ci->i_rdcache_revoking = ci->i_rdcache_gen; } } - - ceph_fscache_invalidate(inode); } /* side effects now are allowed */ @@ -2880,11 +2883,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, } } - /* Do we need to revalidate our fscache cookie. Don't bother on the - * first cache cap as we already validate at cookie creation time. */ - if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) - queue_revalidate = true; - if (newcaps & CEPH_CAP_ANY_RD) { /* ctime/mtime/atime? */ ceph_decode_timespec(&mtime, &grant->mtime); @@ -2995,8 +2993,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (queue_trunc) ceph_queue_vmtruncate(inode); - else if (queue_revalidate) - ceph_queue_revalidate(inode); if (writeback) /* diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0130a8592191..0168b49fb6ad 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -103,7 +103,6 @@ struct ceph_fs_client { #ifdef CONFIG_CEPH_FSCACHE struct fscache_cookie *fscache; - struct workqueue_struct *revalidate_wq; #endif }; @@ -360,8 +359,7 @@ struct ceph_inode_info { #ifdef CONFIG_CEPH_FSCACHE struct fscache_cookie *fscache; - u32 i_fscache_gen; /* sequence, for delayed fscache validate */ - struct work_struct i_revalidate_work; + u32 i_fscache_gen; #endif struct inode vfs_inode; /* at end */ }; From f6973c09490c919398fc87d9c6bec9c0b670c4c4 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 20 May 2016 16:57:29 +0800 Subject: [PATCH 10/10] ceph: use i_version to check validity of fscache Signed-off-by: Yan, Zheng --- fs/ceph/cache.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 5b3f4828f214..238c55b01723 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -25,6 +25,7 @@ #include "cache.h" struct ceph_aux_inode { + u64 version; struct timespec mtime; loff_t size; }; @@ -98,6 +99,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, const struct inode* inode = &ci->vfs_inode; memset(&aux, 0, sizeof(aux)); + aux.version = ci->i_version; aux.mtime = inode->i_mtime; aux.size = i_size_read(inode); @@ -124,6 +126,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( return FSCACHE_CHECKAUX_OBSOLETE; memset(&aux, 0, sizeof(aux)); + aux.version = ci->i_version; aux.mtime = inode->i_mtime; aux.size = i_size_read(inode);