From 3b663780347ce532b08be1c859b1df14f0eea4c8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 18 May 2011 16:12:12 -0700 Subject: [PATCH 01/23] ceph: take reference on mds request r_unsafe_dir We put ourselves on an inode list for the parent directory of metadata operations so that an fsync on the directory will wait for metadata updates to commit to disk. We weren't holding a reference to that directory, however, and under certain workloads (fsstress in this case) the directory can go away. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d0fae4ce9ba5..ebce88ab3982 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -578,6 +578,7 @@ static void __register_request(struct ceph_mds_client *mdsc, if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); + ihold(dir); spin_lock(&ci->i_unsafe_lock); req->r_unsafe_dir = dir; list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); @@ -598,6 +599,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc, spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_dir_item); spin_unlock(&ci->i_unsafe_lock); + + iput(req->r_unsafe_dir); + req->r_unsafe_dir = NULL; } ceph_mdsc_put_request(req); From 0da5d70369e87f80adf794080cfff1ca15a34198 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 19 May 2011 11:21:05 -0700 Subject: [PATCH 02/23] libceph: handle connection reopen race with callbacks If a connection is closed and/or reopened (ceph_con_close, ceph_con_open) it can race with a callback. con_work does various state checks for closed or reopened sockets at the beginning, but drops con->mutex before making callbacks. We need to check for state bit changes after retaking the lock to ensure we restart con_work and execute those CLOSED/OPENING tests or else we may end up operating under stale assumptions. In Jim's case, this was causing 'bad tag' errors. There are four cases where we re-take the con->mutex inside con_work: catch them all and return EAGAIN from try_{read,write} so that we can restart con_work. Reported-by: Jim Schutt Tested-by: Jim Schutt Signed-off-by: Sage Weil --- net/ceph/messenger.c | 64 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 13 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e15a82ccc05f..b140dd3515de 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -598,7 +598,7 @@ static void prepare_write_keepalive(struct ceph_connection *con) * Connection negotiation. */ -static void prepare_connect_authorizer(struct ceph_connection *con) +static int prepare_connect_authorizer(struct ceph_connection *con) { void *auth_buf; int auth_len = 0; @@ -612,6 +612,10 @@ static void prepare_connect_authorizer(struct ceph_connection *con) con->auth_retry); mutex_lock(&con->mutex); + if (test_bit(CLOSED, &con->state) || + test_bit(OPENING, &con->state)) + return -EAGAIN; + con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); con->out_connect.authorizer_len = cpu_to_le32(auth_len); @@ -619,6 +623,8 @@ static void prepare_connect_authorizer(struct ceph_connection *con) con->out_kvec[con->out_kvec_left].iov_len = auth_len; con->out_kvec_left++; con->out_kvec_bytes += auth_len; + + return 0; } /* @@ -640,9 +646,9 @@ static void prepare_write_banner(struct ceph_messenger *msgr, set_bit(WRITE_PENDING, &con->state); } -static void prepare_write_connect(struct ceph_messenger *msgr, - struct ceph_connection *con, - int after_banner) +static int prepare_write_connect(struct ceph_messenger *msgr, + struct ceph_connection *con, + int after_banner) { unsigned global_seq = get_global_seq(con->msgr, 0); int proto; @@ -683,7 +689,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, con->out_more = 0; set_bit(WRITE_PENDING, &con->state); - prepare_connect_authorizer(con); + return prepare_connect_authorizer(con); } @@ -1216,6 +1222,7 @@ static int process_connect(struct ceph_connection *con) u64 sup_feat = con->msgr->supported_features; u64 req_feat = con->msgr->required_features; u64 server_feat = le64_to_cpu(con->in_reply.features); + int ret; dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@ -1250,7 +1257,9 @@ static int process_connect(struct ceph_connection *con) return -1; } con->auth_retry = 1; - prepare_write_connect(con->msgr, con, 0); + ret = prepare_write_connect(con->msgr, con, 0); + if (ret < 0) + return ret; prepare_read_connect(con); break; @@ -1277,6 +1286,9 @@ static int process_connect(struct ceph_connection *con) if (con->ops->peer_reset) con->ops->peer_reset(con); mutex_lock(&con->mutex); + if (test_bit(CLOSED, &con->state) || + test_bit(OPENING, &con->state)) + return -EAGAIN; break; case CEPH_MSGR_TAG_RETRY_SESSION: @@ -1810,6 +1822,17 @@ static int try_read(struct ceph_connection *con) more: dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, con->in_base_pos); + + /* + * process_connect and process_message drop and re-take + * con->mutex. make sure we handle a racing close or reopen. + */ + if (test_bit(CLOSED, &con->state) || + test_bit(OPENING, &con->state)) { + ret = -EAGAIN; + goto out; + } + if (test_bit(CONNECTING, &con->state)) { if (!test_bit(NEGOTIATING, &con->state)) { dout("try_read connecting\n"); @@ -1938,8 +1961,10 @@ static void con_work(struct work_struct *work) { struct ceph_connection *con = container_of(work, struct ceph_connection, work.work); + int ret; mutex_lock(&con->mutex); +restart: if (test_and_clear_bit(BACKOFF, &con->state)) { dout("con_work %p backing off\n", con); if (queue_delayed_work(ceph_msgr_wq, &con->work, @@ -1969,18 +1994,31 @@ static void con_work(struct work_struct *work) con_close_socket(con); } - if (test_and_clear_bit(SOCK_CLOSED, &con->state) || - try_read(con) < 0 || - try_write(con) < 0) { - mutex_unlock(&con->mutex); - ceph_fault(con); /* error/fault path */ - goto done_unlocked; - } + if (test_and_clear_bit(SOCK_CLOSED, &con->state)) + goto fault; + + ret = try_read(con); + if (ret == -EAGAIN) + goto restart; + if (ret < 0) + goto fault; + + ret = try_write(con); + if (ret == -EAGAIN) + goto restart; + if (ret < 0) + goto fault; done: mutex_unlock(&con->mutex); done_unlocked: con->ops->put(con); + return; + +fault: + mutex_unlock(&con->mutex); + ceph_fault(con); /* error/fault path */ + goto done_unlocked; } From 1b36698577c1008dc1e63f0bf4b6f3d9deada94a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 14:14:51 -0700 Subject: [PATCH 03/23] libceph: remove unused variable Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ebce88ab3982..c12d2e9a0ec6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2695,7 +2695,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, { struct super_block *sb = mdsc->fsc->sb; struct inode *inode; - struct ceph_inode_info *ci; struct dentry *parent, *dentry; struct ceph_dentry_info *di; int mds = session->s_mds; @@ -2732,7 +2731,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, dout("handle_lease no inode %llx\n", vino.ino); goto release; } - ci = ceph_inode(inode); /* dentry */ parent = d_find_alias(inode); From e8f54ce169125a2e59330fac25ad3c9ac0ce22a5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 14:18:42 -0700 Subject: [PATCH 04/23] libceph: fix uninitialized value when no get_authorizer method is set If there is no get_authorizer method we set the out_kvec to a bogus pointer. The length is also zero in that case, so it doesn't much matter, but it's better not to add the empty item in the first place. Signed-off-by: Sage Weil --- net/ceph/messenger.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b140dd3515de..ce326c806237 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -619,11 +619,12 @@ static int prepare_connect_authorizer(struct ceph_connection *con) con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); con->out_connect.authorizer_len = cpu_to_le32(auth_len); - con->out_kvec[con->out_kvec_left].iov_base = auth_buf; - con->out_kvec[con->out_kvec_left].iov_len = auth_len; - con->out_kvec_left++; - con->out_kvec_bytes += auth_len; - + if (auth_len) { + con->out_kvec[con->out_kvec_left].iov_base = auth_buf; + con->out_kvec[con->out_kvec_left].iov_len = auth_len; + con->out_kvec_left++; + con->out_kvec_bytes += auth_len; + } return 0; } From ae598083015e22d1802617c18d3408971b1bddc0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 14:28:05 -0700 Subject: [PATCH 05/23] ceph: use snprintf for dirstat content We allocate a buffer for rstats if the dirstat option is enabled. Use snprintf. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 1a867a3601ae..53a5eb417856 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1066,16 +1066,17 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); int left; + const int bufsize = 1024; if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) return -EISDIR; if (!cf->dir_info) { - cf->dir_info = kmalloc(1024, GFP_NOFS); + cf->dir_info = kmalloc(bufsize, GFP_NOFS); if (!cf->dir_info) return -ENOMEM; cf->dir_info_len = - sprintf(cf->dir_info, + snprintf(cf->dir_info, bufsize, "entries: %20lld\n" " files: %20lld\n" " subdirs: %20lld\n" From 2dab036b8c349d747f447ac98c8eb40c769727a8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 14:29:51 -0700 Subject: [PATCH 06/23] libceph: use snprintf for formatting object name Signed-off-by: Sage Weil --- net/ceph/osd_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 6b5dda1cb5df..9adbb01d23ad 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -124,7 +124,7 @@ static void calc_layout(struct ceph_osd_client *osdc, ceph_calc_raw_layout(osdc, layout, vino.snap, off, plen, &bno, req, op); - sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno); + snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); req->r_oid_len = strlen(req->r_oid); } From 12a2f643b0e6e791ba61485430d0003eeb3e373c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 14:34:04 -0700 Subject: [PATCH 07/23] libceph: use snprintf for unknown addrs Signed-off-by: Sage Weil --- net/ceph/messenger.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index ce326c806237..3cdbb8853cd7 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -76,7 +76,8 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss) break; default: - sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family); + snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %d)", + (int)ss->ss_family); } return s; From 3540303f87115cbdae6ed2cab44ce6a7676d48d3 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 15:13:23 -0700 Subject: [PATCH 08/23] ceph: fix rare potential cap leak If we grab new_cap, retake the lock, and find we already have a cap now for the given mds, release new_cap. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2a5404c1c42f..591202bc9668 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -569,7 +569,8 @@ retry: list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; spin_unlock(&session->s_cap_lock); - } + } else if (new_cap) + ceph_put_cap(mdsc, new_cap); if (!ci->i_snap_realm) { /* From 31456665a02148353a83fec84d3182700e356588 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 15:18:43 -0700 Subject: [PATCH 09/23] libceph: fix osdmap timestamp assignment Signed-off-by: Sage Weil --- net/ceph/osdmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 71603ac3dff5..94f068dda433 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -765,7 +765,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } map->epoch++; - map->modified = map->modified; + map->modified = modified; if (newcrush) { if (map->crush) crush_destroy(map->crush); From da39822c6565095a0151ccf9d6b95e2ae5612885 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 15:28:11 -0700 Subject: [PATCH 10/23] ceph: fix broken comparison in readdir loop Both off and fi->offset are unsigned, so the difference is always >= 0. Compare them directly instead of the sign of the difference. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 53a5eb417856..33729e822bb9 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -360,7 +360,7 @@ more: rinfo = &fi->last_readdir->r_reply_info; dout("readdir frag %x num %d off %d chunkoff %d\n", frag, rinfo->dir_nr, off, fi->offset); - while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) { + while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { u64 pos = ceph_make_fpos(frag, off); struct ceph_mds_reply_inode *in = rinfo->dir_in[off - fi->offset].in; From 04177882265bc5014300a631e7384f8fe6b6aa0f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 15:33:17 -0700 Subject: [PATCH 11/23] libceph: fix TAG_WAIT case If we get a WAIT as a client something went wrong; error out. And don't fall through to an unrelated case. Signed-off-by: Sage Weil --- net/ceph/messenger.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3cdbb8853cd7..35c0000a658e 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1355,7 +1355,9 @@ static int process_connect(struct ceph_connection *con) * to WAIT. This shouldn't happen if we are the * client. */ - pr_err("process_connect peer connecting WAIT\n"); + pr_err("process_connect got WAIT as client\n"); + con->error_msg = "protocol error, got WAIT as client"; + return -1; default: pr_err("connect protocol error, will retry\n"); From a2a79609c044d3ddb540671d5029a41c90c57251 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 15:34:24 -0700 Subject: [PATCH 12/23] libceph: add missing breaks in addr_set_port Signed-off-by: Sage Weil --- net/ceph/messenger.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 35c0000a658e..78b55f49de7c 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1073,8 +1073,10 @@ static void addr_set_port(struct sockaddr_storage *ss, int p) switch (ss->ss_family) { case AF_INET: ((struct sockaddr_in *)ss)->sin_port = htons(p); + break; case AF_INET6: ((struct sockaddr_in6 *)ss)->sin6_port = htons(p); + break; } } From 6b4a3b517a767c483d16a200730b2967e0e23b83 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 15:43:48 -0700 Subject: [PATCH 13/23] ceph: remove useless check rc is only ever 0 or negative in this method. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 38b8ab554924..54967268a340 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -880,8 +880,6 @@ release_pvec_pages: out: if (req) ceph_osdc_put_request(req); - if (rc > 0) - rc = 0; /* vfs expects us to return 0 */ ceph_put_snap_context(snapc); dout("writepages done, rc = %d\n", rc); return rc; From 9d6fcb081a4770c3772c51c59c7251c22716d7bb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 15:48:16 -0700 Subject: [PATCH 14/23] ceph: check return value for start_request in writepages Since we pass the nofail arg, we should never get an error; BUG if we do. (And fix the function to not return an error if __map_request fails.) Signed-off-by: Sage Weil --- fs/ceph/addr.c | 3 ++- net/ceph/osd_client.c | 8 +++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 54967268a340..33da49dc3cc6 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -848,7 +848,8 @@ get_more_pages: op->payload_len = cpu_to_le32(len); req->r_request->hdr.data_len = cpu_to_le32(len); - ceph_osdc_start_request(&fsc->client->osdc, req, true); + rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); + BUG_ON(rc); req = NULL; /* continue? */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 9adbb01d23ad..caa092eb0009 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1677,8 +1677,14 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, */ if (req->r_sent == 0) { rc = __map_request(osdc, req); - if (rc < 0) + if (rc < 0) { + if (nofail) { + dout("osdc_start_request failed map, " + " will retry %lld\n", req->r_tid); + rc = 0; + } goto out_unlock; + } if (req->r_osd == NULL) { dout("send_request %p no up osds in pg\n", req); ceph_monc_request_next_osdmap(&osdc->client->monc); From 13143d2d1cffd243a6d778000b02ab4938ac751a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 16:08:30 -0700 Subject: [PATCH 15/23] rbd: warn on update_snaps failure on notify Signed-off-by: Sage Weil --- drivers/block/rbd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9712fad82bc6..78d0011a7556 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1191,14 +1191,19 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev, static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) { struct rbd_device *dev = (struct rbd_device *)data; + int rc; + if (!dev) return; dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, notify_id, (int)opcode); mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - __rbd_update_snaps(dev); + rc = __rbd_update_snaps(dev); mutex_unlock(&ctl_mutex); + if (rc) + pr_warning(DRV_NAME "%d got notification but failed to update" + " snaps: %d\n", dev->major, rc); rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name); } From 916d4d672779de8e42346fff338617c7b841e8e5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 16:10:50 -0700 Subject: [PATCH 16/23] rbd: cleanup: make kfree match kmalloc Signed-off-by: Sage Weil --- drivers/block/rbd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 78d0011a7556..0d80967e140c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1602,7 +1602,7 @@ static int rbd_header_add_snap(struct rbd_device *dev, int name_len = strlen(snap_name); u64 new_snapid; int ret; - void *data, *data_start, *data_end; + void *data, *p, *e; u64 ver; /* we should create a snapshot only if we're pointing at the head */ @@ -1619,16 +1619,16 @@ static int rbd_header_add_snap(struct rbd_device *dev, if (!data) return -ENOMEM; - data_start = data; - data_end = data + name_len + 16; + p = data; + e = data + name_len + 16; - ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad); - ceph_encode_64_safe(&data, data_end, new_snapid, bad); + ceph_encode_string_safe(&p, e, snap_name, name_len, bad); + ceph_encode_64_safe(&p, e, new_snapid, bad); ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add", - data_start, data - data_start, &ver); + data, p - data, &ver); - kfree(data_start); + kfree(data); if (ret < 0) return ret; From aedfec59eed37d1ff7ce09b303b668234e9a7f8e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 20:57:03 -0700 Subject: [PATCH 17/23] rbd: use snprintf for disk->disk_name Signed-off-by: Sage Weil --- drivers/block/rbd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0d80967e140c..d833d02b321f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1721,7 +1721,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (!disk) goto out; - sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id); + snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d", + rbd_dev->id); disk->major = rbd_dev->major; disk->first_minor = 0; disk->fops = &rbd_bd_ops; From 3c454cf21645bc96668e286f6352ac2c4c895fa2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 6 Apr 2011 09:31:40 -0700 Subject: [PATCH 18/23] ceph: use LOOKUPINO to make unconnected nfs fh more reliable If we are unable to locate an inode by ino, ask the MDS using the new LOOKUPINO command. Signed-off-by: Sage Weil --- fs/ceph/export.c | 19 +++++++++++++++++-- include/linux/ceph/ceph_fs.h | 1 + 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e41056174bf8..f1828af09912 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -86,6 +86,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, static struct dentry *__fh_to_dentry(struct super_block *sb, struct ceph_nfs_fh *fh) { + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct inode *inode; struct dentry *dentry; struct ceph_vino vino; @@ -95,8 +96,22 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, vino.ino = fh->ino; vino.snap = CEPH_NOSNAP; inode = ceph_find_inode(sb, vino); - if (!inode) - return ERR_PTR(-ESTALE); + if (!inode) { + struct ceph_mds_request *req; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + + req->r_ino1 = vino; + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); + inode = ceph_find_inode(sb, vino); + if (!inode) + return ERR_PTR(-ESTALE); + } dentry = d_obtain_alias(inode); if (IS_ERR(dentry)) { diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index b8e995fbd867..b8c60694b2b0 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -313,6 +313,7 @@ enum { CEPH_MDS_OP_GETATTR = 0x00101, CEPH_MDS_OP_LOOKUPHASH = 0x00102, CEPH_MDS_OP_LOOKUPPARENT = 0x00103, + CEPH_MDS_OP_LOOKUPINO = 0x00104, CEPH_MDS_OP_SETXATTR = 0x01105, CEPH_MDS_OP_RMXATTR = 0x01106, From 45e3d3eeb6578e523e100622266945ecd71723bb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 6 Apr 2011 09:35:00 -0700 Subject: [PATCH 19/23] ceph: avoid inode lookup on nfs fh reconnect If we get the inode from the MDS, we have a reference in req; don't do a fresh lookup. Signed-off-by: Sage Weil --- fs/ceph/export.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index f1828af09912..a610d3d67488 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -107,8 +107,10 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, req->r_ino1 = vino; req->r_num_caps = 1; err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + igrab(inode); ceph_mdsc_put_request(req); - inode = ceph_find_inode(sb, vino); if (!inode) return ERR_PTR(-ESTALE); } @@ -163,8 +165,10 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb, snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); req->r_num_caps = 1; err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + igrab(inode); ceph_mdsc_put_request(req); - inode = ceph_find_inode(sb, vino); if (!inode) return ERR_PTR(err ? err : -ESTALE); } From 9db4b3e32778400555d5cc6fb61d4058902d37f7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 19 Apr 2011 22:49:06 -0700 Subject: [PATCH 20/23] rbd: handle online resize of underlying rbd image If we get a notification that the image header has changed, check for a change in the image size. Signed-off-by: Sage Weil --- drivers/block/rbd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d833d02b321f..1278098624e6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1664,6 +1664,9 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev) if (ret < 0) return ret; + /* resized? */ + set_capacity(rbd_dev->disk, h.image_size / 512ULL); + down_write(&rbd_dev->header.snap_rwsem); snap_seq = rbd_dev->header.snapc->seq; From 7662d8ff57d2b00ce8f7fe0b60a85efbb2c05652 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 3 May 2011 12:52:05 -0700 Subject: [PATCH 21/23] libceph: handle new osdmap down/state change encoding Old incrementals encode a 0 value (nearly always) when an osd goes down. Change that to allow any state bit(s) to be flipped. Special case 0 to mean flip the CEPH_OSD_UP bit to mimic the old behavior. Signed-off-by: Sage Weil --- net/ceph/osdmap.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 94f068dda433..e97c3588c3ec 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -830,15 +830,20 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, map->osd_addr[osd] = addr; } - /* new_down */ + /* new_state */ ceph_decode_32_safe(p, end, len, bad); while (len--) { u32 osd; + u8 xorstate; ceph_decode_32_safe(p, end, osd, bad); + xorstate = **(u8 **)p; (*p)++; /* clean flag */ - pr_info("osd%d down\n", osd); + if (xorstate == 0) + xorstate = CEPH_OSD_UP; + if (xorstate & CEPH_OSD_UP) + pr_info("osd%d down\n", osd); if (osd < map->max_osd) - map->osd_state[osd] &= ~CEPH_OSD_UP; + map->osd_state[osd] ^= xorstate; } /* new_weight */ From cd634fb6eec72ef8e6dd677546b8d0ffdd2501eb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 09:29:18 -0700 Subject: [PATCH 22/23] libceph: subscribe to osdmap when cluster is full When the cluster is marked full, subscribe to subsequent map updates to ensure we find out promptly when it is no longer full. This will prevent us from spewing ENOSPC for (much) longer than necessary. Signed-off-by: Sage Weil --- net/ceph/osd_client.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index caa092eb0009..6ea2b892f44b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1421,6 +1421,15 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) done: downgrade_write(&osdc->map_sem); ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); + + /* + * subscribe to subsequent osdmap updates if full to ensure + * we find out when we are no longer full and stop returning + * ENOSPC. + */ + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) + ceph_monc_request_next_osdmap(&osdc->client->monc); + send_queued(osdc); up_read(&osdc->map_sem); wake_up_all(&osdc->client->auth_wq); From db3540522e955c1ebb391f4f5324dff4f20ecd09 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 24 May 2011 11:46:31 -0700 Subject: [PATCH 23/23] ceph: fix cap flush race reentrancy In e9964c10 we change cap flushing to do a delicate dance because some inodes on the cap_dirty list could be in a migrating state (got EXPORT but not IMPORT) in which we couldn't actually flush and move from dirty->flushing, breaking the while (!empty) { process first } loop structure. It worked for a single sync thread, but was not reentrant and triggered infinite loops when multiple syncers came along. Instead, move inodes with dirty to a separate cap_dirty_migrating list when in the limbo export-but-no-import state, allowing us to go back to the simple loop structure (which was reentrant). This is cleaner and more robust. Audited the cap_dirty users and this looks fine: list_empty(&ci->i_dirty_item) is still a reliable indicator of whether we have dirty caps (which list we're on is irrelevant) and list_del_init() calls still do the right thing. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 58 ++++++++++++++++++++++---------------------- fs/ceph/mds_client.c | 1 + fs/ceph/mds_client.h | 1 + 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 591202bc9668..1f72b00447c4 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2635,6 +2635,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, struct ceph_mds_session *session, int *open_target_sessions) { + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; unsigned mseq = le32_to_cpu(ex->migrate_seq); @@ -2671,6 +2672,19 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, * export targets, so that we get the matching IMPORT */ *open_target_sessions = 1; + + /* + * we can't flush dirty caps that we've seen the + * EXPORT but no IMPORT for + */ + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p to cap_dirty_migrating\n", + inode); + list_move(&ci->i_dirty_item, + &mdsc->cap_dirty_migrating); + } + spin_unlock(&mdsc->cap_dirty_lock); } __ceph_remove_cap(cap); } @@ -2708,6 +2722,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, ci->i_cap_exporting_issued = 0; ci->i_cap_exporting_mseq = 0; ci->i_cap_exporting_mds = -1; + + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p back to cap_dirty\n", inode); + list_move(&ci->i_dirty_item, &mdsc->cap_dirty); + } + spin_unlock(&mdsc->cap_dirty_lock); } else { dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", inode, ci, mds, mseq); @@ -2911,38 +2932,16 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) */ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) { - struct ceph_inode_info *ci, *nci = NULL; - struct inode *inode, *ninode = NULL; - struct list_head *p, *n; + struct ceph_inode_info *ci; + struct inode *inode; dout("flush_dirty_caps\n"); spin_lock(&mdsc->cap_dirty_lock); - list_for_each_safe(p, n, &mdsc->cap_dirty) { - if (nci) { - ci = nci; - inode = ninode; - ci->i_ceph_flags &= ~CEPH_I_NOFLUSH; - dout("flush_dirty_caps inode %p (was next inode)\n", - inode); - } else { - ci = list_entry(p, struct ceph_inode_info, - i_dirty_item); - inode = igrab(&ci->vfs_inode); - BUG_ON(!inode); - dout("flush_dirty_caps inode %p\n", inode); - } - if (n != &mdsc->cap_dirty) { - nci = list_entry(n, struct ceph_inode_info, - i_dirty_item); - ninode = igrab(&nci->vfs_inode); - BUG_ON(!ninode); - nci->i_ceph_flags |= CEPH_I_NOFLUSH; - dout("flush_dirty_caps next inode %p, noflush\n", - ninode); - } else { - nci = NULL; - ninode = NULL; - } + while (!list_empty(&mdsc->cap_dirty)) { + ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, + i_dirty_item); + inode = igrab(&ci->vfs_inode); + dout("flush_dirty_caps %p\n", inode); spin_unlock(&mdsc->cap_dirty_lock); if (inode) { ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, @@ -2952,6 +2951,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) spin_lock(&mdsc->cap_dirty_lock); } spin_unlock(&mdsc->cap_dirty_lock); + dout("flush_dirty_caps done\n"); } /* diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c12d2e9a0ec6..79743d146be6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3004,6 +3004,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->snap_flush_lock); mdsc->cap_flush_seq = 0; INIT_LIST_HEAD(&mdsc->cap_dirty); + INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); mdsc->num_cap_flushing = 0; spin_lock_init(&mdsc->cap_dirty_lock); init_waitqueue_head(&mdsc->cap_flushing_wq); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4e3a9cc0bba6..7d8a0d662d56 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -278,6 +278,7 @@ struct ceph_mds_client { u64 cap_flush_seq; struct list_head cap_dirty; /* inodes with dirty caps */ + struct list_head cap_dirty_migrating; /* ...that are migration... */ int num_cap_flushing; /* # caps we are flushing */ spinlock_t cap_dirty_lock; /* protects above items */ wait_queue_head_t cap_flushing_wq;