diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9712fad82bc6..1278098624e6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1191,14 +1191,19 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev, static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) { struct rbd_device *dev = (struct rbd_device *)data; + int rc; + if (!dev) return; dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, notify_id, (int)opcode); mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - __rbd_update_snaps(dev); + rc = __rbd_update_snaps(dev); mutex_unlock(&ctl_mutex); + if (rc) + pr_warning(DRV_NAME "%d got notification but failed to update" + " snaps: %d\n", dev->major, rc); rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name); } @@ -1597,7 +1602,7 @@ static int rbd_header_add_snap(struct rbd_device *dev, int name_len = strlen(snap_name); u64 new_snapid; int ret; - void *data, *data_start, *data_end; + void *data, *p, *e; u64 ver; /* we should create a snapshot only if we're pointing at the head */ @@ -1614,16 +1619,16 @@ static int rbd_header_add_snap(struct rbd_device *dev, if (!data) return -ENOMEM; - data_start = data; - data_end = data + name_len + 16; + p = data; + e = data + name_len + 16; - ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad); - ceph_encode_64_safe(&data, data_end, new_snapid, bad); + ceph_encode_string_safe(&p, e, snap_name, name_len, bad); + ceph_encode_64_safe(&p, e, new_snapid, bad); ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add", - data_start, data - data_start, &ver); + data, p - data, &ver); - kfree(data_start); + kfree(data); if (ret < 0) return ret; @@ -1659,6 +1664,9 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev) if (ret < 0) return ret; + /* resized? */ + set_capacity(rbd_dev->disk, h.image_size / 512ULL); + down_write(&rbd_dev->header.snap_rwsem); snap_seq = rbd_dev->header.snapc->seq; @@ -1716,7 +1724,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (!disk) goto out; - sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id); + snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d", + rbd_dev->id); disk->major = rbd_dev->major; disk->first_minor = 0; disk->fops = &rbd_bd_ops; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 38b8ab554924..33da49dc3cc6 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -848,7 +848,8 @@ get_more_pages: op->payload_len = cpu_to_le32(len); req->r_request->hdr.data_len = cpu_to_le32(len); - ceph_osdc_start_request(&fsc->client->osdc, req, true); + rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); + BUG_ON(rc); req = NULL; /* continue? */ @@ -880,8 +881,6 @@ release_pvec_pages: out: if (req) ceph_osdc_put_request(req); - if (rc > 0) - rc = 0; /* vfs expects us to return 0 */ ceph_put_snap_context(snapc); dout("writepages done, rc = %d\n", rc); return rc; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2a5404c1c42f..1f72b00447c4 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -569,7 +569,8 @@ retry: list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; spin_unlock(&session->s_cap_lock); - } + } else if (new_cap) + ceph_put_cap(mdsc, new_cap); if (!ci->i_snap_realm) { /* @@ -2634,6 +2635,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, struct ceph_mds_session *session, int *open_target_sessions) { + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; unsigned mseq = le32_to_cpu(ex->migrate_seq); @@ -2670,6 +2672,19 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, * export targets, so that we get the matching IMPORT */ *open_target_sessions = 1; + + /* + * we can't flush dirty caps that we've seen the + * EXPORT but no IMPORT for + */ + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p to cap_dirty_migrating\n", + inode); + list_move(&ci->i_dirty_item, + &mdsc->cap_dirty_migrating); + } + spin_unlock(&mdsc->cap_dirty_lock); } __ceph_remove_cap(cap); } @@ -2707,6 +2722,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, ci->i_cap_exporting_issued = 0; ci->i_cap_exporting_mseq = 0; ci->i_cap_exporting_mds = -1; + + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p back to cap_dirty\n", inode); + list_move(&ci->i_dirty_item, &mdsc->cap_dirty); + } + spin_unlock(&mdsc->cap_dirty_lock); } else { dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", inode, ci, mds, mseq); @@ -2910,38 +2932,16 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) */ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) { - struct ceph_inode_info *ci, *nci = NULL; - struct inode *inode, *ninode = NULL; - struct list_head *p, *n; + struct ceph_inode_info *ci; + struct inode *inode; dout("flush_dirty_caps\n"); spin_lock(&mdsc->cap_dirty_lock); - list_for_each_safe(p, n, &mdsc->cap_dirty) { - if (nci) { - ci = nci; - inode = ninode; - ci->i_ceph_flags &= ~CEPH_I_NOFLUSH; - dout("flush_dirty_caps inode %p (was next inode)\n", - inode); - } else { - ci = list_entry(p, struct ceph_inode_info, - i_dirty_item); - inode = igrab(&ci->vfs_inode); - BUG_ON(!inode); - dout("flush_dirty_caps inode %p\n", inode); - } - if (n != &mdsc->cap_dirty) { - nci = list_entry(n, struct ceph_inode_info, - i_dirty_item); - ninode = igrab(&nci->vfs_inode); - BUG_ON(!ninode); - nci->i_ceph_flags |= CEPH_I_NOFLUSH; - dout("flush_dirty_caps next inode %p, noflush\n", - ninode); - } else { - nci = NULL; - ninode = NULL; - } + while (!list_empty(&mdsc->cap_dirty)) { + ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, + i_dirty_item); + inode = igrab(&ci->vfs_inode); + dout("flush_dirty_caps %p\n", inode); spin_unlock(&mdsc->cap_dirty_lock); if (inode) { ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, @@ -2951,6 +2951,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) spin_lock(&mdsc->cap_dirty_lock); } spin_unlock(&mdsc->cap_dirty_lock); + dout("flush_dirty_caps done\n"); } /* diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 1a867a3601ae..33729e822bb9 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -360,7 +360,7 @@ more: rinfo = &fi->last_readdir->r_reply_info; dout("readdir frag %x num %d off %d chunkoff %d\n", frag, rinfo->dir_nr, off, fi->offset); - while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) { + while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { u64 pos = ceph_make_fpos(frag, off); struct ceph_mds_reply_inode *in = rinfo->dir_in[off - fi->offset].in; @@ -1066,16 +1066,17 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); int left; + const int bufsize = 1024; if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) return -EISDIR; if (!cf->dir_info) { - cf->dir_info = kmalloc(1024, GFP_NOFS); + cf->dir_info = kmalloc(bufsize, GFP_NOFS); if (!cf->dir_info) return -ENOMEM; cf->dir_info_len = - sprintf(cf->dir_info, + snprintf(cf->dir_info, bufsize, "entries: %20lld\n" " files: %20lld\n" " subdirs: %20lld\n" diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e41056174bf8..a610d3d67488 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -86,6 +86,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, static struct dentry *__fh_to_dentry(struct super_block *sb, struct ceph_nfs_fh *fh) { + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct inode *inode; struct dentry *dentry; struct ceph_vino vino; @@ -95,8 +96,24 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, vino.ino = fh->ino; vino.snap = CEPH_NOSNAP; inode = ceph_find_inode(sb, vino); - if (!inode) - return ERR_PTR(-ESTALE); + if (!inode) { + struct ceph_mds_request *req; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + + req->r_ino1 = vino; + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + igrab(inode); + ceph_mdsc_put_request(req); + if (!inode) + return ERR_PTR(-ESTALE); + } dentry = d_obtain_alias(inode); if (IS_ERR(dentry)) { @@ -148,8 +165,10 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb, snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); req->r_num_caps = 1; err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + igrab(inode); ceph_mdsc_put_request(req); - inode = ceph_find_inode(sb, vino); if (!inode) return ERR_PTR(err ? err : -ESTALE); } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d0fae4ce9ba5..79743d146be6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -578,6 +578,7 @@ static void __register_request(struct ceph_mds_client *mdsc, if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); + ihold(dir); spin_lock(&ci->i_unsafe_lock); req->r_unsafe_dir = dir; list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); @@ -598,6 +599,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc, spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_dir_item); spin_unlock(&ci->i_unsafe_lock); + + iput(req->r_unsafe_dir); + req->r_unsafe_dir = NULL; } ceph_mdsc_put_request(req); @@ -2691,7 +2695,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, { struct super_block *sb = mdsc->fsc->sb; struct inode *inode; - struct ceph_inode_info *ci; struct dentry *parent, *dentry; struct ceph_dentry_info *di; int mds = session->s_mds; @@ -2728,7 +2731,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, dout("handle_lease no inode %llx\n", vino.ino); goto release; } - ci = ceph_inode(inode); /* dentry */ parent = d_find_alias(inode); @@ -3002,6 +3004,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->snap_flush_lock); mdsc->cap_flush_seq = 0; INIT_LIST_HEAD(&mdsc->cap_dirty); + INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); mdsc->num_cap_flushing = 0; spin_lock_init(&mdsc->cap_dirty_lock); init_waitqueue_head(&mdsc->cap_flushing_wq); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4e3a9cc0bba6..7d8a0d662d56 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -278,6 +278,7 @@ struct ceph_mds_client { u64 cap_flush_seq; struct list_head cap_dirty; /* inodes with dirty caps */ + struct list_head cap_dirty_migrating; /* ...that are migration... */ int num_cap_flushing; /* # caps we are flushing */ spinlock_t cap_dirty_lock; /* protects above items */ wait_queue_head_t cap_flushing_wq; diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index b8e995fbd867..b8c60694b2b0 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -313,6 +313,7 @@ enum { CEPH_MDS_OP_GETATTR = 0x00101, CEPH_MDS_OP_LOOKUPHASH = 0x00102, CEPH_MDS_OP_LOOKUPPARENT = 0x00103, + CEPH_MDS_OP_LOOKUPINO = 0x00104, CEPH_MDS_OP_SETXATTR = 0x01105, CEPH_MDS_OP_RMXATTR = 0x01106, diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e15a82ccc05f..78b55f49de7c 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -76,7 +76,8 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss) break; default: - sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family); + snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %d)", + (int)ss->ss_family); } return s; @@ -598,7 +599,7 @@ static void prepare_write_keepalive(struct ceph_connection *con) * Connection negotiation. */ -static void prepare_connect_authorizer(struct ceph_connection *con) +static int prepare_connect_authorizer(struct ceph_connection *con) { void *auth_buf; int auth_len = 0; @@ -612,13 +613,20 @@ static void prepare_connect_authorizer(struct ceph_connection *con) con->auth_retry); mutex_lock(&con->mutex); + if (test_bit(CLOSED, &con->state) || + test_bit(OPENING, &con->state)) + return -EAGAIN; + con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); con->out_connect.authorizer_len = cpu_to_le32(auth_len); - con->out_kvec[con->out_kvec_left].iov_base = auth_buf; - con->out_kvec[con->out_kvec_left].iov_len = auth_len; - con->out_kvec_left++; - con->out_kvec_bytes += auth_len; + if (auth_len) { + con->out_kvec[con->out_kvec_left].iov_base = auth_buf; + con->out_kvec[con->out_kvec_left].iov_len = auth_len; + con->out_kvec_left++; + con->out_kvec_bytes += auth_len; + } + return 0; } /* @@ -640,9 +648,9 @@ static void prepare_write_banner(struct ceph_messenger *msgr, set_bit(WRITE_PENDING, &con->state); } -static void prepare_write_connect(struct ceph_messenger *msgr, - struct ceph_connection *con, - int after_banner) +static int prepare_write_connect(struct ceph_messenger *msgr, + struct ceph_connection *con, + int after_banner) { unsigned global_seq = get_global_seq(con->msgr, 0); int proto; @@ -683,7 +691,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, con->out_more = 0; set_bit(WRITE_PENDING, &con->state); - prepare_connect_authorizer(con); + return prepare_connect_authorizer(con); } @@ -1065,8 +1073,10 @@ static void addr_set_port(struct sockaddr_storage *ss, int p) switch (ss->ss_family) { case AF_INET: ((struct sockaddr_in *)ss)->sin_port = htons(p); + break; case AF_INET6: ((struct sockaddr_in6 *)ss)->sin6_port = htons(p); + break; } } @@ -1216,6 +1226,7 @@ static int process_connect(struct ceph_connection *con) u64 sup_feat = con->msgr->supported_features; u64 req_feat = con->msgr->required_features; u64 server_feat = le64_to_cpu(con->in_reply.features); + int ret; dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@ -1250,7 +1261,9 @@ static int process_connect(struct ceph_connection *con) return -1; } con->auth_retry = 1; - prepare_write_connect(con->msgr, con, 0); + ret = prepare_write_connect(con->msgr, con, 0); + if (ret < 0) + return ret; prepare_read_connect(con); break; @@ -1277,6 +1290,9 @@ static int process_connect(struct ceph_connection *con) if (con->ops->peer_reset) con->ops->peer_reset(con); mutex_lock(&con->mutex); + if (test_bit(CLOSED, &con->state) || + test_bit(OPENING, &con->state)) + return -EAGAIN; break; case CEPH_MSGR_TAG_RETRY_SESSION: @@ -1341,7 +1357,9 @@ static int process_connect(struct ceph_connection *con) * to WAIT. This shouldn't happen if we are the * client. */ - pr_err("process_connect peer connecting WAIT\n"); + pr_err("process_connect got WAIT as client\n"); + con->error_msg = "protocol error, got WAIT as client"; + return -1; default: pr_err("connect protocol error, will retry\n"); @@ -1810,6 +1828,17 @@ static int try_read(struct ceph_connection *con) more: dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, con->in_base_pos); + + /* + * process_connect and process_message drop and re-take + * con->mutex. make sure we handle a racing close or reopen. + */ + if (test_bit(CLOSED, &con->state) || + test_bit(OPENING, &con->state)) { + ret = -EAGAIN; + goto out; + } + if (test_bit(CONNECTING, &con->state)) { if (!test_bit(NEGOTIATING, &con->state)) { dout("try_read connecting\n"); @@ -1938,8 +1967,10 @@ static void con_work(struct work_struct *work) { struct ceph_connection *con = container_of(work, struct ceph_connection, work.work); + int ret; mutex_lock(&con->mutex); +restart: if (test_and_clear_bit(BACKOFF, &con->state)) { dout("con_work %p backing off\n", con); if (queue_delayed_work(ceph_msgr_wq, &con->work, @@ -1969,18 +2000,31 @@ static void con_work(struct work_struct *work) con_close_socket(con); } - if (test_and_clear_bit(SOCK_CLOSED, &con->state) || - try_read(con) < 0 || - try_write(con) < 0) { - mutex_unlock(&con->mutex); - ceph_fault(con); /* error/fault path */ - goto done_unlocked; - } + if (test_and_clear_bit(SOCK_CLOSED, &con->state)) + goto fault; + + ret = try_read(con); + if (ret == -EAGAIN) + goto restart; + if (ret < 0) + goto fault; + + ret = try_write(con); + if (ret == -EAGAIN) + goto restart; + if (ret < 0) + goto fault; done: mutex_unlock(&con->mutex); done_unlocked: con->ops->put(con); + return; + +fault: + mutex_unlock(&con->mutex); + ceph_fault(con); /* error/fault path */ + goto done_unlocked; } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 6b5dda1cb5df..6ea2b892f44b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -124,7 +124,7 @@ static void calc_layout(struct ceph_osd_client *osdc, ceph_calc_raw_layout(osdc, layout, vino.snap, off, plen, &bno, req, op); - sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno); + snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); req->r_oid_len = strlen(req->r_oid); } @@ -1421,6 +1421,15 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) done: downgrade_write(&osdc->map_sem); ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); + + /* + * subscribe to subsequent osdmap updates if full to ensure + * we find out when we are no longer full and stop returning + * ENOSPC. + */ + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) + ceph_monc_request_next_osdmap(&osdc->client->monc); + send_queued(osdc); up_read(&osdc->map_sem); wake_up_all(&osdc->client->auth_wq); @@ -1677,8 +1686,14 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, */ if (req->r_sent == 0) { rc = __map_request(osdc, req); - if (rc < 0) + if (rc < 0) { + if (nofail) { + dout("osdc_start_request failed map, " + " will retry %lld\n", req->r_tid); + rc = 0; + } goto out_unlock; + } if (req->r_osd == NULL) { dout("send_request %p no up osds in pg\n", req); ceph_monc_request_next_osdmap(&osdc->client->monc); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 71603ac3dff5..e97c3588c3ec 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -765,7 +765,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } map->epoch++; - map->modified = map->modified; + map->modified = modified; if (newcrush) { if (map->crush) crush_destroy(map->crush); @@ -830,15 +830,20 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, map->osd_addr[osd] = addr; } - /* new_down */ + /* new_state */ ceph_decode_32_safe(p, end, len, bad); while (len--) { u32 osd; + u8 xorstate; ceph_decode_32_safe(p, end, osd, bad); + xorstate = **(u8 **)p; (*p)++; /* clean flag */ - pr_info("osd%d down\n", osd); + if (xorstate == 0) + xorstate = CEPH_OSD_UP; + if (xorstate & CEPH_OSD_UP) + pr_info("osd%d down\n", osd); if (osd < map->max_osd) - map->osd_state[osd] &= ~CEPH_OSD_UP; + map->osd_state[osd] ^= xorstate; } /* new_weight */