Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull ceph updates from Sage Weil:
 "The bulk of this pull is a series from Alex that refactors and cleans
  up the RBD code to lay the groundwork for supporting the new image
  format and evolving feature set.  There are also some cleanups in
  libceph, and for ceph, fixed validation of file striping layouts
  and a bugfix in the code that handles a shrinking MDS cluster."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (71 commits)
  ceph: avoid 32-bit page index overflow
  ceph: return EIO on invalid layout on GET_DATALOC ioctl
  rbd: BUG on invalid layout
  ceph: propagate layout error on osd request creation
  libceph: check for invalid mapping
  ceph: convert to use le32_add_cpu()
  ceph: Fix oops when handling mdsmap that decreases max_mds
  rbd: update remaining header fields for v2
  rbd: get snapshot name for a v2 image
  rbd: get the snapshot context for a v2 image
  rbd: get image features for a v2 image
  rbd: get the object prefix for a v2 rbd image
  rbd: add code to get the size of a v2 rbd image
  rbd: lay out header probe infrastructure
  rbd: encapsulate code that gets snapshot info
  rbd: add an rbd features field
  rbd: don't use index in __rbd_add_snap_dev()
  rbd: kill create_snap sysfs entry
  rbd: define rbd_dev_image_id()
  rbd: define some new format constants
  ...
Committed by Linus Torvalds, 2012-10-08 06:38:18 +09:00 (commit 7035cdf36d)
16 changed files with 1322 additions and 712 deletions

--- a/Documentation/ABI/testing/sysfs-bus-rbd
+++ b/Documentation/ABI/testing/sysfs-bus-rbd

@@ -25,6 +25,10 @@ client_id
 
 	The ceph unique client id that was assigned for this specific session.
 
+features
+
+	A hexadecimal encoding of the feature bits for this image.
+
 major
 
 	The block device major number.
@@ -33,6 +37,11 @@ name
 
 	The name of the rbd image.
 
+image_id
+
+	The unique id for the rbd image.  (For rbd image format 1
+	this is empty.)
+
 pool
 
 	The name of the storage pool where this rbd image resides.
@@ -57,12 +66,6 @@ current_snap
 
 	The current snapshot for which the device is mapped.
 
-create_snap
-
-	Create a snapshot:
-
-	 $ echo <snap-name> > /sys/bus/rbd/devices/<dev-id>/snap_create
-
 snap_*
 
 	A directory per each snapshot
@@ -79,4 +82,7 @@ snap_size
 
 	The size of the image when this snapshot was taken.
 
+snap_features
+
+	A hexadecimal encoding of the feature bits for this snapshot.
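
Note: both feature attributes are plain hex strings, so userspace can decode
them with strtoull(). A minimal sketch, assuming a mapped device with id 0
(the device id and path are illustrative, not part of this diff):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	FILE *f = fopen("/sys/bus/rbd/devices/0/features", "r");
	char buf[32];
	unsigned long long features;

	if (!f || !fgets(buf, sizeof(buf), f)) {
		perror("features");
		return 1;
	}
	fclose(f);

	features = strtoull(buf, NULL, 16);	/* hexadecimal encoding */
	printf("feature bits: 0x%llx\n", features);
	return 0;
}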

--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
(diff suppressed because it is too large)

--- a/drivers/block/rbd_types.h
+++ b/drivers/block/rbd_types.h

@@ -15,15 +15,30 @@
 
 #include <linux/types.h>
 
+/* For format version 2, rbd image 'foo' consists of objects
+ *   rbd_id.foo		- id of image
+ *   rbd_header.<id>	- image metadata
+ *   rbd_data.<id>.0000000000000000
+ *   rbd_data.<id>.0000000000000001
+ *   ...		- data
+ * Clients do not access header data directly in rbd format 2.
+ */
+
+#define RBD_HEADER_PREFIX      "rbd_header."
+#define RBD_DATA_PREFIX        "rbd_data."
+#define RBD_ID_PREFIX          "rbd_id."
+
 /*
- * rbd image 'foo' consists of objects
- *   foo.rbd		- image metadata
- *   foo.00000000
- *   foo.00000001
- *   ...		- data
+ * For format version 1, rbd image 'foo' consists of objects
+ *   foo.rbd		- image metadata
+ *   rb.<idhi>.<idlo>.00000000
+ *   rb.<idhi>.<idlo>.00000001
+ *   ...		- data
+ * There is no notion of a persistent image id in rbd format 1.
  */
 
 #define RBD_SUFFIX		".rbd"
 #define RBD_DIRECTORY           "rbd_directory"
 #define RBD_INFO                "rbd_info"
@@ -47,7 +62,7 @@ struct rbd_image_snap_ondisk {
 
 struct rbd_image_header_ondisk {
 	char text[40];
-	char block_name[24];
+	char object_prefix[24];
 	char signature[4];
 	char version[8];
 	struct {
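
Note: a sketch of the object-naming schemes described in the comments above,
using the prefixes defined in this header. The helper functions and the
example image id are hypothetical; format 1 data objects additionally use the
"rb.<idhi>.<idlo>.<objno>" scheme, which is derived from a block id rather
than the string id shown here:

#include <stdio.h>

#define RBD_SUFFIX		".rbd"
#define RBD_ID_PREFIX		"rbd_id."
#define RBD_HEADER_PREFIX	"rbd_header."
#define RBD_DATA_PREFIX		"rbd_data."

/* Hypothetical helpers; "1014a8b4567" is a made-up image id. */
static void format1_header_name(const char *image)
{
	printf("%s%s\n", image, RBD_SUFFIX);		/* foo.rbd */
}

static void format2_object_names(const char *image, const char *id,
				 unsigned long long objno)
{
	printf("%s%s\n", RBD_ID_PREFIX, image);		/* rbd_id.foo */
	printf("%s%s\n", RBD_HEADER_PREFIX, id);	/* rbd_header.<id> */
	printf("%s%s.%016llx\n", RBD_DATA_PREFIX, id, objno);
}

int main(void)
{
	format1_header_name("foo");
	format2_object_names("foo", "1014a8b4567", 0);
	return 0;
}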

--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c

@@ -205,7 +205,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
 	dout("readpage inode %p file %p page %p index %lu\n",
 	     inode, filp, page, page->index);
 	err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
-				  page->index << PAGE_CACHE_SHIFT, &len,
+				  (u64) page_offset(page), &len,
 				  ci->i_truncate_seq, ci->i_truncate_size,
 				  &page, 1, 0);
 	if (err == -ENOENT)
@@ -286,7 +286,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 	int nr_pages = 0;
 	int ret;
 
-	off = page->index << PAGE_CACHE_SHIFT;
+	off = (u64) page_offset(page);
 
 	/* count pages */
 	next_index = page->index;
@@ -308,8 +308,8 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 				    NULL, 0,
 				    ci->i_truncate_seq, ci->i_truncate_size,
 				    NULL, false, 1, 0);
-	if (!req)
-		return -ENOMEM;
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	/* build page vector */
 	nr_pages = len >> PAGE_CACHE_SHIFT;
@@ -426,7 +426,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	struct ceph_inode_info *ci;
 	struct ceph_fs_client *fsc;
 	struct ceph_osd_client *osdc;
-	loff_t page_off = page->index << PAGE_CACHE_SHIFT;
+	loff_t page_off = page_offset(page);
 	int len = PAGE_CACHE_SIZE;
 	loff_t i_size;
 	int err = 0;
@@ -817,8 +817,7 @@ get_more_pages:
 			/* ok */
 			if (locked_pages == 0) {
 				/* prepare async write request */
-				offset = (unsigned long long)page->index
-					<< PAGE_CACHE_SHIFT;
+				offset = (u64) page_offset(page);
 				len = wsize;
 				req = ceph_osdc_new_request(&fsc->client->osdc,
 					    &ci->i_layout,
@@ -832,8 +831,8 @@ get_more_pages:
 					    ci->i_truncate_size,
 					    &inode->i_mtime, true, 1, 0);
 
-				if (!req) {
-					rc = -ENOMEM;
+				if (IS_ERR(req)) {
+					rc = PTR_ERR(req);
 					unlock_page(page);
 					break;
 				}
@@ -1180,7 +1179,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct page *page = vmf->page;
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
-	loff_t off = page->index << PAGE_CACHE_SHIFT;
+	loff_t off = page_offset(page);
 	loff_t size, len;
 	int ret;

--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c

@@ -1005,7 +1005,7 @@ static void __queue_cap_release(struct ceph_mds_session *session,
 	BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
 
 	head = msg->front.iov_base;
-	head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
+	le32_add_cpu(&head->num, 1);
 	item = msg->front.iov_base + msg->front.iov_len;
 	item->ino = cpu_to_le64(ino);
 	item->cap_id = cpu_to_le64(cap_id);
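
Note: le32_add_cpu() is behavior-neutral here; it wraps the same
decode-add-encode sequence the open-coded version spelled out. A rough
userspace equivalent, using the glibc <endian.h> helpers in place of the
kernel's byteorder macros:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch of what le32_add_cpu(&var, val) boils down to. */
static void le32_add_cpu_sketch(uint32_t *var, uint32_t val)
{
	uint32_t cpu = le32toh(*var);	/* le32_to_cpu() */
	*var = htole32(cpu + val);	/* cpu_to_le32() */
}

int main(void)
{
	uint32_t num = htole32(41);

	le32_add_cpu_sketch(&num, 1);
	printf("%u\n", le32toh(num));	/* prints 42 */
	return 0;
}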

--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c

@@ -536,8 +536,8 @@ more:
 				    do_sync,
 				    ci->i_truncate_seq, ci->i_truncate_size,
 				    &mtime, false, 2, page_align);
-	if (!req)
-		return -ENOMEM;
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	if (file->f_flags & O_DIRECT) {
 		pages = ceph_get_direct_page_vector(data, num_pages, false);

--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c

@@ -187,14 +187,18 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	u64 tmp;
 	struct ceph_object_layout ol;
 	struct ceph_pg pgid;
+	int r;
 
 	/* copy and validate */
 	if (copy_from_user(&dl, arg, sizeof(dl)))
 		return -EFAULT;
 
 	down_read(&osdc->map_sem);
-	ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
-				      &dl.object_no, &dl.object_offset, &olen);
+	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
+					  &dl.object_no, &dl.object_offset,
+					  &olen);
+	if (r < 0)
+		return -EIO;
 	dl.file_offset -= dl.object_offset;
 	dl.object_size = ceph_file_layout_object_size(ci->i_layout);
 	dl.block_size = ceph_file_layout_su(ci->i_layout);

--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c

@@ -2625,7 +2625,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,
 		     ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
 		     session_state_name(s->s_state));
 
-		if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
+		if (i >= newmap->m_max_mds ||
+		    memcmp(ceph_mdsmap_get_addr(oldmap, i),
 			   ceph_mdsmap_get_addr(newmap, i),
 			   sizeof(struct ceph_entity_addr))) {
 			if (s->s_state == CEPH_MDS_SESSION_OPENING) {
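
Note: this is the "Fix oops when handling mdsmap that decreases max_mds" fix,
and it relies on C's short-circuit evaluation: when the MDS cluster shrinks,
index i can lie beyond the new map's address table, so the bound test must
come before the memcmp() that would read it. A schematic with simplified
stand-in types (struct ceph_mdsmap's real layout is not shown in this diff):

#include <string.h>
#include <stdbool.h>

struct addr { unsigned char bytes[16]; };
struct map { int max_mds; struct addr *addrs; };

static bool mds_changed(struct map *oldmap, struct map *newmap, int i)
{
	/* i >= newmap->max_mds is tested first; when it is true, the
	 * memcmp() and its out-of-bounds read are never evaluated.
	 */
	return i >= newmap->max_mds ||
	       memcmp(&oldmap->addrs[i], &newmap->addrs[i],
		      sizeof(struct addr)) != 0;
}

int main(void)
{
	struct addr a[2] = { { { 0 } } };
	struct map oldm = { 2, a }, newm = { 1, a };

	/* mds1 vanished from the shrunk map: must count as changed */
	return mds_changed(&oldm, &newm, 1) ? 0 : 1;
}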

--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c

@@ -307,7 +307,10 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 {
 	struct ceph_mount_options *fsopt;
 	const char *dev_name_end;
-	int err = -ENOMEM;
+	int err;
+
+	if (!dev_name || !*dev_name)
+		return -EINVAL;
 
 	fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
 	if (!fsopt)
@@ -328,21 +331,33 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 	fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
 	fsopt->congestion_kb = default_congestion_kb();
 
-	/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+	/*
+	 * Distinguish the server list from the path in "dev_name".
+	 * Internally we do not include the leading '/' in the path.
+	 *
+	 * "dev_name" will look like:
+	 *     <server_spec>[,<server_spec>...]:[<path>]
+	 * where
+	 *     <server_spec> is <ip>[:<port>]
+	 *     <path> is optional, but if present must begin with '/'
+	 */
+	dev_name_end = strchr(dev_name, '/');
+	if (dev_name_end) {
+		/* skip over leading '/' for path */
+		*path = dev_name_end + 1;
+	} else {
+		/* path is empty */
+		dev_name_end = dev_name + strlen(dev_name);
+		*path = dev_name_end;
+	}
 	err = -EINVAL;
-	if (!dev_name)
-		goto out;
-	*path = strstr(dev_name, ":/");
-	if (*path == NULL) {
-		pr_err("device name is missing path (no :/ in %s)\n",
+	dev_name_end--;		/* back up to ':' separator */
+	if (*dev_name_end != ':') {
+		pr_err("device name is missing path (no : separator in %s)\n",
 		       dev_name);
 		goto out;
 	}
-	dev_name_end = *path;
 	dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
-
-	/* path on server */
-	*path += 2;
 	dout("server path '%s'\n", *path);
 
 	*popt = ceph_parse_options(options, dev_name, dev_name_end,
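
Note: the rewritten parser anchors on the first '/' (the start of the path,
which may now be absent) and then requires a ':' immediately before it,
instead of searching for the literal substring ":/". A standalone sketch of
that logic; the extra lower-bound check is a defensive addition for the
sketch, since the kernel code has already rejected an empty dev_name above:

#include <stdio.h>
#include <string.h>

static int split_dev_name(const char *dev_name,
			  const char **server_end, const char **path)
{
	const char *dev_name_end = strchr(dev_name, '/');

	if (dev_name_end) {
		*path = dev_name_end + 1;	/* skip leading '/' */
	} else {
		dev_name_end = dev_name + strlen(dev_name);
		*path = dev_name_end;		/* empty path */
	}

	dev_name_end--;				/* back up to ':' separator */
	if (dev_name_end < dev_name || *dev_name_end != ':')
		return -1;

	*server_end = dev_name_end;
	return 0;
}

int main(void)
{
	const char *name = "1.2.3.4:6789,1.2.3.5:6789:/subdir/in/fs";
	const char *end, *path;

	if (split_dev_name(name, &end, &path))
		return 1;
	printf("servers '%.*s' path '%s'\n", (int)(end - name), name, path);
	return 0;
}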

--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h

@@ -71,7 +71,6 @@ struct ceph_mon_client {
 	int cur_mon;                       /* last monitor i contacted */
 	unsigned long sub_sent, sub_renew_after;
 	struct ceph_connection con;
-	bool have_fsid;
 
 	/* pending generic requests */
 	struct rb_root generic_request_tree;

--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h

@@ -207,7 +207,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 				 struct ceph_msg *msg);
 
-extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 			struct ceph_file_layout *layout,
 			u64 snapid,
 			u64 off, u64 *plen, u64 *bno,

--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h

@@ -109,9 +109,9 @@ extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
 
 /* calculate mapping of a file extent to an object */
-extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-					  u64 off, u64 *plen,
-					  u64 *bno, u64 *oxoff, u64 *oxlen);
+extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+					 u64 off, u64 *plen,
+					 u64 *bno, u64 *oxoff, u64 *oxlen);
 
 /* calculate mapping of object to a placement group */
 extern int ceph_calc_object_layout(struct ceph_object_layout *ol,

--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c

@@ -637,7 +637,7 @@ bad:
 /*
  * Do a synchronous pool op.
  */
-int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
+static int do_poolop(struct ceph_mon_client *monc, u32 op,
 			u32 pool, u64 snapid,
 			char *buf, int len)
 {
@@ -687,7 +687,7 @@ out:
 int ceph_monc_create_snapid(struct ceph_mon_client *monc,
 			    u32 pool, u64 *snapid)
 {
-	return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
+	return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
 				   pool, 0, (char *)snapid, sizeof(*snapid));
 }
 
@@ -696,7 +696,7 @@ EXPORT_SYMBOL(ceph_monc_create_snapid);
 int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
 			    u32 pool, u64 snapid)
 {
-	return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
+	return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
 				   pool, snapid, 0, 0);
 }
 
@@ -769,7 +769,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
 		monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
 	}
 	monc->monmap->num_mon = num_mon;
-	monc->have_fsid = false;
 	return 0;
 }

--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c

@@ -52,7 +52,7 @@ static int op_has_extent(int op)
 		op == CEPH_OSD_OP_WRITE);
 }
 
-void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 			struct ceph_file_layout *layout,
 			u64 snapid,
 			u64 off, u64 *plen, u64 *bno,
@@ -62,12 +62,15 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
 	u64 orig_len = *plen;
 	u64 objoff, objlen;    /* extent in object */
+	int r;
 
 	reqhead->snapid = cpu_to_le64(snapid);
 
 	/* object extent? */
-	ceph_calc_file_object_mapping(layout, off, plen, bno,
-				      &objoff, &objlen);
+	r = ceph_calc_file_object_mapping(layout, off, plen, bno,
+					  &objoff, &objlen);
+	if (r < 0)
+		return r;
 	if (*plen < orig_len)
 		dout(" skipping last %llu, final file extent %llu~%llu\n",
 		     orig_len - *plen, off, *plen);
@@ -83,7 +86,7 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
 	     *bno, objoff, objlen, req->r_num_pages);
-
+	return 0;
 }
 EXPORT_SYMBOL(ceph_calc_raw_layout);
@@ -112,20 +115,25 @@ EXPORT_SYMBOL(ceph_calc_raw_layout);
  *
  * fill osd op in request message.
  */
-static void calc_layout(struct ceph_osd_client *osdc,
-			struct ceph_vino vino,
-			struct ceph_file_layout *layout,
-			u64 off, u64 *plen,
-			struct ceph_osd_request *req,
-			struct ceph_osd_req_op *op)
+static int calc_layout(struct ceph_osd_client *osdc,
+		       struct ceph_vino vino,
+		       struct ceph_file_layout *layout,
+		       u64 off, u64 *plen,
+		       struct ceph_osd_request *req,
+		       struct ceph_osd_req_op *op)
 {
 	u64 bno;
+	int r;
 
-	ceph_calc_raw_layout(osdc, layout, vino.snap, off,
-			     plen, &bno, req, op);
+	r = ceph_calc_raw_layout(osdc, layout, vino.snap, off,
+				 plen, &bno, req, op);
+	if (r < 0)
+		return r;
 
 	snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno);
 	req->r_oid_len = strlen(req->r_oid);
+
+	return r;
 }
 
 /*
@@ -456,6 +464,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 {
 	struct ceph_osd_req_op ops[3];
 	struct ceph_osd_request *req;
+	int r;
 
 	ops[0].op = opcode;
 	ops[0].extent.truncate_seq = truncate_seq;
@@ -474,10 +483,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 					 use_mempool,
 					 GFP_NOFS, NULL, NULL);
 	if (!req)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	/* calculate max write size */
-	calc_layout(osdc, vino, layout, off, plen, req, ops);
+	r = calc_layout(osdc, vino, layout, off, plen, req, ops);
+	if (r < 0)
+		return ERR_PTR(r);
 	req->r_file_layout = *layout;  /* keep a copy */
 
 	/* in case it differs from natural (file) alignment that
@@ -1920,8 +1931,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
 				    NULL, 0, truncate_seq, truncate_size, NULL,
 				    false, 1, page_align);
-	if (!req)
-		return -ENOMEM;
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	/* it may be a short read due to an object boundary */
 	req->r_pages = pages;
@@ -1963,8 +1974,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 				    snapc, do_sync,
 				    truncate_seq, truncate_size, mtime,
 				    nofail, 1, page_align);
-	if (!req)
-		return -ENOMEM;
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	/* it may be a short write due to an object boundary */
 	req->r_pages = pages;
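
Note: with ceph_osdc_new_request() now returning ERR_PTR() values, the
IS_ERR()/PTR_ERR() conversions in its callers preserve the specific errno
(for example -EINVAL from a rejected layout) instead of collapsing every
failure to -ENOMEM. A userspace sketch of the idiom; the real macros live in
include/linux/err.h and encode an errno in the top page of pointer space:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Hypothetical allocator standing in for ceph_osdc_new_request(). */
static void *new_request(int bad_layout)
{
	if (bad_layout)
		return ERR_PTR(-EINVAL);	/* was: silently NULL */
	return ERR_PTR(-ENOMEM);		/* allocation-failure path */
}

int main(void)
{
	void *req = new_request(1);

	if (IS_ERR(req))			/* was: if (!req) */
		printf("error %ld\n", PTR_ERR(req));	/* -22, not -12 */
	return 0;
}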

--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c

@@ -984,7 +984,7 @@ bad:
  * for now, we write only a single su, until we can
  * pass a stride back to the caller.
  */
-void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 				   u64 off, u64 *plen,
 				   u64 *ono,
 				   u64 *oxoff, u64 *oxlen)
@@ -998,11 +998,17 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
 	     osize, su);
+	if (su == 0 || sc == 0)
+		goto invalid;
 	su_per_object = osize / su;
+	if (su_per_object == 0)
+		goto invalid;
 	dout("osize %u / su %u = su_per_object %u\n", osize, su,
 	     su_per_object);
 
-	BUG_ON((su & ~PAGE_MASK) != 0);
+	if ((su & ~PAGE_MASK) != 0)
+		goto invalid;
+
 	/* bl = *off / su; */
 	t = off;
 	do_div(t, su);
@@ -1030,6 +1036,14 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 	*plen = *oxlen;
 
 	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
+	return 0;
+
+invalid:
+	dout(" invalid layout\n");
+	*ono = 0;
+	*oxoff = 0;
+	*oxlen = 0;
+	return -EINVAL;
 }
 EXPORT_SYMBOL(ceph_calc_file_object_mapping);
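
Note: this function maps a file offset to (object number, offset within
object) from the stripe unit (su), stripe count (sc) and object size, with
su_per_object = object_size / su; the new goto invalid paths reject layouts
where any of those divisors would be zero. A userspace sketch of the
arithmetic under those definitions (variable names mirror the kernel code,
but this is an illustration, not the full function, which also clips the
returned length to the stripe-unit boundary):

#include <stdio.h>
#include <stdint.h>

static int calc_file_object_mapping(uint32_t su, uint32_t sc, uint32_t osize,
				    uint64_t off, uint64_t *ono, uint64_t *oxoff)
{
	uint32_t su_per_object, bl, stripeno, stripepos, objsetno;

	if (su == 0 || sc == 0)
		return -1;			/* invalid layout */
	su_per_object = osize / su;
	if (su_per_object == 0)
		return -1;

	bl = off / su;				/* which stripe unit */
	stripeno = bl / sc;			/* which stripe */
	stripepos = bl % sc;			/* position within stripe */
	objsetno = stripeno / su_per_object;	/* which object set */

	*ono = (uint64_t)objsetno * sc + stripepos;
	*oxoff = off % su + (uint64_t)(stripeno % su_per_object) * su;
	return 0;
}

int main(void)
{
	uint64_t ono, oxoff;

	/* default-style layout: 4 MB objects, 4 MB stripe unit, 1 stripe */
	if (calc_file_object_mapping(1 << 22, 1, 1 << 22,
				     10ull << 20, &ono, &oxoff) == 0)
		printf("object %llu offset %llu\n",	/* object 2 offset 2 MB */
		       (unsigned long long)ono, (unsigned long long)oxoff);
	return 0;
}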

--- a/net/ceph/pagelist.c
+++ b/net/ceph/pagelist.c

@@ -1,4 +1,3 @@
-#include <linux/module.h>
 #include <linux/gfp.h>
 #include <linux/pagemap.h>
@@ -134,8 +133,8 @@ int ceph_pagelist_truncate(struct ceph_pagelist *pl,
 	ceph_pagelist_unmap_tail(pl);
 	while (pl->head.prev != c->page_lru) {
 		page = list_entry(pl->head.prev, struct page, lru);
-		list_del(&page->lru);                /* remove from pagelist */
-		list_add_tail(&page->lru, &pl->free_list); /* add to reserve */
+		/* move from pagelist to reserve */
+		list_move_tail(&page->lru, &pl->free_list);
 		++pl->num_pages_free;
 	}
 	pl->room = c->room;
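
Note: list_move_tail() is exactly the list_del() + list_add_tail() pair it
replaces, so this hunk changes no behavior. A self-contained miniature of the
list primitives to show the equivalence:

#include <stdio.h>

struct list_head { struct list_head *prev, *next; };

static void list_del(struct list_head *e)
{
	e->prev->next = e->next;	/* unlink from current list */
	e->next->prev = e->prev;
}

static void list_add_tail(struct list_head *e, struct list_head *h)
{
	e->prev = h->prev;		/* link in just before head h */
	e->next = h;
	h->prev->next = e;
	h->prev = e;
}

static void list_move_tail(struct list_head *e, struct list_head *h)
{
	list_del(e);			/* the two steps it abbreviates */
	list_add_tail(e, h);
}

int main(void)
{
	struct list_head a = { &a, &a }, b = { &b, &b }, e;

	list_add_tail(&e, &a);		/* e starts on list a */
	list_move_tail(&e, &b);		/* e moves to list b */
	printf("%s\n", b.next == &e && a.next == &a ? "moved" : "bug");
	return 0;
}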