From e1518c7c0a67a75727f7285780dbef0ca7121cc9 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 13 May 2010 11:19:06 -0700
Subject: [PATCH 01/59] ceph: clean up mds reply, error handling

We would occasionally BUG out in the reply handler because r_reply was
nonzero, due to a race with ceph_mdsc_do_request temporarily setting
r_reply to an ERR_PTR value.  This is unnecessary, messy, and also wrong
in the EIO case.

Clean up by consistently using r_err for errors and r_reply for messages.
Also fix the abort logic to trigger consistently for all errors that return
to the caller early (e.g., EIO from timeout case).  If an abort races with
a reply, use the result from the reply.

Also fix locking for r_err, r_reply update in the reply handler.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 121 +++++++++++++++++++++----------------------
 fs/ceph/mds_client.h |   2 +-
 2 files changed, 60 insertions(+), 63 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 24561a557e01..b3b19f05b821 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1517,7 +1517,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 	}
 	msg = create_request_message(mdsc, req, mds);
 	if (IS_ERR(msg)) {
-		req->r_reply = ERR_PTR(PTR_ERR(msg));
+		req->r_err = PTR_ERR(msg);
 		complete_request(mdsc, req);
 		return -PTR_ERR(msg);
 	}
@@ -1552,7 +1552,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
 	int mds = -1;
 	int err = -EAGAIN;
 
-	if (req->r_reply)
+	if (req->r_err || req->r_got_result)
 		goto out;
 
 	if (req->r_timeout &&
@@ -1609,7 +1609,7 @@ out:
 	return err;
 
 finish:
-	req->r_reply = ERR_PTR(err);
+	req->r_err = err;
 	complete_request(mdsc, req);
 	goto out;
 }
@@ -1689,59 +1689,53 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 	__register_request(mdsc, req, dir);
 	__do_request(mdsc, req);
 
-	/* wait */
-	if (!req->r_reply) {
-		mutex_unlock(&mdsc->mutex);
-		if (req->r_timeout) {
-			err = (long)wait_for_completion_interruptible_timeout(
-				&req->r_completion, req->r_timeout);
-			if (err == 0)
-				req->r_reply = ERR_PTR(-EIO);
-			else if (err < 0)
-				req->r_reply = ERR_PTR(err);
-		} else {
-                        err = wait_for_completion_interruptible(
-                                &req->r_completion);
-                        if (err)
-                                req->r_reply = ERR_PTR(err);
-		}
-		mutex_lock(&mdsc->mutex);
-	}
-
-	if (IS_ERR(req->r_reply)) {
-		err = PTR_ERR(req->r_reply);
-		req->r_reply = NULL;
-
-		if (err == -ERESTARTSYS) {
-			/* aborted */
-			req->r_aborted = true;
-
-			if (req->r_locked_dir &&
-			    (req->r_op & CEPH_MDS_OP_WRITE)) {
-				struct ceph_inode_info *ci =
-					ceph_inode(req->r_locked_dir);
-
-				dout("aborted, clearing I_COMPLETE on %p\n", 
-				     req->r_locked_dir);
-				spin_lock(&req->r_locked_dir->i_lock);
-				ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
-				ci->i_release_count++;
-				spin_unlock(&req->r_locked_dir->i_lock);
-			}
-		} else {
-			/* clean up this request */
-			__unregister_request(mdsc, req);
-			if (!list_empty(&req->r_unsafe_item))
-				list_del_init(&req->r_unsafe_item);
-			complete(&req->r_safe_completion);
-		}
-	} else if (req->r_err) {
+	if (req->r_err) {
 		err = req->r_err;
-	} else {
-		err = le32_to_cpu(req->r_reply_info.head->result);
+		__unregister_request(mdsc, req);
+		dout("do_request early error %d\n", err);
+		goto out;
 	}
-	mutex_unlock(&mdsc->mutex);
 
+	/* wait */
+	mutex_unlock(&mdsc->mutex);
+	dout("do_request waiting\n");
+	if (req->r_timeout) {
+		err = (long)wait_for_completion_interruptible_timeout(
+			&req->r_completion, req->r_timeout);
+		if (err == 0)
+			err = -EIO;
+	} else {
+		err = wait_for_completion_interruptible(&req->r_completion);
+	}
+	dout("do_request waited, got %d\n", err);
+	mutex_lock(&mdsc->mutex);
+
+	/* only abort if we didn't race with a real reply */
+	if (req->r_got_result) {
+		err = le32_to_cpu(req->r_reply_info.head->result);
+	} else if (err < 0) {
+		dout("aborted request %lld with %d\n", req->r_tid, err);
+		req->r_err = err;
+		req->r_aborted = true;
+
+		if (req->r_locked_dir &&
+		    (req->r_op & CEPH_MDS_OP_WRITE)) {
+			struct ceph_inode_info *ci =
+				ceph_inode(req->r_locked_dir);
+
+			dout("aborted, clearing I_COMPLETE on %p\n",
+			     req->r_locked_dir);
+			spin_lock(&req->r_locked_dir->i_lock);
+			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+			ci->i_release_count++;
+			spin_unlock(&req->r_locked_dir->i_lock);
+		}
+	} else {
+		err = req->r_err;
+	}
+
+out:
+	mutex_unlock(&mdsc->mutex);
 	dout("do_request %p done, result %d\n", req, err);
 	return err;
 }
@@ -1838,11 +1832,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 			mutex_unlock(&mdsc->mutex);
 			goto out;
 		}
-	}
-
-	BUG_ON(req->r_reply);
-
-	if (!head->safe) {
+	} else {
 		req->r_got_unsafe = true;
 		list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
 	}
@@ -1880,12 +1870,19 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 
 	up_read(&mdsc->snap_rwsem);
 out_err:
-	if (err) {
-		req->r_err = err;
+	mutex_lock(&mdsc->mutex);
+	if (!req->r_aborted) {
+		if (err) {
+			req->r_err = err;
+		} else {
+			req->r_reply = msg;
+			ceph_msg_get(msg);
+			req->r_got_result = true;
+		}
 	} else {
-		req->r_reply = msg;
-		ceph_msg_get(msg);
+		dout("reply arrived after request %lld was aborted\n", tid);
 	}
+	mutex_unlock(&mdsc->mutex);
 
 	add_cap_releases(mdsc, req->r_session, -1);
 	mutex_unlock(&session->s_mutex);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 961cc6f65878..0b1dd10be39a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -213,7 +213,7 @@ struct ceph_mds_request {
 	struct completion r_safe_completion;
 	ceph_mds_request_callback_t r_callback;
 	struct list_head  r_unsafe_item;  /* per-session unsafe list item */
-	bool		  r_got_unsafe, r_got_safe;
+	bool		  r_got_unsafe, r_got_safe, r_got_result;
 
 	bool              r_did_prepopulate;
 	u32               r_readdir_offset;

From b4556396fac5b3f063d5b8ac54dc02f7612a75e1 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 13 May 2010 12:01:13 -0700
Subject: [PATCH 02/59] ceph: fix race between aborted requests and fill_trace

When we abort requests we need to prevent fill_trace et al from doing
anything that relies on locks held by the VFS caller.  This fixes a race
between the reply handler and the abort code, ensuring that continue
holding the dir mutex until the reply handler completes.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 11 +++++++++++
 fs/ceph/mds_client.h |  2 ++
 2 files changed, 13 insertions(+)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b3b19f05b821..c0568fe3c0ba 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1181,6 +1181,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
 	if (!req)
 		return ERR_PTR(-ENOMEM);
 
+	mutex_init(&req->r_fill_mutex);
 	req->r_started = jiffies;
 	req->r_resend_mds = -1;
 	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1715,8 +1716,16 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 		err = le32_to_cpu(req->r_reply_info.head->result);
 	} else if (err < 0) {
 		dout("aborted request %lld with %d\n", req->r_tid, err);
+
+		/*
+		 * ensure we aren't running concurrently with
+		 * ceph_fill_trace or ceph_readdir_prepopulate, which
+		 * rely on locks (dir mutex) held by our caller.
+		 */
+		mutex_lock(&req->r_fill_mutex);
 		req->r_err = err;
 		req->r_aborted = true;
+		mutex_unlock(&req->r_fill_mutex);
 
 		if (req->r_locked_dir &&
 		    (req->r_op & CEPH_MDS_OP_WRITE)) {
@@ -1861,12 +1870,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	}
 
 	/* insert trace into our cache */
+	mutex_lock(&req->r_fill_mutex);
 	err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
 	if (err == 0) {
 		if (result == 0 && rinfo->dir_nr)
 			ceph_readdir_prepopulate(req, req->r_session);
 		ceph_unreserve_caps(&req->r_caps_reservation);
 	}
+	mutex_unlock(&req->r_fill_mutex);
 
 	up_read(&mdsc->snap_rwsem);
 out_err:
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 0b1dd10be39a..141a265bda75 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -165,6 +165,8 @@ struct ceph_mds_request {
 	struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
 	struct inode *r_target_inode;       /* resulting inode */
 
+	struct mutex r_fill_mutex;
+
 	union ceph_mds_request_args r_args;
 	int r_fmode;        /* file mode, if expecting cap */
 

From 81a6cf2d30eac5d790f53cdff110892f7b18c7fe Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 14 May 2010 09:35:38 -0700
Subject: [PATCH 03/59] ceph: invalidate affected dentry leases on aborted
 requests

If we abort a request, we return to caller, but the request may still
complete.  And if we hold the dir FILE_EXCL bit, we may not release a
lease when sending a request.  A simple un-tar, control-c, un-tar again
will reproduce the bug (manifested as a 'Cannot open: File exists').

Ensure we invalidate affected dentry leases (as well dir I_COMPLETE) so
we don't have valid (but incorrect) leases.  Do the same, consistently, at
other sites where I_COMPLETE is similarly cleared.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c        | 13 +++++++++++--
 fs/ceph/inode.c      | 13 +++++++++++--
 fs/ceph/mds_client.c |  7 ++++++-
 fs/ceph/super.h      |  1 +
 4 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 650d2db5ed26..4b1a7a4bae0b 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -888,13 +888,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 		/* ensure target dentry is invalidated, despite
 		   rehashing bug in vfs_rename_dir */
-		new_dentry->d_time = jiffies;
-		ceph_dentry(new_dentry)->lease_shared_gen = 0;
+		ceph_invalidate_dentry_lease(new_dentry);
 	}
 	ceph_mdsc_put_request(req);
 	return err;
 }
 
+/*
+ * Ensure a dentry lease will no longer revalidate.
+ */
+void ceph_invalidate_dentry_lease(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	dentry->d_time = jiffies;
+	ceph_dentry(dentry)->lease_shared_gen = 0;
+	spin_unlock(&dentry->d_lock);
+}
 
 /*
  * Check if dentry lease is valid.  If not, delete the lease.  Try to
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 85b4d2ffdeba..220a2aec0545 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -938,8 +938,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 				ceph_inode(req->r_locked_dir);
 			dout(" clearing %p complete (empty trace)\n",
 			     req->r_locked_dir);
+			spin_lock(&req->r_locked_dir->i_lock);
 			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
 			ci->i_release_count++;
+			spin_unlock(&req->r_locked_dir->i_lock);
+
+			if (req->r_dentry)
+				ceph_invalidate_dentry_lease(req->r_dentry);
+			if (req->r_old_dentry)
+				ceph_invalidate_dentry_lease(req->r_old_dentry);
 		}
 		return 0;
 	}
@@ -1011,13 +1018,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			     req->r_old_dentry->d_name.len,
 			     req->r_old_dentry->d_name.name,
 			     dn, dn->d_name.len, dn->d_name.name);
+
 			/* ensure target dentry is invalidated, despite
 			   rehashing bug in vfs_rename_dir */
-			dn->d_time = jiffies;
-			ceph_dentry(dn)->lease_shared_gen = 0;
+			ceph_invalidate_dentry_lease(dn);
+
 			/* take overwritten dentry's readdir offset */
 			ceph_dentry(req->r_old_dentry)->offset =
 				ceph_dentry(dn)->offset;
+
 			dn = req->r_old_dentry;  /* use old_dentry */
 			in = dn->d_inode;
 		}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index c0568fe3c0ba..76995a960432 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1732,12 +1732,17 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 			struct ceph_inode_info *ci =
 				ceph_inode(req->r_locked_dir);
 
-			dout("aborted, clearing I_COMPLETE on %p\n",
+			dout("aborted, clearing I_COMPLETE on %p, leases\n",
 			     req->r_locked_dir);
 			spin_lock(&req->r_locked_dir->i_lock);
 			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
 			ci->i_release_count++;
 			spin_unlock(&req->r_locked_dir->i_lock);
+
+			if (req->r_dentry)
+				ceph_invalidate_dentry_lease(req->r_dentry);
+			if (req->r_old_dentry)
+				ceph_invalidate_dentry_lease(req->r_old_dentry);
 		}
 	} else {
 		err = req->r_err;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 13513b80d87f..cfe3b0834d54 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -871,6 +871,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 extern void ceph_dentry_lru_add(struct dentry *dn);
 extern void ceph_dentry_lru_touch(struct dentry *dn);
 extern void ceph_dentry_lru_del(struct dentry *dn);
+extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
 
 /*
  * our d_ops vary depending on whether the inode is live,

From 21b667f69b023979410188d7d94c9b219f216626 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 4 Mar 2010 10:22:59 -0800
Subject: [PATCH 04/59] ceph: simplify page setup for incoming data

Drop largely useless helper __prepare_pages(), and simplify sanity checks.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 56 ++++++++++----------------------------------
 1 file changed, 12 insertions(+), 44 deletions(-)

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 3514f71ff85f..22a33f8c8807 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -1087,45 +1087,6 @@ bad:
 	return;
 }
 
-
-/*
- * A read request prepares specific pages that data is to be read into.
- * When a message is being read off the wire, we call prepare_pages to
- * find those pages.
- *  0 = success, -1 failure.
- */
-static int __prepare_pages(struct ceph_connection *con,
-			 struct ceph_msg_header *hdr,
-			 struct ceph_osd_request *req,
-			 u64 tid,
-			 struct ceph_msg *m)
-{
-	struct ceph_osd *osd = con->private;
-	struct ceph_osd_client *osdc;
-	int ret = -1;
-	int data_len = le32_to_cpu(hdr->data_len);
-	unsigned data_off = le16_to_cpu(hdr->data_off);
-
-	int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
-
-	if (!osd)
-		return -1;
-
-	osdc = osd->o_osdc;
-
-	dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
-	     tid, req->r_num_pages, want);
-	if (unlikely(req->r_num_pages < want))
-		goto out;
-	m->pages = req->r_pages;
-	m->nr_pages = req->r_num_pages;
-	ret = 0; /* success */
-out:
-	BUG_ON(ret < 0 || m->nr_pages < want);
-
-	return ret;
-}
-
 /*
  * Register request, send initial attempt.
  */
@@ -1394,7 +1355,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 }
 
 /*
- * lookup and return message for incoming reply
+ * lookup and return message for incoming reply.  set up reply message
+ * pages.
  */
 static struct ceph_msg *get_reply(struct ceph_connection *con,
 				  struct ceph_msg_header *hdr,
@@ -1407,7 +1369,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	int front = le32_to_cpu(hdr->front_len);
 	int data_len = le32_to_cpu(hdr->data_len);
 	u64 tid;
-	int err;
 
 	tid = le64_to_cpu(hdr->tid);
 	mutex_lock(&osdc->request_mutex);
@@ -1439,12 +1400,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	m = ceph_msg_get(req->r_reply);
 
 	if (data_len > 0) {
-		err = __prepare_pages(con, hdr, req, tid, m);
-		if (err < 0) {
+		unsigned data_off = le16_to_cpu(hdr->data_off);
+		int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
+
+		if (unlikely(req->r_num_pages < want)) {
+			pr_warning("tid %lld reply %d > expected %d pages\n",
+				   tid, want, m->nr_pages);
 			*skip = 1;
 			ceph_msg_put(m);
-			m = ERR_PTR(err);
+			m = ERR_PTR(-EIO);
+			goto out;
 		}
+		m->pages = req->r_pages;
+		m->nr_pages = req->r_num_pages;
 	}
 	*skip = 0;
 	req->r_con_filling_msg = ceph_con_get(con);

From f553069e5d7c6f53688ae4470173fcb1be97cbe7 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Wed, 17 Mar 2010 08:53:04 -0700
Subject: [PATCH 05/59] ceph: update for removal of kref_set

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/msgpool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index ca3b44a89f2d..030297f62fb7 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -170,7 +170,7 @@ void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
 		msg->front.iov_len = pool->front_len;
 		msg->hdr.front_len = cpu_to_le32(pool->front_len);
 
-		kref_set(&msg->kref, 1);  /* retake a single ref */
+		kref_init(&msg->kref);  /* retake a single ref */
 		list_add(&msg->list_head, &pool->msgs);
 		pool->num++;
 		dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,

From 31459fe4b24c1e09712eff0d82a5276f4fd0e3cf Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Wed, 17 Mar 2010 13:54:02 -0700
Subject: [PATCH 06/59] ceph: use __page_cache_alloc and add_to_page_cache_lru

Following Nick Piggin patches in btrfs, pagecache pages should be
allocated with __page_cache_alloc, so they obey pagecache memory
policies.

Also, using add_to_page_cache_lru instead of using a private
pagevec where applicable.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c      | 9 ++-------
 fs/ceph/file.c      | 2 +-
 fs/ceph/messenger.c | 2 +-
 fs/ceph/pagelist.c  | 2 +-
 fs/ceph/xattr.c     | 2 +-
 5 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a9005d862ed4..caf76c7fa77c 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -274,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 	struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
 	int rc = 0;
 	struct page **pages;
-	struct pagevec pvec;
 	loff_t offset;
 	u64 len;
 
@@ -297,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 	if (rc < 0)
 		goto out;
 
-	/* set uptodate and add to lru in pagevec-sized chunks */
-	pagevec_init(&pvec, 0);
 	for (; !list_empty(page_list) && len > 0;
 	     rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
 		struct page *page =
@@ -312,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 			zero_user_segment(page, s, PAGE_CACHE_SIZE);
 		}
 
-		if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
+		if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
 			page_cache_release(page);
 			dout("readpages %p add_to_page_cache failed %p\n",
 			     inode, page);
@@ -323,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 		flush_dcache_page(page);
 		SetPageUptodate(page);
 		unlock_page(page);
-		if (pagevec_add(&pvec, page) == 0)
-			pagevec_lru_add_file(&pvec);   /* add to lru */
+		page_cache_release(page);
 	}
-	pagevec_lru_add_file(&pvec);
 	rc = 0;
 
 out:
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ed6f19721d6e..6230c3de1f06 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -326,7 +326,7 @@ static struct page **alloc_page_vector(int num_pages)
 	if (!pages)
 		return ERR_PTR(-ENOMEM);
 	for (i = 0; i < num_pages; i++) {
-		pages[i] = alloc_page(GFP_NOFS);
+		pages[i] = __page_cache_alloc(GFP_NOFS);
 		if (pages[i] == NULL) {
 			ceph_release_page_vector(pages, i);
 			return ERR_PTR(-ENOMEM);
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index cd4fadb6491a..af3b143fbf02 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1947,7 +1947,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
 
 	/* the zero page is needed if a request is "canceled" while the message
 	 * is being written over the socket */
-	msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
 	if (!msgr->zero_page) {
 		kfree(msgr);
 		return ERR_PTR(-ENOMEM);
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 5f8dbf7c745a..b6859f47d364 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -20,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl)
 
 static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
 {
-	struct page *page = alloc_page(GFP_NOFS);
+	struct page *page = __page_cache_alloc(GFP_NOFS);
 	if (!page)
 		return -ENOMEM;
 	pl->room += PAGE_SIZE;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2845422907fc..8938934c4c93 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -641,7 +641,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
 			return -ENOMEM;
 		err = -ENOMEM;
 		for (i = 0; i < nr_pages; i++) {
-			pages[i] = alloc_page(GFP_NOFS);
+			pages[i] = __page_cache_alloc(GFP_NOFS);
 			if (!pages[i]) {
 				nr_pages = i;
 				goto out;

From 104648ad3f2ebe8556c020e5f0344853076cd5ee Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 18 Mar 2010 10:14:30 -0700
Subject: [PATCH 07/59] ceph: reduce build_path debug output

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 76995a960432..ccb4141e306f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1268,7 +1268,7 @@ retry:
 		struct inode *inode = temp->d_inode;
 
 		if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
-			dout("build_path_dentry path+%d: %p SNAPDIR\n",
+			dout("build_path path+%d: %p SNAPDIR\n",
 			     pos, temp);
 		} else if (stop_on_nosnap && inode &&
 			   ceph_snap(inode) == CEPH_NOSNAP) {
@@ -1279,20 +1279,18 @@ retry:
 				break;
 			strncpy(path + pos, temp->d_name.name,
 				temp->d_name.len);
-			dout("build_path_dentry path+%d: %p '%.*s'\n",
-			     pos, temp, temp->d_name.len, path + pos);
 		}
 		if (pos)
 			path[--pos] = '/';
 		temp = temp->d_parent;
 		if (temp == NULL) {
-			pr_err("build_path_dentry corrupt dentry\n");
+			pr_err("build_path corrupt dentry\n");
 			kfree(path);
 			return ERR_PTR(-EINVAL);
 		}
 	}
 	if (pos != 0) {
-		pr_err("build_path_dentry did not end path lookup where "
+		pr_err("build_path did not end path lookup where "
 		       "expected, namelen is %d, pos is %d\n", len, pos);
 		/* presumably this is only possible if racing with a
 		   rename of one of the parent directories (we can not
@@ -1304,7 +1302,7 @@ retry:
 
 	*base = ceph_ino(temp->d_inode);
 	*plen = len;
-	dout("build_path_dentry on %p %d built %llx '%.*s'\n",
+	dout("build_path on %p %d built %llx '%.*s'\n",
 	     dentry, atomic_read(&dentry->d_count), *base, len, path);
 	return path;
 }

From c7708075f18086ee7d02df8b891910893e9ea372 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 20 Mar 2010 16:01:27 +0300
Subject: [PATCH 08/59] ceph: cleanup: remove dead code

"xattr" is never NULL here.  We took care of that in the previous
if statement block.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 8938934c4c93..d940d14f6922 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -186,12 +186,6 @@ static int __set_xattr(struct ceph_inode_info *ci,
 		ci->i_xattrs.names_size -= xattr->name_len;
 		ci->i_xattrs.vals_size -= xattr->val_len;
 	}
-	if (!xattr) {
-		pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
-		       &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
-		       xattr->val);
-		return -ENOMEM;
-	}
 	ci->i_xattrs.names_size += name_len;
 	ci->i_xattrs.vals_size += val_len;
 	if (val)

From 6f46cb29350963527b663c9eb4fe964daa9ae707 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 24 Mar 2010 21:30:19 -0700
Subject: [PATCH 09/59] ceph: fix theoretically possible double-put on
 connection

This would only trigger if we bailed out before resetting r_con_filling_msg
because the server reply was corrupt (oversized).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 22a33f8c8807..3d2bfbc232dc 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -1386,6 +1386,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 		     req->r_reply, req->r_con_filling_msg);
 		ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
 		ceph_con_put(req->r_con_filling_msg);
+		req->r_con_filling_msg = NULL;
 	}
 
 	if (front > req->r_reply->front.iov_len) {

From 3143edd3a185f1fd370ebdd21b4151aa9f3283a3 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 24 Mar 2010 21:43:33 -0700
Subject: [PATCH 10/59] ceph: clean up statfs

Avoid unnecessary msgpool.  Preallocate reply.  Fix use-after-free race.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mon_client.c | 151 +++++++++++++++++++++++++++----------------
 fs/ceph/mon_client.h |   5 +-
 2 files changed, 98 insertions(+), 58 deletions(-)

diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 8fdc011ca956..43cfab06fcee 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -393,16 +393,64 @@ static void __insert_statfs(struct ceph_mon_client *monc,
 	rb_insert_color(&new->node, &monc->statfs_request_tree);
 }
 
+static void release_statfs_request(struct kref *kref)
+{
+	struct ceph_mon_statfs_request *req =
+		container_of(kref, struct ceph_mon_statfs_request, kref);
+
+	if (req->reply)
+		ceph_msg_put(req->reply);
+	if (req->request)
+		ceph_msg_put(req->request);
+}
+
+static void put_statfs_request(struct ceph_mon_statfs_request *req)
+{
+	kref_put(&req->kref, release_statfs_request);
+}
+
+static void get_statfs_request(struct ceph_mon_statfs_request *req)
+{
+	kref_get(&req->kref);
+}
+
+static struct ceph_msg *get_statfs_reply(struct ceph_connection *con,
+					 struct ceph_msg_header *hdr,
+					 int *skip)
+{
+	struct ceph_mon_client *monc = con->private;
+	struct ceph_mon_statfs_request *req;
+	u64 tid = le64_to_cpu(hdr->tid);
+	struct ceph_msg *m;
+
+	mutex_lock(&monc->mutex);
+	req = __lookup_statfs(monc, tid);
+	if (!req) {
+		dout("get_statfs_reply %lld dne\n", tid);
+		*skip = 1;
+		m = NULL;
+	} else {
+		dout("get_statfs_reply %lld got %p\n", tid, req->reply);
+		m = ceph_msg_get(req->reply);
+		/*
+		 * we don't need to track the connection reading into
+		 * this reply because we only have one open connection
+		 * at a time, ever.
+		 */
+	}
+	mutex_unlock(&monc->mutex);
+	return m;
+}
+
 static void handle_statfs_reply(struct ceph_mon_client *monc,
 				struct ceph_msg *msg)
 {
 	struct ceph_mon_statfs_request *req;
 	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
-	u64 tid;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
 
 	if (msg->front.iov_len != sizeof(*reply))
 		goto bad;
-	tid = le64_to_cpu(msg->hdr.tid);
 	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
 
 	mutex_lock(&monc->mutex);
@@ -410,10 +458,13 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
 	if (req) {
 		*req->buf = reply->st;
 		req->result = 0;
+		get_statfs_request(req);
 	}
 	mutex_unlock(&monc->mutex);
-	if (req)
+	if (req) {
 		complete(&req->completion);
+		put_statfs_request(req);
+	}
 	return;
 
 bad:
@@ -421,68 +472,64 @@ bad:
 	ceph_msg_dump(msg);
 }
 
-/*
- * (re)send a statfs request
- */
-static int send_statfs(struct ceph_mon_client *monc,
-		       struct ceph_mon_statfs_request *req)
-{
-	struct ceph_msg *msg;
-	struct ceph_mon_statfs *h;
-
-	dout("send_statfs tid %llu\n", req->tid);
-	msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-	req->request = msg;
-	msg->hdr.tid = cpu_to_le64(req->tid);
-	h = msg->front.iov_base;
-	h->monhdr.have_version = 0;
-	h->monhdr.session_mon = cpu_to_le16(-1);
-	h->monhdr.session_mon_tid = 0;
-	h->fsid = monc->monmap->fsid;
-	ceph_con_send(monc->con, msg);
-	return 0;
-}
-
 /*
  * Do a synchronous statfs().
  */
 int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 {
-	struct ceph_mon_statfs_request req;
+	struct ceph_mon_statfs_request *req;
+	struct ceph_mon_statfs *h;
 	int err;
 
-	req.buf = buf;
-	init_completion(&req.completion);
+	req = kmalloc(sizeof(*req), GFP_NOFS);
+	if (!req)
+		return -ENOMEM;
 
-	/* allocate memory for reply */
-	err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
-	if (err)
-		return err;
+	memset(req, 0, sizeof(*req));
+	kref_init(&req->kref);
+	req->buf = buf;
+	init_completion(&req->completion);
+
+	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
+	if (IS_ERR(req->request)) {
+		err = PTR_ERR(req->request);
+		goto out;
+	}
+	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, 0, 0, NULL);
+	if (IS_ERR(req->reply)) {
+		err = PTR_ERR(req->reply);
+		goto out;
+	}
+
+	/* fill out request */
+	h = req->request->front.iov_base;
+	h->monhdr.have_version = 0;
+	h->monhdr.session_mon = cpu_to_le16(-1);
+	h->monhdr.session_mon_tid = 0;
+	h->fsid = monc->monmap->fsid;
 
 	/* register request */
 	mutex_lock(&monc->mutex);
-	req.tid = ++monc->last_tid;
-	req.last_attempt = jiffies;
-	req.delay = BASE_DELAY_INTERVAL;
-	__insert_statfs(monc, &req);
+	req->tid = ++monc->last_tid;
+	req->request->hdr.tid = cpu_to_le64(req->tid);
+	__insert_statfs(monc, req);
 	monc->num_statfs_requests++;
 	mutex_unlock(&monc->mutex);
 
 	/* send request and wait */
-	err = send_statfs(monc, &req);
-	if (!err)
-		err = wait_for_completion_interruptible(&req.completion);
+	ceph_con_send(monc->con, ceph_msg_get(req->request));
+	err = wait_for_completion_interruptible(&req->completion);
 
 	mutex_lock(&monc->mutex);
-	rb_erase(&req.node, &monc->statfs_request_tree);
+	rb_erase(&req->node, &monc->statfs_request_tree);
 	monc->num_statfs_requests--;
-	ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
 	mutex_unlock(&monc->mutex);
 
 	if (!err)
-		err = req.result;
+		err = req->result;
+
+out:
+	kref_put(&req->kref, release_statfs_request);
 	return err;
 }
 
@@ -496,7 +543,7 @@ static void __resend_statfs(struct ceph_mon_client *monc)
 
 	for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
 		req = rb_entry(p, struct ceph_mon_statfs_request, node);
-		send_statfs(monc, req);
+		ceph_con_send(monc->con, ceph_msg_get(req->request));
 	}
 }
 
@@ -591,13 +638,9 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 			       sizeof(struct ceph_mon_subscribe_ack), 1, false);
 	if (err < 0)
 		goto out_monmap;
-	err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
-				sizeof(struct ceph_mon_statfs_reply), 0, false);
-	if (err < 0)
-		goto out_pool1;
 	err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
 	if (err < 0)
-		goto out_pool2;
+		goto out_pool;
 
 	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
 	monc->pending_auth = 0;
@@ -624,10 +667,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 
 out_pool3:
 	ceph_msgpool_destroy(&monc->msgpool_auth_reply);
-out_pool2:
+out_pool:
 	ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
-out_pool1:
-	ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
 out_monmap:
 	kfree(monc->monmap);
 out:
@@ -652,7 +693,6 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 
 	ceph_msg_put(monc->m_auth);
 	ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
-	ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
 	ceph_msgpool_destroy(&monc->msgpool_auth_reply);
 
 	kfree(monc->monmap);
@@ -773,8 +813,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 		m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
 		break;
 	case CEPH_MSG_STATFS_REPLY:
-		m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
-		break;
+		return get_statfs_reply(con, hdr, skip);
 	case CEPH_MSG_AUTH_REPLY:
 		m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
 		break;
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index b958ad5afa06..cc89a86c8589 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -2,6 +2,7 @@
 #define _FS_CEPH_MON_CLIENT_H
 
 #include <linux/completion.h>
+#include <linux/kref.h>
 #include <linux/rbtree.h>
 
 #include "messenger.h"
@@ -44,13 +45,14 @@ struct ceph_mon_request {
  * to the caller
  */
 struct ceph_mon_statfs_request {
+	struct kref kref;
 	u64 tid;
 	struct rb_node node;
 	int result;
 	struct ceph_statfs *buf;
 	struct completion completion;
-	unsigned long last_attempt, delay; /* jiffies */
 	struct ceph_msg *request;  /* original request */
+	struct ceph_msg *reply;    /* and reply */
 };
 
 struct ceph_mon_client {
@@ -72,7 +74,6 @@ struct ceph_mon_client {
 
 	/* msg pools */
 	struct ceph_msgpool msgpool_subscribe_ack;
-	struct ceph_msgpool msgpool_statfs_reply;
 	struct ceph_msgpool msgpool_auth_reply;
 
 	/* pending statfs requests */

From 6694d6b95cf3b41751e78815d05968fa2084d7bf Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 24 Mar 2010 21:48:05 -0700
Subject: [PATCH 11/59] ceph: drop unnecessary msgpool for mon_client
 auth_reply

Preallocate a single reply message that we can reuse instead.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mon_client.c | 19 ++++++++++++-------
 fs/ceph/mon_client.h |  5 ++---
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 43cfab06fcee..5a67732fa6e1 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -638,16 +638,21 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 			       sizeof(struct ceph_mon_subscribe_ack), 1, false);
 	if (err < 0)
 		goto out_monmap;
-	err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
-	if (err < 0)
+
+	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, 0, 0,
+					  NULL);
+	if (IS_ERR(monc->m_auth_reply)) {
+		err = PTR_ERR(monc->m_auth_reply);
+		monc->m_auth_reply = NULL;
 		goto out_pool;
+	}
 
 	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
 	monc->pending_auth = 0;
 	if (IS_ERR(monc->m_auth)) {
 		err = PTR_ERR(monc->m_auth);
 		monc->m_auth = NULL;
-		goto out_pool3;
+		goto out_auth_reply;
 	}
 
 	monc->cur_mon = -1;
@@ -665,8 +670,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	monc->want_next_osdmap = 1;
 	return 0;
 
-out_pool3:
-	ceph_msgpool_destroy(&monc->msgpool_auth_reply);
+out_auth_reply:
+	ceph_msg_put(monc->m_auth_reply);
 out_pool:
 	ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
 out_monmap:
@@ -692,8 +697,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 	ceph_auth_destroy(monc->auth);
 
 	ceph_msg_put(monc->m_auth);
+	ceph_msg_put(monc->m_auth_reply);
 	ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
-	ceph_msgpool_destroy(&monc->msgpool_auth_reply);
 
 	kfree(monc->monmap);
 }
@@ -815,7 +820,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 	case CEPH_MSG_STATFS_REPLY:
 		return get_statfs_reply(con, hdr, skip);
 	case CEPH_MSG_AUTH_REPLY:
-		m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
+		m = ceph_msg_get(monc->m_auth_reply);
 		break;
 	case CEPH_MSG_MON_MAP:
 	case CEPH_MSG_MDS_MAP:
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index cc89a86c8589..2658e3e3b27c 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -63,7 +63,7 @@ struct ceph_mon_client {
 	struct delayed_work delayed_work;
 
 	struct ceph_auth_client *auth;
-	struct ceph_msg *m_auth;
+	struct ceph_msg *m_auth, *m_auth_reply;
 	int pending_auth;
 
 	bool hunting;
@@ -72,9 +72,8 @@ struct ceph_mon_client {
 	struct ceph_connection *con;
 	bool have_fsid;
 
-	/* msg pools */
+	/* msgs */
 	struct ceph_msgpool msgpool_subscribe_ack;
-	struct ceph_msgpool msgpool_auth_reply;
 
 	/* pending statfs requests */
 	struct rb_root statfs_request_tree;

From 7c315c552c7442eab73461de61dbcce579a31d3a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 24 Mar 2010 21:52:30 -0700
Subject: [PATCH 12/59] ceph: drop unnecessary msgpool for mon_client
 subscribe_ack

Preallocate a single message to reuse instead.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mon_client.c | 20 ++++++++++++--------
 fs/ceph/mon_client.h |  6 +-----
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 5a67732fa6e1..5bee9250bf2a 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -634,17 +634,21 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
 
 	/* msg pools */
-	err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
-			       sizeof(struct ceph_mon_subscribe_ack), 1, false);
-	if (err < 0)
+	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
+				     sizeof(struct ceph_mon_subscribe_ack),
+				     0, 0, NULL);
+	if (IS_ERR(monc->m_subscribe_ack)) {
+		err = PTR_ERR(monc->m_subscribe_ack);
+		monc->m_subscribe_ack = NULL;
 		goto out_monmap;
+	}
 
 	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, 0, 0,
 					  NULL);
 	if (IS_ERR(monc->m_auth_reply)) {
 		err = PTR_ERR(monc->m_auth_reply);
 		monc->m_auth_reply = NULL;
-		goto out_pool;
+		goto out_subscribe_ack;
 	}
 
 	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
@@ -672,8 +676,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 
 out_auth_reply:
 	ceph_msg_put(monc->m_auth_reply);
-out_pool:
-	ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
+out_subscribe_ack:
+	ceph_msg_put(monc->m_subscribe_ack);
 out_monmap:
 	kfree(monc->monmap);
 out:
@@ -698,7 +702,7 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 
 	ceph_msg_put(monc->m_auth);
 	ceph_msg_put(monc->m_auth_reply);
-	ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
+	ceph_msg_put(monc->m_subscribe_ack);
 
 	kfree(monc->monmap);
 }
@@ -815,7 +819,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 
 	switch (type) {
 	case CEPH_MSG_MON_SUBSCRIBE_ACK:
-		m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
+		m = ceph_msg_get(monc->m_subscribe_ack);
 		break;
 	case CEPH_MSG_STATFS_REPLY:
 		return get_statfs_reply(con, hdr, skip);
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index 2658e3e3b27c..0046deed0740 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -6,7 +6,6 @@
 #include <linux/rbtree.h>
 
 #include "messenger.h"
-#include "msgpool.h"
 
 struct ceph_client;
 struct ceph_mount_args;
@@ -63,7 +62,7 @@ struct ceph_mon_client {
 	struct delayed_work delayed_work;
 
 	struct ceph_auth_client *auth;
-	struct ceph_msg *m_auth, *m_auth_reply;
+	struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe_ack;
 	int pending_auth;
 
 	bool hunting;
@@ -72,9 +71,6 @@ struct ceph_mon_client {
 	struct ceph_connection *con;
 	bool have_fsid;
 
-	/* msgs */
-	struct ceph_msgpool msgpool_subscribe_ack;
-
 	/* pending statfs requests */
 	struct rb_root statfs_request_tree;
 	int num_statfs_requests;

From 2d06eeb877581a7f53209af1582c5f66c799f0bd Mon Sep 17 00:00:00 2001
From: Cheng Renquan <crquan@gmail.com>
Date: Fri, 26 Mar 2010 18:04:40 +0800
Subject: [PATCH 13/59] ceph: handle kzalloc() failure

Signed-off-by: Cheng Renquan <crquan@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index ccb4141e306f..d549ab3adfda 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2631,6 +2631,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 	mdsc->client = client;
 	mutex_init(&mdsc->mutex);
 	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
+	if (mdsc->mdsmap == NULL)
+		return -ENOMEM;
+
 	init_completion(&mdsc->safe_umount_waiters);
 	init_completion(&mdsc->session_close_waiters);
 	INIT_LIST_HEAD(&mdsc->waiting_for_map);
@@ -2656,6 +2659,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 	init_waitqueue_head(&mdsc->cap_flushing_wq);
 	spin_lock_init(&mdsc->dentry_lru_lock);
 	INIT_LIST_HEAD(&mdsc->dentry_lru);
+
 	return 0;
 }
 

From 640ef79d27c81b7a3265a344ec1d25644dd463ad Mon Sep 17 00:00:00 2001
From: Cheng Renquan <crquan@gmail.com>
Date: Fri, 26 Mar 2010 17:40:33 +0800
Subject: [PATCH 14/59] ceph: use ceph_sb_to_client instead of ceph_client

ceph_sb_to_client and ceph_client are really identical, we need to dump
one; while function ceph_client is confusing with "struct ceph_client",
ceph_sb_to_client's definition is more clear; so we'd better switch all
call to ceph_sb_to_client.

  -static inline struct ceph_client *ceph_client(struct super_block *sb)
  -{
  -	return sb->s_fs_info;
  -}

Signed-off-by: Cheng Renquan <crquan@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c   |  2 +-
 fs/ceph/caps.c   | 15 +++++++++------
 fs/ceph/dir.c    | 10 +++++-----
 fs/ceph/export.c |  2 +-
 fs/ceph/file.c   |  2 +-
 fs/ceph/inode.c  | 10 +++++-----
 fs/ceph/ioctl.c  |  2 +-
 fs/ceph/snap.c   |  2 +-
 fs/ceph/super.c  |  8 ++++----
 fs/ceph/super.h  |  6 ------
 fs/ceph/xattr.c  |  4 ++--
 11 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index caf76c7fa77c..d9c60b84949a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -563,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req,
 	ceph_release_pages(req->r_pages, req->r_num_pages);
 	if (req->r_pages_from_pool)
 		mempool_free(req->r_pages,
-			     ceph_client(inode->i_sb)->wb_pagevec_pool);
+			     ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
 	else
 		kfree(req->r_pages);
 	ceph_osdc_put_request(req);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d9400534b279..51fd39da1470 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -867,7 +867,8 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 {
 	struct ceph_mds_session *session = cap->session;
 	struct ceph_inode_info *ci = cap->ci;
-	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+	struct ceph_mds_client *mdsc =
+		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 	int removed = 0;
 
 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -1298,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
  */
 void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 {
-	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+	struct ceph_mds_client *mdsc =
+		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	int was = ci->i_dirty_caps;
 	int dirty = 0;
@@ -1336,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 static int __mark_caps_flushing(struct inode *inode,
 				 struct ceph_mds_session *session)
 {
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int flushing;
 
@@ -1663,7 +1665,7 @@ ack:
 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
 			  unsigned *flush_tid)
 {
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int unlock_session = session ? 0 : 1;
 	int flushing = 0;
@@ -1829,7 +1831,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
 			err = wait_event_interruptible(ci->i_cap_wq,
 				       caps_are_flushed(inode, flush_tid));
 	} else {
-		struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+		struct ceph_mds_client *mdsc =
+			&ceph_sb_to_client(inode->i_sb)->mdsc;
 
 		spin_lock(&inode->i_lock);
 		if (__ceph_caps_dirty(ci))
@@ -2411,7 +2414,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 	__releases(inode->i_lock)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
 	unsigned seq = le32_to_cpu(m->seq);
 	int dirty = le32_to_cpu(m->dirty);
 	int cleaned = 0;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 4b1a7a4bae0b..33feac75c532 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -478,7 +478,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 				  struct dentry *dentry, int err)
 {
-	struct ceph_client *client = ceph_client(dentry->d_sb);
+	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
 	struct inode *parent = dentry->d_parent->d_inode;
 
 	/* .snap dir? */
@@ -1059,7 +1059,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int left;
 
-	if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
+	if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
 		return -EISDIR;
 
 	if (!cf->dir_info) {
@@ -1161,7 +1161,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
 	dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_client(dn->d_sb)->mdsc;
+		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_add_tail(&di->lru, &mdsc->dentry_lru);
 		mdsc->num_dentry++;
@@ -1177,7 +1177,7 @@ void ceph_dentry_lru_touch(struct dentry *dn)
 	dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_client(dn->d_sb)->mdsc;
+		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_move_tail(&di->lru, &mdsc->dentry_lru);
 		spin_unlock(&mdsc->dentry_lru_lock);
@@ -1192,7 +1192,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
 	dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_client(dn->d_sb)->mdsc;
+		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_del_init(&di->lru);
 		mdsc->num_dentry--;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9d67572fb328..15683905a7b6 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -115,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 static struct dentry *__cfh_to_dentry(struct super_block *sb,
 				      struct ceph_nfs_confh *cfh)
 {
-	struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
 	struct inode *inode;
 	struct dentry *dentry;
 	struct ceph_vino vino;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 6230c3de1f06..834d2b83834a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -809,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+	struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
 	loff_t endoff = pos + iov->iov_len;
 	int got = 0;
 	int ret, err;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 220a2aec0545..ef917232cf37 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode)
 	 */
 	if (ci->i_snap_realm) {
 		struct ceph_mds_client *mdsc =
-			&ceph_client(ci->vfs_inode.i_sb)->mdsc;
+			&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 		struct ceph_snap_realm *realm = ci->i_snap_realm;
 
 		dout(" dropping residual ref to snap realm %p\n", realm);
@@ -623,7 +623,7 @@ static int fill_inode(struct inode *inode,
 
 	inode->i_mapping->a_ops = &ceph_aops;
 	inode->i_mapping->backing_dev_info =
-		&ceph_client(inode->i_sb)->backing_dev_info;
+		&ceph_sb_to_client(inode->i_sb)->backing_dev_info;
 
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFIFO:
@@ -681,7 +681,7 @@ static int fill_inode(struct inode *inode,
 		}
 
 		/* it may be better to set st_size in getattr instead? */
-		if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
+		if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
 			inode->i_size = ci->i_rbytes;
 		break;
 	default:
@@ -1438,7 +1438,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 
-	if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
+	if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
 		       &ci->i_vmtruncate_work)) {
 		dout("ceph_queue_vmtruncate %p\n", inode);
 		igrab(inode);
@@ -1527,7 +1527,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *parent_inode = dentry->d_parent->d_inode;
 	const unsigned int ia_valid = attr->ia_valid;
 	struct ceph_mds_request *req;
-	struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
 	int issued;
 	int release = 0, dirtied = 0;
 	int mask = 0;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8a5bcae62846..d085f07756b4 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	struct ceph_ioctl_dataloc dl;
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+	struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
 	u64 len = 1, olen;
 	u64 tmp;
 	struct ceph_object_layout ol;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index d5114db70453..c0b26b6badba 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 			    struct ceph_cap_snap *capsnap)
 {
 	struct inode *inode = &ci->vfs_inode;
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
 
 	BUG_ON(capsnap->writing);
 	capsnap->size = inode->i_size;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 110857ba9269..93ad169a6ae1 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -107,8 +107,8 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int ceph_syncfs(struct super_block *sb, int wait)
 {
 	dout("sync_fs %d\n", wait);
-	ceph_osdc_sync(&ceph_client(sb)->osdc);
-	ceph_mdsc_sync(&ceph_client(sb)->mdsc);
+	ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
+	ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
 	dout("sync_fs %d done\n", wait);
 	return 0;
 }
@@ -932,9 +932,9 @@ static int ceph_get_sb(struct file_system_type *fs_type,
 		goto out;
 	}
 
-	if (ceph_client(sb) != client) {
+	if (ceph_sb_to_client(sb) != client) {
 		ceph_destroy_client(client);
-		client = ceph_client(sb);
+		client = ceph_sb_to_client(sb);
 		dout("get_sb got existing client %p\n", client);
 	} else {
 		dout("get_sb using new client %p\n", client);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cfe3b0834d54..51b3ff238454 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -160,12 +160,6 @@ struct ceph_client {
 #endif
 };
 
-static inline struct ceph_client *ceph_client(struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-
 /*
  * File i/o capability.  This tracks shared state with the metadata
  * server that allows us to cache or writeback attributes or to read
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index d940d14f6922..3b4c2620030a 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -616,7 +616,7 @@ out:
 static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
 			      const char *value, size_t size, int flags)
 {
-	struct ceph_client *client = ceph_client(dentry->d_sb);
+	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct inode *parent_inode = dentry->d_parent->d_inode;
@@ -773,7 +773,7 @@ out:
 
 static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 {
-	struct ceph_client *client = ceph_client(dentry->d_sb);
+	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
 	struct ceph_mds_client *mdsc = &client->mdsc;
 	struct inode *inode = dentry->d_inode;
 	struct inode *parent_inode = dentry->d_parent->d_inode;

From d52f847a841bfeba0ea87a7842732d388a1ca2e8 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 1 Apr 2010 15:23:14 -0700
Subject: [PATCH 15/59] ceph: rewrite msgpool using mempool_t

Since we don't need to maintain large pools of messages, we can just
use the standard mempool_t.  We maintain a msgpool 'wrapper' because we
need the mempool_t* in the alloc function, and mempool gives us only
pool_data.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/msgpool.c | 174 ++++++++--------------------------------------
 fs/ceph/msgpool.h |   8 +--
 2 files changed, 30 insertions(+), 152 deletions(-)

diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index 030297f62fb7..ca032223e87b 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -7,106 +7,43 @@
 
 #include "msgpool.h"
 
-/*
- * We use msg pools to preallocate memory for messages we expect to
- * receive over the wire, to avoid getting ourselves into OOM
- * conditions at unexpected times.  We take use a few different
- * strategies:
- *
- *  - for request/response type interactions, we preallocate the
- * memory needed for the response when we generate the request.
- *
- *  - for messages we can receive at any time from the MDS, we preallocate
- * a pool of messages we can re-use.
- *
- *  - for writeback, we preallocate some number of messages to use for
- * requests and their replies, so that we always make forward
- * progress.
- *
- * The msgpool behaves like a mempool_t, but keeps preallocated
- * ceph_msgs strung together on a list_head instead of using a pointer
- * vector.  This avoids vector reallocation when we adjust the number
- * of preallocated items (which happens frequently).
- */
-
-
-/*
- * Allocate or release as necessary to meet our target pool size.
- */
-static int __fill_msgpool(struct ceph_msgpool *pool)
+static void *alloc_fn(gfp_t gfp_mask, void *arg)
 {
-	struct ceph_msg *msg;
+	struct ceph_msgpool *pool = arg;
+	struct ceph_msg *m;
 
-	while (pool->num < pool->min) {
-		dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
-		     pool->min);
-		spin_unlock(&pool->lock);
-		msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
-		spin_lock(&pool->lock);
-		if (IS_ERR(msg))
-			return PTR_ERR(msg);
-		msg->pool = pool;
-		list_add(&msg->list_head, &pool->msgs);
-		pool->num++;
-	}
-	while (pool->num > pool->min) {
-		msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
-		dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
-		     pool->min, msg);
-		list_del_init(&msg->list_head);
-		pool->num--;
-		ceph_msg_kfree(msg);
-	}
-	return 0;
+	m = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
+	if (IS_ERR(m))
+		return NULL;
+	return m;
+}
+
+static void free_fn(void *element, void *arg)
+{
+	ceph_msg_put(element);
 }
 
 int ceph_msgpool_init(struct ceph_msgpool *pool,
-		      int front_len, int min, bool blocking)
+		      int front_len, int size, bool blocking)
 {
-	int ret;
-
-	dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
-	spin_lock_init(&pool->lock);
 	pool->front_len = front_len;
-	INIT_LIST_HEAD(&pool->msgs);
-	pool->num = 0;
-	pool->min = min;
-	pool->blocking = blocking;
-	init_waitqueue_head(&pool->wait);
-
-	spin_lock(&pool->lock);
-	ret = __fill_msgpool(pool);
-	spin_unlock(&pool->lock);
-	return ret;
+	pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
+	if (!pool->pool)
+		return -ENOMEM;
+	return 0;
 }
 
 void ceph_msgpool_destroy(struct ceph_msgpool *pool)
 {
-	dout("msgpool_destroy %p\n", pool);
-	spin_lock(&pool->lock);
-	pool->min = 0;
-	__fill_msgpool(pool);
-	spin_unlock(&pool->lock);
+	mempool_destroy(pool->pool);
 }
 
-int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
+				  int front_len)
 {
-	int ret;
+	if (front_len > pool->front_len) {
+		struct ceph_msg *msg;
 
-	spin_lock(&pool->lock);
-	dout("msgpool_resv %p delta %d\n", pool, delta);
-	pool->min += delta;
-	ret = __fill_msgpool(pool);
-	spin_unlock(&pool->lock);
-	return ret;
-}
-
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
-{
-	wait_queue_t wait;
-	struct ceph_msg *msg;
-
-	if (front_len && front_len > pool->front_len) {
 		pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
 		       pool, front_len, pool->front_len);
 		WARN_ON(1);
@@ -115,72 +52,17 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
 		msg = ceph_msg_new(0, front_len, 0, 0, NULL);
 		if (!IS_ERR(msg))
 			return msg;
+		return NULL;
 	}
 
-	if (!front_len)
-		front_len = pool->front_len;
-
-	if (pool->blocking) {
-		/* mempool_t behavior; first try to alloc */
-		msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-		if (!IS_ERR(msg))
-			return msg;
-	}
-
-	while (1) {
-		spin_lock(&pool->lock);
-		if (likely(pool->num)) {
-			msg = list_entry(pool->msgs.next, struct ceph_msg,
-					 list_head);
-			list_del_init(&msg->list_head);
-			pool->num--;
-			dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
-			     pool->num, pool->min);
-			spin_unlock(&pool->lock);
-			return msg;
-		}
-		pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
-		       pool->min, pool->blocking ? "waiting" : "may fail");
-		spin_unlock(&pool->lock);
-
-		if (!pool->blocking) {
-			WARN_ON(1);
-
-			/* maybe we can allocate it now? */
-			msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-			if (!IS_ERR(msg))
-				return msg;
-
-			pr_err("msgpool_get %p empty + alloc failed\n", pool);
-			return ERR_PTR(-ENOMEM);
-		}
-
-		init_wait(&wait);
-		prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
-		schedule();
-		finish_wait(&pool->wait, &wait);
-	}
+	return mempool_alloc(pool->pool, GFP_NOFS);
 }
 
 void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
 {
-	spin_lock(&pool->lock);
-	if (pool->num < pool->min) {
-		/* reset msg front_len; user may have changed it */
-		msg->front.iov_len = pool->front_len;
-		msg->hdr.front_len = cpu_to_le32(pool->front_len);
+	/* reset msg front_len; user may have changed it */
+	msg->front.iov_len = pool->front_len;
+	msg->hdr.front_len = cpu_to_le32(pool->front_len);
 
-		kref_init(&msg->kref);  /* retake a single ref */
-		list_add(&msg->list_head, &pool->msgs);
-		pool->num++;
-		dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
-		     pool->num, pool->min);
-		spin_unlock(&pool->lock);
-		wake_up(&pool->wait);
-	} else {
-		dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
-		     pool->num, pool->min);
-		spin_unlock(&pool->lock);
-		ceph_msg_kfree(msg);
-	}
+	kref_init(&msg->kref);  /* retake single ref */
 }
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
index bc834bfcd720..62a61c7fca08 100644
--- a/fs/ceph/msgpool.h
+++ b/fs/ceph/msgpool.h
@@ -1,6 +1,7 @@
 #ifndef _FS_CEPH_MSGPOOL
 #define _FS_CEPH_MSGPOOL
 
+#include <linux/mempool.h>
 #include "messenger.h"
 
 /*
@@ -8,18 +9,13 @@
  * avoid unexpected OOM conditions.
  */
 struct ceph_msgpool {
-	spinlock_t lock;
+	mempool_t *pool;
 	int front_len;          /* preallocated payload size */
-	struct list_head msgs;  /* msgs in the pool; each has 1 ref */
-	int num, min;           /* cur, min # msgs in the pool */
-	bool blocking;
-	wait_queue_head_t wait;
 };
 
 extern int ceph_msgpool_init(struct ceph_msgpool *pool,
 			     int front_len, int size, bool blocking);
 extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
-extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
 extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
 					 int front_len);
 extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);

From a79832f26be370ee26ea81eecdfd42d10e49d66a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 1 Apr 2010 16:06:19 -0700
Subject: [PATCH 16/59] ceph: make ceph_msg_new return NULL on failure; clean
 up, fix callers

Returning ERR_PTR(-ENOMEM) is useless extra work.  Return NULL on failure
instead, and fix up the callers (about half of which were wrong anyway).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c       |  4 ++--
 fs/ceph/file.c       |  4 ++--
 fs/ceph/mds_client.c | 40 +++++++++++++++++-----------------------
 fs/ceph/messenger.c  | 20 +++++++-------------
 fs/ceph/mon_client.c | 25 +++++++------------------
 fs/ceph/msgpool.c    | 13 ++-----------
 fs/ceph/osd_client.c | 22 +++++++++++-----------
 7 files changed, 48 insertions(+), 80 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 51fd39da1470..8755e2d83d4c 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -939,8 +939,8 @@ static int send_cap_msg(struct ceph_mds_session *session,
 	     xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
 
 	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
+	if (!msg)
+		return -ENOMEM;
 
 	msg->hdr.tid = cpu_to_le64(flush_tid);
 
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 834d2b83834a..b0426090e8c3 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -649,8 +649,8 @@ more:
 				    do_sync,
 				    ci->i_truncate_seq, ci->i_truncate_size,
 				    &mtime, false, 2);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	if (!req)
+		return -ENOMEM;
 
 	num_pages = calc_pages_for(pos, len);
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d549ab3adfda..7e89c185d38d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -666,9 +666,9 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
 	struct ceph_mds_session_head *h;
 
 	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
-	if (IS_ERR(msg)) {
+	if (!msg) {
 		pr_err("create_session_msg ENOMEM creating msg\n");
-		return ERR_PTR(PTR_ERR(msg));
+		return NULL;
 	}
 	h = msg->front.iov_base;
 	h->op = cpu_to_le32(op);
@@ -687,7 +687,6 @@ static int __open_session(struct ceph_mds_client *mdsc,
 	struct ceph_msg *msg;
 	int mstate;
 	int mds = session->s_mds;
-	int err = 0;
 
 	/* wait for mds to go active? */
 	mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
@@ -698,13 +697,9 @@ static int __open_session(struct ceph_mds_client *mdsc,
 
 	/* send connect message */
 	msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
-	if (IS_ERR(msg)) {
-		err = PTR_ERR(msg);
-		goto out;
-	}
+	if (!msg)
+		return -ENOMEM;
 	ceph_con_send(&session->s_con, msg);
-
-out:
 	return 0;
 }
 
@@ -883,8 +878,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
 		ceph_mds_state_name(state));
 	msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
 				 ++session->s_renew_seq);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
+	if (!msg)
+		return -ENOMEM;
 	ceph_con_send(&session->s_con, msg);
 	return 0;
 }
@@ -931,17 +926,15 @@ static int request_close_session(struct ceph_mds_client *mdsc,
 				 struct ceph_mds_session *session)
 {
 	struct ceph_msg *msg;
-	int err = 0;
 
 	dout("request_close_session mds%d state %s seq %lld\n",
 	     session->s_mds, session_state_name(session->s_state),
 	     session->s_seq);
 	msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
-	if (IS_ERR(msg))
-		err = PTR_ERR(msg);
-	else
-		ceph_con_send(&session->s_con, msg);
-	return err;
+	if (!msg)
+		return -ENOMEM;
+	ceph_con_send(&session->s_con, msg);
+	return 0;
 }
 
 /*
@@ -1426,8 +1419,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 		len += req->r_old_dentry->d_name.len;
 
 	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
-	if (IS_ERR(msg))
+	if (!msg) {
+		msg = ERR_PTR(-ENOMEM);
 		goto out_free2;
+	}
 
 	msg->hdr.tid = cpu_to_le64(req->r_tid);
 
@@ -1518,7 +1513,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 	if (IS_ERR(msg)) {
 		req->r_err = PTR_ERR(msg);
 		complete_request(mdsc, req);
-		return -PTR_ERR(msg);
+		return PTR_ERR(msg);
 	}
 	req->r_request = msg;
 
@@ -2158,11 +2153,10 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 		goto fail_nopagelist;
 	ceph_pagelist_init(pagelist);
 
+	err = -ENOMEM;
 	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
-	if (IS_ERR(reply)) {
-		err = PTR_ERR(reply);
+	if (!reply)
 		goto fail_nomsg;
-	}
 
 	/* find session */
 	session = __ceph_lookup_mds_session(mdsc, mds);
@@ -2469,7 +2463,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
 	len += dnamelen;
 
 	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
-	if (IS_ERR(msg))
+	if (!msg)
 		return;
 	lease = msg->front.iov_base;
 	lease->action = action;
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index af3b143fbf02..fe7d0c52ae3c 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1402,19 +1402,17 @@ static int read_partial_message(struct ceph_connection *con)
 		con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
 		if (skip) {
 			/* skip this message */
-			dout("alloc_msg returned NULL, skipping message\n");
+			dout("alloc_msg said skip message\n");
 			con->in_base_pos = -front_len - middle_len - data_len -
 				sizeof(m->footer);
 			con->in_tag = CEPH_MSGR_TAG_READY;
 			con->in_seq++;
 			return 0;
 		}
-		if (IS_ERR(con->in_msg)) {
-			ret = PTR_ERR(con->in_msg);
-			con->in_msg = NULL;
+		if (!con->in_msg) {
 			con->error_msg =
 				"error allocating memory for incoming message";
-			return ret;
+			return -ENOMEM;
 		}
 		m = con->in_msg;
 		m->front.iov_len = 0;    /* haven't read it yet */
@@ -2147,7 +2145,7 @@ out2:
 	ceph_msg_put(m);
 out:
 	pr_err("msg_new can't create type %d len %d\n", type, front_len);
-	return ERR_PTR(-ENOMEM);
+	return NULL;
 }
 
 /*
@@ -2190,10 +2188,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
 		mutex_unlock(&con->mutex);
 		msg = con->ops->alloc_msg(con, hdr, skip);
 		mutex_lock(&con->mutex);
-		if (IS_ERR(msg))
-			return msg;
-
-		if (*skip)
+		if (!msg || *skip)
 			return NULL;
 	}
 	if (!msg) {
@@ -2202,17 +2197,16 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
 		if (!msg) {
 			pr_err("unable to allocate msg type %d len %d\n",
 			       type, front_len);
-			return ERR_PTR(-ENOMEM);
+			return NULL;
 		}
 	}
 	memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
 
 	if (middle_len) {
 		ret = ceph_alloc_middle(con, msg);
-
 		if (ret < 0) {
 			ceph_msg_put(msg);
-			return msg;
+			return NULL;
 		}
 	}
 
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 5bee9250bf2a..35f593e7e364 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -490,16 +490,13 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 	req->buf = buf;
 	init_completion(&req->completion);
 
+	err = -ENOMEM;
 	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
-	if (IS_ERR(req->request)) {
-		err = PTR_ERR(req->request);
+	if (!req->request)
 		goto out;
-	}
 	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, 0, 0, NULL);
-	if (IS_ERR(req->reply)) {
-		err = PTR_ERR(req->reply);
+	if (!req->reply)
 		goto out;
-	}
 
 	/* fill out request */
 	h = req->request->front.iov_base;
@@ -634,30 +631,22 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
 
 	/* msg pools */
+	err = -ENOMEM;
 	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
 				     sizeof(struct ceph_mon_subscribe_ack),
 				     0, 0, NULL);
-	if (IS_ERR(monc->m_subscribe_ack)) {
-		err = PTR_ERR(monc->m_subscribe_ack);
-		monc->m_subscribe_ack = NULL;
+	if (!monc->m_subscribe_ack)
 		goto out_monmap;
-	}
 
 	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, 0, 0,
 					  NULL);
-	if (IS_ERR(monc->m_auth_reply)) {
-		err = PTR_ERR(monc->m_auth_reply);
-		monc->m_auth_reply = NULL;
+	if (!monc->m_auth_reply)
 		goto out_subscribe_ack;
-	}
 
 	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
 	monc->pending_auth = 0;
-	if (IS_ERR(monc->m_auth)) {
-		err = PTR_ERR(monc->m_auth);
-		monc->m_auth = NULL;
+	if (!monc->m_auth)
 		goto out_auth_reply;
-	}
 
 	monc->cur_mon = -1;
 	monc->hunting = true;
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index ca032223e87b..04fea84890da 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -10,12 +10,8 @@
 static void *alloc_fn(gfp_t gfp_mask, void *arg)
 {
 	struct ceph_msgpool *pool = arg;
-	struct ceph_msg *m;
 
-	m = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
-	if (IS_ERR(m))
-		return NULL;
-	return m;
+	return ceph_msg_new(0, pool->front_len, 0, 0, NULL);
 }
 
 static void free_fn(void *element, void *arg)
@@ -42,17 +38,12 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
 				  int front_len)
 {
 	if (front_len > pool->front_len) {
-		struct ceph_msg *msg;
-
 		pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
 		       pool, front_len, pool->front_len);
 		WARN_ON(1);
 
 		/* try to alloc a fresh message */
-		msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-		if (!IS_ERR(msg))
-			return msg;
-		return NULL;
+		return ceph_msg_new(0, front_len, 0, 0, NULL);
 	}
 
 	return mempool_alloc(pool->pool, GFP_NOFS);
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 3d2bfbc232dc..a51d0df2af30 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 		req = kzalloc(sizeof(*req), GFP_NOFS);
 	}
 	if (req == NULL)
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 
 	req->r_osdc = osdc;
 	req->r_mempool = use_mempool;
@@ -165,9 +165,9 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	else
 		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
 				   OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
-	if (IS_ERR(msg)) {
+	if (!msg) {
 		ceph_osdc_put_request(req);
-		return ERR_PTR(PTR_ERR(msg));
+		return NULL;
 	}
 	req->r_reply = msg;
 
@@ -179,9 +179,9 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
 	else
 		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
-	if (IS_ERR(msg)) {
+	if (!msg) {
 		ceph_osdc_put_request(req);
-		return ERR_PTR(PTR_ERR(msg));
+		return NULL;
 	}
 	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
 	memset(msg->front.iov_base, 0, msg->front.iov_len);
@@ -1263,8 +1263,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
 				    NULL, 0, truncate_seq, truncate_size, NULL,
 				    false, 1);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	if (!req)
+		return -ENOMEM;
 
 	/* it may be a short read due to an object boundary */
 	req->r_pages = pages;
@@ -1306,8 +1306,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 				    snapc, do_sync,
 				    truncate_seq, truncate_size, mtime,
 				    nofail, 1);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	if (!req)
+		return -ENOMEM;
 
 	/* it may be a short write due to an object boundary */
 	req->r_pages = pages;
@@ -1393,7 +1393,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 		pr_warning("get_reply front %d > preallocated %d\n",
 			   front, (int)req->r_reply->front.iov_len);
 		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
-		if (IS_ERR(m))
+		if (!m)
 			goto out;
 		ceph_msg_put(req->r_reply);
 		req->r_reply = m;
@@ -1409,7 +1409,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 				   tid, want, m->nr_pages);
 			*skip = 1;
 			ceph_msg_put(m);
-			m = ERR_PTR(-EIO);
+			m = NULL;
 			goto out;
 		}
 		m->pages = req->r_pages;

From bb257664f748bcfc80715f85f70f0f560caec3b4 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 1 Apr 2010 16:07:23 -0700
Subject: [PATCH 17/59] ceph: simplify ceph_msg_new

We only need to pass in front_len.  Callers can attach any other payload
pieces (middle, data) as they see fit.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c       |  2 +-
 fs/ceph/mds_client.c | 11 +++++------
 fs/ceph/messenger.c  | 20 +++++++++-----------
 fs/ceph/messenger.h  |  4 +---
 fs/ceph/mon_client.c | 16 +++++++---------
 fs/ceph/msgpool.c    |  4 ++--
 fs/ceph/osd_client.c |  8 ++++----
 7 files changed, 29 insertions(+), 36 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 8755e2d83d4c..74c74842c860 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -938,7 +938,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
 	     seq, issue_seq, mseq, follows, size, max_size,
 	     xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc));
 	if (!msg)
 		return -ENOMEM;
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 7e89c185d38d..35dbdad07b1c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -665,7 +665,7 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
 	struct ceph_msg *msg;
 	struct ceph_mds_session_head *h;
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h));
 	if (!msg) {
 		pr_err("create_session_msg ENOMEM creating msg\n");
 		return NULL;
@@ -1051,8 +1051,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
 
 	while (session->s_num_cap_releases < session->s_nr_caps + extra) {
 		spin_unlock(&session->s_cap_lock);
-		msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
-				   0, 0, NULL);
+		msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE);
 		if (!msg)
 			goto out_unlocked;
 		dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1418,7 +1417,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 	if (req->r_old_dentry_drop)
 		len += req->r_old_dentry->d_name.len;
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len);
 	if (!msg) {
 		msg = ERR_PTR(-ENOMEM);
 		goto out_free2;
@@ -2154,7 +2153,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 	ceph_pagelist_init(pagelist);
 
 	err = -ENOMEM;
-	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
+	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0);
 	if (!reply)
 		goto fail_nomsg;
 
@@ -2462,7 +2461,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
 	dnamelen = dentry->d_name.len;
 	len += dnamelen;
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len);
 	if (!msg)
 		return;
 	lease = msg->front.iov_base;
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index fe7d0c52ae3c..395ce326beda 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -2081,8 +2081,7 @@ void ceph_con_keepalive(struct ceph_connection *con)
  * construct a new message with given type, size
  * the new msg has a ref count of 1.
  */
-struct ceph_msg *ceph_msg_new(int type, int front_len,
-			      int page_len, int page_off, struct page **pages)
+struct ceph_msg *ceph_msg_new(int type, int front_len)
 {
 	struct ceph_msg *m;
 
@@ -2098,8 +2097,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
 	m->hdr.version = 0;
 	m->hdr.front_len = cpu_to_le32(front_len);
 	m->hdr.middle_len = 0;
-	m->hdr.data_len = cpu_to_le32(page_len);
-	m->hdr.data_off = cpu_to_le16(page_off);
+	m->hdr.data_len = 0;
+	m->hdr.data_off = 0;
 	m->hdr.reserved = 0;
 	m->footer.front_crc = 0;
 	m->footer.middle_crc = 0;
@@ -2133,18 +2132,17 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
 	m->middle = NULL;
 
 	/* data */
-	m->nr_pages = calc_pages_for(page_off, page_len);
-	m->pages = pages;
+	m->nr_pages = 0;
+	m->pages = NULL;
 	m->pagelist = NULL;
 
-	dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
-	     m->nr_pages);
+	dout("ceph_msg_new %p front %d\n", m, front_len);
 	return m;
 
 out2:
 	ceph_msg_put(m);
 out:
-	pr_err("msg_new can't create type %d len %d\n", type, front_len);
+	pr_err("msg_new can't create type %d front %d\n", type, front_len);
 	return NULL;
 }
 
@@ -2193,7 +2191,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
 	}
 	if (!msg) {
 		*skip = 0;
-		msg = ceph_msg_new(type, front_len, 0, 0, NULL);
+		msg = ceph_msg_new(type, front_len);
 		if (!msg) {
 			pr_err("unable to allocate msg type %d len %d\n",
 			       type, front_len);
@@ -2202,7 +2200,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
 	}
 	memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
 
-	if (middle_len) {
+	if (middle_len && !msg->middle) {
 		ret = ceph_alloc_middle(con, msg);
 		if (ret < 0) {
 			ceph_msg_put(msg);
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index a5caf91cc971..27fb69585f63 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -234,9 +234,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
 extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
 extern void ceph_con_put(struct ceph_connection *con);
 
-extern struct ceph_msg *ceph_msg_new(int type, int front_len,
-				     int page_len, int page_off,
-				     struct page **pages);
+extern struct ceph_msg *ceph_msg_new(int type, int front_len);
 extern void ceph_msg_kfree(struct ceph_msg *m);
 
 
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 35f593e7e364..9900706fee75 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -191,7 +191,7 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 		struct ceph_mon_subscribe_item *i;
 		void *p, *end;
 
-		msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
+		msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96);
 		if (!msg)
 			return;
 
@@ -491,10 +491,10 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 	init_completion(&req->completion);
 
 	err = -ENOMEM;
-	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
+	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h));
 	if (!req->request)
 		goto out;
-	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, 0, 0, NULL);
+	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024);
 	if (!req->reply)
 		goto out;
 
@@ -633,17 +633,15 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	/* msg pools */
 	err = -ENOMEM;
 	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
-				     sizeof(struct ceph_mon_subscribe_ack),
-				     0, 0, NULL);
+				     sizeof(struct ceph_mon_subscribe_ack));
 	if (!monc->m_subscribe_ack)
 		goto out_monmap;
 
-	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, 0, 0,
-					  NULL);
+	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096);
 	if (!monc->m_auth_reply)
 		goto out_subscribe_ack;
 
-	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
+	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096);
 	monc->pending_auth = 0;
 	if (!monc->m_auth)
 		goto out_auth_reply;
@@ -818,7 +816,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 	case CEPH_MSG_MON_MAP:
 	case CEPH_MSG_MDS_MAP:
 	case CEPH_MSG_OSD_MAP:
-		m = ceph_msg_new(type, front_len, 0, 0, NULL);
+		m = ceph_msg_new(type, front_len);
 		break;
 	}
 
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index 04fea84890da..10d3632cc403 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -11,7 +11,7 @@ static void *alloc_fn(gfp_t gfp_mask, void *arg)
 {
 	struct ceph_msgpool *pool = arg;
 
-	return ceph_msg_new(0, pool->front_len, 0, 0, NULL);
+	return ceph_msg_new(0, pool->front_len);
 }
 
 static void free_fn(void *element, void *arg)
@@ -43,7 +43,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
 		WARN_ON(1);
 
 		/* try to alloc a fresh message */
-		return ceph_msg_new(0, front_len, 0, 0, NULL);
+		return ceph_msg_new(0, front_len);
 	}
 
 	return mempool_alloc(pool->pool, GFP_NOFS);
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index a51d0df2af30..a44b3b6374ee 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -164,7 +164,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
 	else
 		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
-				   OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
+				   OSD_OPREPLY_FRONT_LEN);
 	if (!msg) {
 		ceph_osdc_put_request(req);
 		return NULL;
@@ -178,7 +178,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	if (use_mempool)
 		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
 	else
-		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
+		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size);
 	if (!msg) {
 		ceph_osdc_put_request(req);
 		return NULL;
@@ -1392,7 +1392,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	if (front > req->r_reply->front.iov_len) {
 		pr_warning("get_reply front %d > preallocated %d\n",
 			   front, (int)req->r_reply->front.iov_len);
-		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
+		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front);
 		if (!m)
 			goto out;
 		ceph_msg_put(req->r_reply);
@@ -1435,7 +1435,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 
 	switch (type) {
 	case CEPH_MSG_OSD_MAP:
-		return ceph_msg_new(type, front, 0, 0, NULL);
+		return ceph_msg_new(type, front);
 	case CEPH_MSG_OSD_OPREPLY:
 		return get_reply(con, hdr, skip);
 	default:

From 6f2bc3ff4cdb03903c79e155e9e1889ce176de09 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 2 Apr 2010 16:16:34 -0700
Subject: [PATCH 18/59] ceph: clean up connection reset

Reset out_keepalive_pending and peer_global_seq, and drop unused var.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 2 ++
 fs/ceph/messenger.h | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 395ce326beda..8cfca375c6a9 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -340,6 +340,7 @@ static void reset_connection(struct ceph_connection *con)
 		ceph_msg_put(con->out_msg);
 		con->out_msg = NULL;
 	}
+	con->out_keepalive_pending = false;
 	con->in_seq = 0;
 	con->in_seq_acked = 0;
 }
@@ -357,6 +358,7 @@ void ceph_con_close(struct ceph_connection *con)
 	clear_bit(WRITE_PENDING, &con->state);
 	mutex_lock(&con->mutex);
 	reset_connection(con);
+	con->peer_global_seq = 0;
 	cancel_delayed_work(&con->work);
 	mutex_unlock(&con->mutex);
 	queue_con(con);
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 27fb69585f63..e56564f7e71c 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -158,7 +158,6 @@ struct ceph_connection {
 	struct list_head out_queue;
 	struct list_head out_sent;   /* sending or sent but unacked */
 	u64 out_seq;		     /* last message queued for send */
-	u64 out_seq_sent;            /* last message sent */
 	bool out_keepalive_pending;
 
 	u64 in_seq, in_seq_acked;  /* last message received, acked */

From 6822d00b5462e7a9dfa11dcc60cc25823a2107c5 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 7 Apr 2010 11:23:20 -0700
Subject: [PATCH 19/59] ceph: wait for both monmap and osdmap when opening
 session

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 fs/ceph/super.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 93ad169a6ae1..a8124e89dea1 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -682,9 +682,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
 /*
  * true if we have the mon map (and have thus joined the cluster)
  */
-static int have_mon_map(struct ceph_client *client)
+static int have_mon_and_osd_map(struct ceph_client *client)
 {
-	return client->monc.monmap && client->monc.monmap->epoch;
+	return client->monc.monmap && client->monc.monmap->epoch &&
+	       client->osdc.osdmap && client->osdc.osdmap->epoch;
 }
 
 /*
@@ -762,7 +763,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
 	if (err < 0)
 		goto out;
 
-	while (!have_mon_map(client)) {
+	while (!have_mon_and_osd_map(client)) {
 		err = -EIO;
 		if (timeout && time_after_eq(jiffies, started + timeout))
 			goto out;
@@ -770,8 +771,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
 		/* wait */
 		dout("mount waiting for mon_map\n");
 		err = wait_event_interruptible_timeout(client->auth_wq,
-			       have_mon_map(client) || (client->auth_err < 0),
-			       timeout);
+		       have_mon_and_osd_map(client) || (client->auth_err < 0),
+		       timeout);
 		if (err == -EINTR || err == -ERESTARTSYS)
 			goto out;
 		if (client->auth_err < 0) {

From 1bb71637d07d58e993ef3f8e2c6b7ca6f4c0e0b8 Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Thu, 8 Apr 2010 19:48:57 +0800
Subject: [PATCH 20/59] ceph: remove unused #includes

Remove unused #include's in
  fs/ceph/super.c

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a8124e89dea1..b3225bf63f58 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -8,14 +8,11 @@
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/parser.h>
-#include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/statfs.h>
 #include <linux/string.h>
-#include <linux/version.h>
-#include <linux/vmalloc.h>
 
 #include "decode.h"
 #include "super.h"

From c473ad927e6b3be0bac51ddf312e5d8d2b9220b0 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Tue, 13 Apr 2010 19:34:26 +0100
Subject: [PATCH 21/59] ceph: wake up mount thread when getting osdmap

Now that the mount thread waits for the osdmap, it needs
to be awaken.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 fs/ceph/osd_client.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index a44b3b6374ee..b81e6f97010e 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -1078,6 +1078,7 @@ done:
 	if (newmap)
 		kick_requests(osdc, NULL);
 	up_read(&osdc->map_sem);
+	wake_up(&osdc->client->auth_wq);
 	return;
 
 bad:

From 0d509c949a4d3f21943bd93863a2e6c2f0d0c004 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Wed, 21 Apr 2010 12:31:13 +0200
Subject: [PATCH 22/59] ceph: d_obtain_alias() returns ERR_PTR()

d_obtain_alias() doesn't return NULL, it returns an ERR_PTR().

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/export.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 15683905a7b6..17447644d675 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -93,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 		return ERR_PTR(-ESTALE);
 
 	dentry = d_obtain_alias(inode);
-	if (!dentry) {
+	if (IS_ERR(dentry)) {
 		pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
 		       fh->ino, inode);
 		iput(inode);
-		return ERR_PTR(-ENOMEM);
+		return dentry;
 	}
 	err = ceph_init_dentry(dentry);
 
@@ -149,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
 	}
 
 	dentry = d_obtain_alias(inode);
-	if (!dentry) {
+	if (IS_ERR(dentry)) {
 		pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
 		       cfh->ino, inode);
 		iput(inode);
-		return ERR_PTR(-ENOMEM);
+		return dentry;
 	}
 	err = ceph_init_dentry(dentry);
 	if (err < 0) {
@@ -202,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
 		return ERR_PTR(-ESTALE);
 
 	dentry = d_obtain_alias(inode);
-	if (!dentry) {
+	if (IS_ERR(dentry)) {
 		pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
 		       cfh->ino, inode);
 		iput(inode);
-		return ERR_PTR(-ENOMEM);
+		return dentry;
 	}
 	err = ceph_init_dentry(dentry);
 	if (err < 0) {

From f26e681d52fcc4778d8c604ef047487ca9f3df95 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 21 Apr 2010 11:09:38 -0700
Subject: [PATCH 23/59] ceph: osdtimeout=0 for now timeout

Allow the osd reset timeout to be disabled.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index b81e6f97010e..97a3a57fe8cd 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -715,7 +715,7 @@ static void handle_timeout(struct work_struct *work)
 	 * should mark the osd as failed and we should find out about
 	 * it from an updated osd map.
 	 */
-	while (!list_empty(&osdc->req_lru)) {
+	while (timeout && !list_empty(&osdc->req_lru)) {
 		req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
 				 r_req_lru_item);
 

From 559c1e0073ae779d60e1c673cda837f3e4295302 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 14 May 2010 09:55:18 -0700
Subject: [PATCH 24/59] ceph: include auth method in error messages

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/auth.c      | 9 +++++----
 fs/ceph/auth.h      | 2 ++
 fs/ceph/auth_none.c | 1 +
 fs/ceph/auth_x.c    | 1 +
 4 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 818afe72e6c7..9f46de2ba7a7 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -150,7 +150,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac,
 
 	ret = ac->ops->build_request(ac, p + sizeof(u32), end);
 	if (ret < 0) {
-		pr_err("error %d building request\n", ret);
+		pr_err("error %d building auth method %s request\n", ret,
+		       ac->ops->name);
 		return ret;
 	}
 	dout(" built request %d bytes\n", ret);
@@ -216,8 +217,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 		if (ac->protocol != protocol) {
 			ret = ceph_auth_init_protocol(ac, protocol);
 			if (ret) {
-				pr_err("error %d on auth protocol %d init\n",
-				       ret, protocol);
+				pr_err("error %d on auth method %s init\n",
+				       ret, ac->ops->name);
 				goto out;
 			}
 		}
@@ -229,7 +230,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 	if (ret == -EAGAIN) {
 		return ceph_build_auth_request(ac, reply_buf, reply_len);
 	} else if (ret) {
-		pr_err("authentication error %d\n", ret);
+		pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
 		return ret;
 	}
 	return 0;
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index ca4f57cfb267..4429a707c021 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -15,6 +15,8 @@ struct ceph_auth_client;
 struct ceph_authorizer;
 
 struct ceph_auth_client_ops {
+	const char *name;
+
 	/*
 	 * true if we are authenticated and can connect to
 	 * services.
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index 8cd9e3af07f7..24407c119291 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -94,6 +94,7 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
 }
 
 static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+	.name = "none",
 	.reset = reset,
 	.destroy = destroy,
 	.is_authenticated = is_authenticated,
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index fee5a08da881..18c1a1de5547 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -618,6 +618,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
 
 
 static const struct ceph_auth_client_ops ceph_x_ops = {
+	.name = "x",
 	.is_authenticated = ceph_x_is_authenticated,
 	.build_request = ceph_x_build_request,
 	.handle_reply = ceph_x_handle_reply,

From 8c6efb58a5bab880d45b2078cb55ec4320707daf Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 23 Apr 2010 11:36:54 -0700
Subject: [PATCH 25/59] ceph: fix memory leak due to possible dentry init race

Free dentry_info in error path.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 33feac75c532..422794fd10e0 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -51,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry)
 		return -ENOMEM;          /* oh well */
 
 	spin_lock(&dentry->d_lock);
-	if (dentry->d_fsdata) /* lost a race */
+	if (dentry->d_fsdata) {
+		/* lost a race */
+		kmem_cache_free(ceph_dentry_cachep, di);
 		goto out_unlock;
+	}
 	di->dentry = dentry;
 	di->lease_session = NULL;
 	dentry->d_fsdata = di;

From 4f48280ee1d0654390cd50ad0c41ea93309e7c91 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sat, 24 Apr 2010 09:56:35 -0700
Subject: [PATCH 26/59] ceph: name msgpools; useful error messages

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/msgpool.c    | 13 +++++++++----
 fs/ceph/msgpool.h    |  4 +++-
 fs/ceph/osd_client.c |  6 ++++--
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index 10d3632cc403..50fe5cc5de76 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -10,8 +10,12 @@
 static void *alloc_fn(gfp_t gfp_mask, void *arg)
 {
 	struct ceph_msgpool *pool = arg;
+	void *p;
 
-	return ceph_msg_new(0, pool->front_len);
+	p = ceph_msg_new(0, pool->front_len);
+	if (!p)
+		pr_err("msgpool %s alloc failed\n", pool->name);
+	return p;
 }
 
 static void free_fn(void *element, void *arg)
@@ -20,12 +24,13 @@ static void free_fn(void *element, void *arg)
 }
 
 int ceph_msgpool_init(struct ceph_msgpool *pool,
-		      int front_len, int size, bool blocking)
+		      int front_len, int size, bool blocking, const char *name)
 {
 	pool->front_len = front_len;
 	pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
 	if (!pool->pool)
 		return -ENOMEM;
+	pool->name = name;
 	return 0;
 }
 
@@ -38,8 +43,8 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
 				  int front_len)
 {
 	if (front_len > pool->front_len) {
-		pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
-		       pool, front_len, pool->front_len);
+		pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
+		       pool->name, front_len, pool->front_len);
 		WARN_ON(1);
 
 		/* try to alloc a fresh message */
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
index 62a61c7fca08..a362605f9368 100644
--- a/fs/ceph/msgpool.h
+++ b/fs/ceph/msgpool.h
@@ -9,12 +9,14 @@
  * avoid unexpected OOM conditions.
  */
 struct ceph_msgpool {
+	const char *name;
 	mempool_t *pool;
 	int front_len;          /* preallocated payload size */
 };
 
 extern int ceph_msgpool_init(struct ceph_msgpool *pool,
-			     int front_len, int size, bool blocking);
+			     int front_len, int size, bool blocking,
+			     const char *name);
 extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
 extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
 					 int front_len);
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 97a3a57fe8cd..c0aca732efd3 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -1214,11 +1214,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	if (!osdc->req_mempool)
 		goto out;
 
-	err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
+	err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
+				"osd_op");
 	if (err < 0)
 		goto out_mempool;
 	err = ceph_msgpool_init(&osdc->msgpool_op_reply,
-				OSD_OPREPLY_FRONT_LEN, 10, true);
+				OSD_OPREPLY_FRONT_LEN, 10, true,
+				"osd_op_reply");
 	if (err < 0)
 		goto out_msgpool;
 	return 0;

From 9dd4658db1be5ca92c2ed2fd7a100d973125d9c5 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 28 Apr 2010 13:51:50 -0700
Subject: [PATCH 27/59] ceph: close messenger race

Simplify messenger locking, and close race between ceph_con_close() setting
the CLOSED bit and con_work() checking the bit, then taking the mutex.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 8cfca375c6a9..bb16edb1aef6 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1546,7 +1546,6 @@ static int try_write(struct ceph_connection *con)
 	dout("try_write start %p state %lu nref %d\n", con, con->state,
 	     atomic_read(&con->nref));
 
-	mutex_lock(&con->mutex);
 more:
 	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
 
@@ -1639,7 +1638,6 @@ do_next:
 done:
 	ret = 0;
 out:
-	mutex_unlock(&con->mutex);
 	dout("try_write done on %p\n", con);
 	return ret;
 }
@@ -1651,7 +1649,6 @@ out:
  */
 static int try_read(struct ceph_connection *con)
 {
-	struct ceph_messenger *msgr;
 	int ret = -1;
 
 	if (!con->sock)
@@ -1661,9 +1658,6 @@ static int try_read(struct ceph_connection *con)
 		return 0;
 
 	dout("try_read start on %p\n", con);
-	msgr = con->msgr;
-
-	mutex_lock(&con->mutex);
 
 more:
 	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
@@ -1758,7 +1752,6 @@ more:
 done:
 	ret = 0;
 out:
-	mutex_unlock(&con->mutex);
 	dout("try_read done on %p\n", con);
 	return ret;
 
@@ -1830,6 +1823,8 @@ more:
 	dout("con_work %p start, clearing QUEUED\n", con);
 	clear_bit(QUEUED, &con->state);
 
+	mutex_lock(&con->mutex);
+
 	if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
 		dout("con_work CLOSED\n");
 		con_close_socket(con);
@@ -1844,11 +1839,16 @@ more:
 	if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
 	    try_read(con) < 0 ||
 	    try_write(con) < 0) {
+		mutex_unlock(&con->mutex);
 		backoff = 1;
 		ceph_fault(con);     /* error/fault path */
+		goto done_unlocked;
 	}
 
 done:
+	mutex_unlock(&con->mutex);
+
+done_unlocked:
 	clear_bit(BUSY, &con->state);
 	dout("con->state=%lu\n", con->state);
 	if (test_bit(QUEUED, &con->state)) {

From a6424e48c8d54a5795430b07c4487f1ed280df4e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 29 Apr 2010 09:28:11 -0700
Subject: [PATCH 28/59] ceph: fix xattr dangling pointer / double free

If we use the xattr_blob, clear the pointer so we don't release the memory
at the bottom of the fuction.

Reported-by: Henry C Chang <henry_c_chang@tcloudcomputing.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ef917232cf37..913cafd70cd0 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -619,6 +619,7 @@ static int fill_inode(struct inode *inode,
 			memcpy(ci->i_xattrs.blob->vec.iov_base,
 			       iinfo->xattr_data, iinfo->xattr_len);
 		ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
+		xattr_blob = NULL;
 	}
 
 	inode->i_mapping->a_ops = &ceph_aops;

From bddfa3cc18fcd9c9313a1030b19d3b0ea2639310 Mon Sep 17 00:00:00 2001
From: Henry C Chang <henry_c_chang@tcloudcomputing.com>
Date: Thu, 29 Apr 2010 09:32:28 -0700
Subject: [PATCH 29/59] ceph: listxattr should compare version by >=

If the version hasn't changed, don't rebuild the index.

Signed-off-by: Henry C Chang <henry_c_chang@tcloudcomputing.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 3b4c2620030a..7185d0794df3 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -568,7 +568,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
 	     ci->i_xattrs.version, ci->i_xattrs.index_version);
 
 	if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
-	    (ci->i_xattrs.index_version > ci->i_xattrs.version)) {
+	    (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
 		goto list_xattr;
 	} else {
 		spin_unlock(&inode->i_lock);

From f1f2765faedc24f8f2e9fd68521a5ea469801b60 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 3 May 2010 21:50:39 -0700
Subject: [PATCH 30/59] ceph: set next_offset on readdir finish

Set next_offset to 2 (always 2!), not 0, on readdir finish.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 422794fd10e0..0b0a39d05ca6 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -338,7 +338,7 @@ more:
 		if (req->r_reply_info.dir_end) {
 			kfree(fi->last_name);
 			fi->last_name = NULL;
-			fi->next_offset = 0;
+			fi->next_offset = 2;
 		} else {
 			rinfo = &req->r_reply_info;
 			err = note_last_dentry(fi,

From e8a7498715181ece36130335536e13733a5c3187 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 15 Apr 2010 14:08:49 -0700
Subject: [PATCH 31/59] ceph: skip set_dentry_offset work if directory not
 I_COMPLETE

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 913cafd70cd0..49a0935c4390 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -861,6 +861,10 @@ static void ceph_set_dentry_offset(struct dentry *dn)
 	di = ceph_dentry(dn);
 
 	spin_lock(&inode->i_lock);
+	if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
+		spin_unlock(&inode->i_lock);
+		return;
+	}
 	di->offset = ceph_inode(inode)->i_max_offset++;
 	spin_unlock(&inode->i_lock);
 

From 1b7facc41b42c2ab904b2f88b64b1f8ca0ca6cb7 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 16 Apr 2010 12:58:02 -0700
Subject: [PATCH 32/59] ceph: don't clobber i_max_offset on already complete
 dir

This can screw up offsets assigned to new dentries and break dcache
readdir results.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 49a0935c4390..aa22a0bce52f 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -675,7 +675,8 @@ static int fill_inode(struct inode *inode,
 		/* set dir completion flag? */
 		if (ci->i_files == 0 && ci->i_subdirs == 0 &&
 		    ceph_snap(inode) == CEPH_NOSNAP &&
-		    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
+		    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+		    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
 			dout(" marking %p complete (empty)\n", inode);
 			ci->i_ceph_flags |= CEPH_I_COMPLETE;
 			ci->i_max_offset = 2;

From 1cd3935bedccf592d44343890251452a6dd74fc4 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 3 May 2010 22:08:02 -0700
Subject: [PATCH 33/59] ceph: set dn offset when spliced

We want to assign an offset when the dentry goes from null to linked, which
is always done by splice_dentry().  Notably, we should NOT assign an
offset when a dentry is first created and is still null.

BUG if we try to splice a non-null dentry (we shouldn't).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c   | 13 +++----
 fs/ceph/inode.c | 95 +++++++++++++++++++++++++------------------------
 2 files changed, 56 insertions(+), 52 deletions(-)

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0b0a39d05ca6..d3bb8132a1aa 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -128,7 +128,8 @@ more:
 	dentry = list_entry(p, struct dentry, d_u.d_child);
 	di = ceph_dentry(dentry);
 	while (1) {
-		dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
+		dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
+		     d_unhashed(dentry) ? "!hashed" : "hashed",
 		     parent->d_subdirs.prev, parent->d_subdirs.next);
 		if (p == &parent->d_subdirs) {
 			fi->at_end = 1;
@@ -571,7 +572,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 		    !is_root_ceph_dentry(dir, dentry) &&
 		    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
 		    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
-			di->offset = ci->i_max_offset++;
 			spin_unlock(&dir->i_lock);
 			dout(" dir %p complete, -ENOENT\n", dir);
 			d_add(dentry, NULL);
@@ -984,8 +984,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
 
-	dout("d_revalidate %p '%.*s' inode %p\n", dentry,
-	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+	dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
+	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
+	     ceph_dentry(dentry)->offset);
 
 	/* always trust cached snapped dentries, snapdir dentry */
 	if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1177,8 +1178,8 @@ void ceph_dentry_lru_touch(struct dentry *dn)
 	struct ceph_dentry_info *di = ceph_dentry(dn);
 	struct ceph_mds_client *mdsc;
 
-	dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
-	     dn->d_name.len, dn->d_name.name);
+	dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
+	     dn->d_name.len, dn->d_name.name, di->offset);
 	if (di) {
 		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index aa22a0bce52f..1bcf98bd0255 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -803,50 +803,6 @@ out_unlock:
 	return;
 }
 
-/*
- * splice a dentry to an inode.
- * caller must hold directory i_mutex for this to be safe.
- *
- * we will only rehash the resulting dentry if @prehash is
- * true; @prehash will be set to false (for the benefit of
- * the caller) if we fail.
- */
-static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
-				    bool *prehash)
-{
-	struct dentry *realdn;
-
-	/* dn must be unhashed */
-	if (!d_unhashed(dn))
-		d_drop(dn);
-	realdn = d_materialise_unique(dn, in);
-	if (IS_ERR(realdn)) {
-		pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
-		       dn, in, ceph_vinop(in));
-		if (prehash)
-			*prehash = false; /* don't rehash on error */
-		dn = realdn; /* note realdn contains the error */
-		goto out;
-	} else if (realdn) {
-		dout("dn %p (%d) spliced with %p (%d) "
-		     "inode %p ino %llx.%llx\n",
-		     dn, atomic_read(&dn->d_count),
-		     realdn, atomic_read(&realdn->d_count),
-		     realdn->d_inode, ceph_vinop(realdn->d_inode));
-		dput(dn);
-		dn = realdn;
-	} else {
-		BUG_ON(!ceph_dentry(dn));
-
-		dout("dn %p attached to %p ino %llx.%llx\n",
-		     dn, dn->d_inode, ceph_vinop(dn->d_inode));
-	}
-	if ((!prehash || *prehash) && d_unhashed(dn))
-		d_rehash(dn);
-out:
-	return dn;
-}
-
 /*
  * Set dentry's directory position based on the current dir's max, and
  * order it in d_subdirs, so that dcache_readdir behaves.
@@ -878,6 +834,52 @@ static void ceph_set_dentry_offset(struct dentry *dn)
 	spin_unlock(&dcache_lock);
 }
 
+/*
+ * splice a dentry to an inode.
+ * caller must hold directory i_mutex for this to be safe.
+ *
+ * we will only rehash the resulting dentry if @prehash is
+ * true; @prehash will be set to false (for the benefit of
+ * the caller) if we fail.
+ */
+static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
+				    bool *prehash)
+{
+	struct dentry *realdn;
+
+	BUG_ON(dn->d_inode);
+
+	/* dn must be unhashed */
+	if (!d_unhashed(dn))
+		d_drop(dn);
+	realdn = d_materialise_unique(dn, in);
+	if (IS_ERR(realdn)) {
+		pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
+		       dn, in, ceph_vinop(in));
+		if (prehash)
+			*prehash = false; /* don't rehash on error */
+		dn = realdn; /* note realdn contains the error */
+		goto out;
+	} else if (realdn) {
+		dout("dn %p (%d) spliced with %p (%d) "
+		     "inode %p ino %llx.%llx\n",
+		     dn, atomic_read(&dn->d_count),
+		     realdn, atomic_read(&realdn->d_count),
+		     realdn->d_inode, ceph_vinop(realdn->d_inode));
+		dput(dn);
+		dn = realdn;
+	} else {
+		BUG_ON(!ceph_dentry(dn));
+		dout("dn %p attached to %p ino %llx.%llx\n",
+		     dn, dn->d_inode, ceph_vinop(dn->d_inode));
+	}
+	if ((!prehash || *prehash) && d_unhashed(dn))
+		d_rehash(dn);
+	ceph_set_dentry_offset(dn);
+out:
+	return dn;
+}
+
 /*
  * Incorporate results into the local cache.  This is either just
  * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
@@ -1030,6 +1032,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			ceph_invalidate_dentry_lease(dn);
 
 			/* take overwritten dentry's readdir offset */
+			dout("dn %p gets %p offset %lld (old offset %lld)\n",
+			     req->r_old_dentry, dn, ceph_dentry(dn)->offset,
+			     ceph_dentry(req->r_old_dentry)->offset);
 			ceph_dentry(req->r_old_dentry)->offset =
 				ceph_dentry(dn)->offset;
 
@@ -1074,7 +1079,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 				goto done;
 			}
 			req->r_dentry = dn;  /* may have spliced */
-			ceph_set_dentry_offset(dn);
 			igrab(in);
 		} else if (ceph_ino(in) == vino.ino &&
 			   ceph_snap(in) == vino.snap) {
@@ -1117,7 +1121,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			err = PTR_ERR(dn);
 			goto done;
 		}
-		ceph_set_dentry_offset(dn);
 		req->r_dentry = dn;  /* may have spliced */
 		igrab(in);
 		rinfo->head->is_dentry = 1;  /* fool notrace handlers */

From 6e19a16ef28aee09dbcbb9f3ff24ac4f439def7d Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 29 Apr 2010 16:38:32 -0700
Subject: [PATCH 34/59] ceph: clean up mount options, ->show_options()

Ensure all options are included in /proc/mounts.  Some cleanup.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 128 +++++++++++++++++++++++++++++-------------------
 fs/ceph/super.h |  23 ++++-----
 2 files changed, 90 insertions(+), 61 deletions(-)

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index b3225bf63f58..34b16cb302fe 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -110,54 +110,6 @@ static int ceph_syncfs(struct super_block *sb, int wait)
 	return 0;
 }
 
-
-/**
- * ceph_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- */
-static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-	struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
-	struct ceph_mount_args *args = client->mount_args;
-
-	if (args->flags & CEPH_OPT_FSID)
-		seq_printf(m, ",fsidmajor=%llu,fsidminor%llu",
-			   le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
-			   le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
-	if (args->flags & CEPH_OPT_NOSHARE)
-		seq_puts(m, ",noshare");
-	if (args->flags & CEPH_OPT_DIRSTAT)
-		seq_puts(m, ",dirstat");
-	if ((args->flags & CEPH_OPT_RBYTES) == 0)
-		seq_puts(m, ",norbytes");
-	if (args->flags & CEPH_OPT_NOCRC)
-		seq_puts(m, ",nocrc");
-	if (args->flags & CEPH_OPT_NOASYNCREADDIR)
-		seq_puts(m, ",noasyncreaddir");
-	if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
-		seq_printf(m, ",snapdirname=%s", args->snapdir_name);
-	if (args->name)
-		seq_printf(m, ",name=%s", args->name);
-	if (args->secret)
-		seq_puts(m, ",secret=<hidden>");
-	return 0;
-}
-
-/*
- * caches
- */
-struct kmem_cache *ceph_inode_cachep;
-struct kmem_cache *ceph_cap_cachep;
-struct kmem_cache *ceph_dentry_cachep;
-struct kmem_cache *ceph_file_cachep;
-
-static void ceph_inode_init_once(void *foo)
-{
-	struct ceph_inode_info *ci = foo;
-	inode_init_once(&ci->vfs_inode);
-}
-
 static int default_congestion_kb(void)
 {
 	int congestion_kb;
@@ -187,6 +139,80 @@ static int default_congestion_kb(void)
 	return congestion_kb;
 }
 
+/**
+ * ceph_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ */
+static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
+	struct ceph_mount_args *args = client->mount_args;
+
+	if (args->flags & CEPH_OPT_FSID)
+		seq_printf(m, ",fsidmajor=%llu,fsidminor%llu",
+			   le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
+			   le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
+	if (args->flags & CEPH_OPT_NOSHARE)
+		seq_puts(m, ",noshare");
+	if (args->flags & CEPH_OPT_DIRSTAT)
+		seq_puts(m, ",dirstat");
+	if ((args->flags & CEPH_OPT_RBYTES) == 0)
+		seq_puts(m, ",norbytes");
+	if (args->flags & CEPH_OPT_NOCRC)
+		seq_puts(m, ",nocrc");
+	if (args->flags & CEPH_OPT_NOASYNCREADDIR)
+		seq_puts(m, ",noasyncreaddir");
+
+	if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+		seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
+	if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+		seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
+	if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
+		seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
+	if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+		seq_printf(m, ",osdkeepalivetimeout=%d",
+			 args->osd_keepalive_timeout);
+	if (args->wsize)
+		seq_printf(m, ",wsize=%d", args->wsize);
+	if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+		seq_printf(m, ",rsize=%d", args->rsize);
+	if (args->congestion_kb != default_congestion_kb())
+		seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
+	if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+		seq_printf(m, ",caps_wanted_delay_min=%d",
+			 args->caps_wanted_delay_min);
+	if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+		seq_printf(m, ",caps_wanted_delay_max=%d",
+			   args->caps_wanted_delay_max);
+	if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
+		seq_printf(m, ",cap_release_safety=%d",
+			   args->cap_release_safety);
+	if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+		seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
+	if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
+		seq_printf(m, ",snapdirname=%s", args->snapdir_name);
+	if (args->name)
+		seq_printf(m, ",name=%s", args->name);
+	if (args->secret)
+		seq_puts(m, ",secret=<hidden>");
+	return 0;
+}
+
+/*
+ * caches
+ */
+struct kmem_cache *ceph_inode_cachep;
+struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_dentry_cachep;
+struct kmem_cache *ceph_file_cachep;
+
+static void ceph_inode_init_once(void *foo)
+{
+	struct ceph_inode_info *ci = foo;
+	inode_init_once(&ci->vfs_inode);
+}
+
 static int __init init_caches(void)
 {
 	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -305,6 +331,7 @@ enum {
 	Opt_osd_idle_ttl,
 	Opt_caps_wanted_delay_min,
 	Opt_caps_wanted_delay_max,
+	Opt_cap_release_safety,
 	Opt_readdir_max_entries,
 	Opt_congestion_kb,
 	Opt_last_int,
@@ -336,6 +363,7 @@ static match_table_t arg_tokens = {
 	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
 	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
 	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+	{Opt_cap_release_safety, "cap_release_safety=%d"},
 	{Opt_readdir_max_entries, "readdir_max_entries=%d"},
 	{Opt_congestion_kb, "write_congestion_kb=%d"},
 	/* int args above */
@@ -385,8 +413,8 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
 	args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
 	args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
 	args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
-	args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
-	args->max_readdir = 1024;
+	args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+	args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
 	args->congestion_kb = default_congestion_kb();
 
 	/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 51b3ff238454..395adc5fcebb 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -52,24 +52,24 @@
 
 struct ceph_mount_args {
 	int sb_flags;
-	int num_mon;
-	struct ceph_entity_addr *mon_addr;
 	int flags;
-	int mount_timeout;
-	int osd_idle_ttl;
-	int caps_wanted_delay_min, caps_wanted_delay_max;
 	struct ceph_fsid fsid;
 	struct ceph_entity_addr my_addr;
-	int wsize;
-	int rsize;            /* max readahead */
-	int max_readdir;      /* max readdir size */
-	int congestion_kb;      /* max readdir size */
+	int num_mon;
+	struct ceph_entity_addr *mon_addr;
+	int mount_timeout;
+	int osd_idle_ttl;
 	int osd_timeout;
 	int osd_keepalive_timeout;
+	int wsize;
+	int rsize;            /* max readahead */
+	int congestion_kb;    /* max writeback in flight */
+	int caps_wanted_delay_min, caps_wanted_delay_max;
+	int cap_release_safety;
+	int max_readdir;      /* max readdir size */
 	char *snapdir_name;   /* default ".snap" */
 	char *name;
 	char *secret;
-	int cap_release_safety;
 };
 
 /*
@@ -80,13 +80,13 @@ struct ceph_mount_args {
 #define CEPH_OSD_KEEPALIVE_DEFAULT  5
 #define CEPH_OSD_IDLE_TTL_DEFAULT    60
 #define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
+#define CEPH_MAX_READDIR_DEFAULT    1024
 
 #define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
 #define CEPH_MSG_MAX_DATA_LEN	(16*1024*1024)
 
 #define CEPH_SNAPDIRNAME_DEFAULT ".snap"
 #define CEPH_AUTH_NAME_DEFAULT   "guest"
-
 /*
  * Delay telling the MDS we no longer want caps, in case we reopen
  * the file.  Delay a minimum amount of time, even if we send a cap
@@ -96,6 +96,7 @@ struct ceph_mount_args {
 #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
 #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
 
+#define CEPH_CAP_RELEASE_SAFETY_DEFAULT        (CEPH_CAPS_PER_RELEASE * 4)
 
 /* mount state */
 enum {

From b736b3d9d0ba52693701373d7cd88aaad8e5bed3 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 30 Apr 2010 12:45:02 -0700
Subject: [PATCH 35/59] ceph: adjust masked struct_v variable names

Reported-by: Bill Pemberton <wfp5p@virginia.edu>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/auth_x.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 18c1a1de5547..7b206231566d 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -127,7 +127,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 	int ret;
 	char *dbuf;
 	char *ticket_buf;
-	u8 struct_v;
+	u8 reply_struct_v;
 
 	dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
 	if (!dbuf)
@@ -139,14 +139,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		goto out_dbuf;
 
 	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
-	struct_v = ceph_decode_8(&p);
-	if (struct_v != 1)
+	reply_struct_v = ceph_decode_8(&p);
+	if (reply_struct_v != 1)
 		goto bad;
 	num = ceph_decode_32(&p);
 	dout("%d tickets\n", num);
 	while (num--) {
 		int type;
-		u8 struct_v;
+		u8 tkt_struct_v, blob_struct_v;
 		struct ceph_x_ticket_handler *th;
 		void *dp, *dend;
 		int dlen;
@@ -165,8 +165,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		type = ceph_decode_32(&p);
 		dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
 
-		struct_v = ceph_decode_8(&p);
-		if (struct_v != 1)
+		tkt_struct_v = ceph_decode_8(&p);
+		if (tkt_struct_v != 1)
 			goto bad;
 
 		th = get_ticket_handler(ac, type);
@@ -186,8 +186,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		dend = dbuf + dlen;
 		dp = dbuf;
 
-		struct_v = ceph_decode_8(&dp);
-		if (struct_v != 1)
+		tkt_struct_v = ceph_decode_8(&dp);
+		if (tkt_struct_v != 1)
 			goto bad;
 
 		memcpy(&old_key, &th->session_key, sizeof(old_key));
@@ -224,7 +224,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		tpend = tp + dlen;
 		dout(" ticket blob is %d bytes\n", dlen);
 		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
-		struct_v = ceph_decode_8(&tp);
+		blob_struct_v = ceph_decode_8(&tp);
 		new_secret_id = ceph_decode_64(&tp);
 		ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
 		if (ret)

From 56b7cf9581fa0486657102a6fb8efabc3eadeba1 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 3 May 2010 15:22:00 -0700
Subject: [PATCH 36/59] ceph: skip mds sync on forced unmount

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 35dbdad07b1c..0d451a83bc83 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2748,6 +2748,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
 	u64 want_tid, want_flush;
 
+	if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+		return;
+
 	dout("sync\n");
 	mutex_lock(&mdsc->mutex);
 	want_tid = mdsc->last_tid;

From 31e0cf8f6a1488b6ca69dcdceeaed107ecfd6463 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 4 May 2010 16:39:35 -0700
Subject: [PATCH 37/59] ceph: name bdi ceph-%d instead of major:minor

The bdi_setup_and_register() helper doesn't help us since we bdi_init() in
create_client() and bdi_register() only when sget() succeeds.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 34b16cb302fe..7f5b20dc4945 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -910,6 +910,8 @@ static int ceph_compare_super(struct super_block *sb, void *data)
 /*
  * construct our own bdi so we can control readahead, etc.
  */
+static atomic_long_t bdi_seq = ATOMIC_INIT(0);
+
 static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
 {
 	int err;
@@ -919,7 +921,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
 		client->backing_dev_info.ra_pages =
 			(client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
 			>> PAGE_SHIFT;
-	err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
+	err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
+			   atomic_long_inc_return(&bdi_seq));
 	if (!err)
 		sb->s_bdi = &client->backing_dev_info;
 	return err;

From 0f8605f2bde2c69737709765dfc574558ea35d4e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 5 May 2010 15:51:35 -0700
Subject: [PATCH 38/59] ceph: clean up cap release loop vs spinlock

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0d451a83bc83..824a9b2cde71 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1143,10 +1143,8 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
 	struct ceph_msg *msg;
 
 	dout("send_cap_releases mds%d\n", session->s_mds);
-	while (1) {
-		spin_lock(&session->s_cap_lock);
-		if (list_empty(&session->s_cap_releases_done))
-			break;
+	spin_lock(&session->s_cap_lock);
+	while (!list_empty(&session->s_cap_releases_done)) {
 		msg = list_first_entry(&session->s_cap_releases_done,
 				 struct ceph_msg, list_head);
 		list_del_init(&msg->list_head);
@@ -1154,6 +1152,7 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
 		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
 		dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
 		ceph_con_send(&session->s_con, msg);
+		spin_lock(&session->s_cap_lock);
 	}
 	spin_unlock(&session->s_cap_lock);
 }

From a5ee751c15016d0deee0d651e42a3b163ea73ade Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Fri, 7 May 2010 10:27:14 +0200
Subject: [PATCH 39/59] ceph: cleanup: remove unused assignement

We don't ever use "dirty" so we can remove it.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 74c74842c860..0e85d3c80790 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1718,10 +1718,9 @@ out_unlocked:
 static int caps_are_flushed(struct inode *inode, unsigned tid)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	int dirty, i, ret = 1;
+	int i, ret = 1;
 
 	spin_lock(&inode->i_lock);
-	dirty = __ceph_caps_dirty(ci);
 	for (i = 0; i < CEPH_CAP_BITS; i++)
 		if ((ci->i_flushing_caps & (1 << i)) &&
 		    ci->i_cap_flush_tid[i] <= tid) {

From dbad185d4939ffb806f6fa753ef9f470e3b72b62 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 25 Mar 2010 15:45:38 -0700
Subject: [PATCH 40/59] ceph: drop src address(es) from message header [new
 protocol feature]

The CEPH_FEATURE_NOSRCADDR protocol feature avoids putting the full source
address in each message header (twice).  This patch switches the client to
the new scheme, and _requires_ this feature on the server.  The server
will support both the old and new schemes.  That means an old client will
work with a new server, but a new client will not work with an old server.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h   | 13 +++++++++++--
 fs/ceph/messenger.c | 14 ++++++--------
 fs/ceph/msgr.h      | 20 +++++++++++++++++++-
 3 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 0c2241ef3653..8b194c6ef7d3 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -53,8 +53,17 @@
 /*
  * feature bits
  */
-#define CEPH_FEATURE_SUPPORTED  0
-#define CEPH_FEATURE_REQUIRED   0
+#define CEPH_FEATURE_UID        1
+#define CEPH_FEATURE_NOSRCADDR  2
+
+#define CEPH_FEATURE_SUPPORTED_MON  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_MON   CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_MDS  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_MDS   CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_OSD  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_OSD   CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
 
 
 /*
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index bb16edb1aef6..fe75ec7d8402 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -663,7 +663,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
 	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
 	     con->connect_seq, global_seq, proto);
 
-	con->out_connect.features = CEPH_FEATURE_SUPPORTED;
+	con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT;
 	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
 	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
 	con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1126,8 +1126,8 @@ static void fail_protocol(struct ceph_connection *con)
 
 static int process_connect(struct ceph_connection *con)
 {
-	u64 sup_feat = CEPH_FEATURE_SUPPORTED;
-	u64 req_feat = CEPH_FEATURE_REQUIRED;
+	u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
+	u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
 	u64 server_feat = le64_to_cpu(con->in_reply.features);
 
 	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1514,14 +1514,14 @@ static void process_message(struct ceph_connection *con)
 
 	/* if first message, set peer_name */
 	if (con->peer_name.type == 0)
-		con->peer_name = msg->hdr.src.name;
+		con->peer_name = msg->hdr.src;
 
 	con->in_seq++;
 	mutex_unlock(&con->mutex);
 
 	dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
 	     msg, le64_to_cpu(msg->hdr.seq),
-	     ENTITY_NAME(msg->hdr.src.name),
+	     ENTITY_NAME(msg->hdr.src),
 	     le16_to_cpu(msg->hdr.type),
 	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
 	     le32_to_cpu(msg->hdr.front_len),
@@ -1987,9 +1987,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 	}
 
 	/* set src+dst */
-	msg->hdr.src.name = con->msgr->inst.name;
-	msg->hdr.src.addr = con->msgr->my_enc_addr;
-	msg->hdr.orig_src = msg->hdr.src;
+	msg->hdr.src = con->msgr->inst.name;
 
 	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
 
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 8aaab414f3f8..6baa8e4f8e7f 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -120,7 +120,7 @@ struct ceph_msg_connect_reply {
 /*
  * message header
  */
-struct ceph_msg_header {
+struct ceph_msg_header_old {
 	__le64 seq;       /* message seq# for this session */
 	__le64 tid;       /* transaction id */
 	__le16 type;      /* message type */
@@ -138,6 +138,24 @@ struct ceph_msg_header {
 	__le32 crc;       /* header crc32c */
 } __attribute__ ((packed));
 
+struct ceph_msg_header {
+	__le64 seq;       /* message seq# for this session */
+	__le64 tid;       /* transaction id */
+	__le16 type;      /* message type */
+	__le16 priority;  /* priority.  higher value == higher priority */
+	__le16 version;   /* version of message encoding */
+
+	__le32 front_len; /* bytes in main payload */
+	__le32 middle_len;/* bytes in middle payload */
+	__le32 data_len;  /* bytes of data payload */
+	__le16 data_off;  /* sender: include full offset;
+			     receiver: mask against ~PAGE_MASK */
+
+	struct ceph_entity_name src;
+	__le32 reserved;
+	__le32 crc;       /* header crc32c */
+} __attribute__ ((packed));
+
 #define CEPH_MSG_PRIO_LOW     64
 #define CEPH_MSG_PRIO_DEFAULT 127
 #define CEPH_MSG_PRIO_HIGH    196

From f8c76f6f250edbdc9d2011b0e05d196a3d8ae895 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Thu, 22 Apr 2010 15:40:37 -0700
Subject: [PATCH 41/59] ceph: make mon client statfs handling more generic

This is being done so that we could reuse the statfs
infrastructure with other requests that return values.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/debugfs.c    | 13 ++++---
 fs/ceph/mon_client.c | 82 ++++++++++++++++++++++----------------------
 fs/ceph/mon_client.h | 15 ++++----
 3 files changed, 58 insertions(+), 52 deletions(-)

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f7048da92acc..3be33fb066cc 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p)
 static int monc_show(struct seq_file *s, void *p)
 {
 	struct ceph_client *client = s->private;
-	struct ceph_mon_statfs_request *req;
+	struct ceph_mon_generic_request *req;
 	struct ceph_mon_client *monc = &client->monc;
 	struct rb_node *rp;
 
@@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p)
 	if (monc->want_next_osdmap)
 		seq_printf(s, "want next osdmap\n");
 
-	for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
-		req = rb_entry(rp, struct ceph_mon_statfs_request, node);
-		seq_printf(s, "%lld statfs\n", req->tid);
+	for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
+		__u16 op;
+		req = rb_entry(rp, struct ceph_mon_generic_request, node);
+		op = le16_to_cpu(req->request->hdr.type);
+		if (op == CEPH_MSG_STATFS)
+			seq_printf(s, "%lld statfs\n", req->tid);
+		else
+			seq_printf(s, "%lld unknown\n", req->tid);
 	}
 
 	mutex_unlock(&monc->mutex);
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 9900706fee75..7df2f8c157f1 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -353,14 +353,14 @@ out:
 /*
  * statfs
  */
-static struct ceph_mon_statfs_request *__lookup_statfs(
+static struct ceph_mon_generic_request *__lookup_generic_req(
 	struct ceph_mon_client *monc, u64 tid)
 {
-	struct ceph_mon_statfs_request *req;
-	struct rb_node *n = monc->statfs_request_tree.rb_node;
+	struct ceph_mon_generic_request *req;
+	struct rb_node *n = monc->generic_request_tree.rb_node;
 
 	while (n) {
-		req = rb_entry(n, struct ceph_mon_statfs_request, node);
+		req = rb_entry(n, struct ceph_mon_generic_request, node);
 		if (tid < req->tid)
 			n = n->rb_left;
 		else if (tid > req->tid)
@@ -371,16 +371,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs(
 	return NULL;
 }
 
-static void __insert_statfs(struct ceph_mon_client *monc,
-			    struct ceph_mon_statfs_request *new)
+static void __insert_generic_request(struct ceph_mon_client *monc,
+			    struct ceph_mon_generic_request *new)
 {
-	struct rb_node **p = &monc->statfs_request_tree.rb_node;
+	struct rb_node **p = &monc->generic_request_tree.rb_node;
 	struct rb_node *parent = NULL;
-	struct ceph_mon_statfs_request *req = NULL;
+	struct ceph_mon_generic_request *req = NULL;
 
 	while (*p) {
 		parent = *p;
-		req = rb_entry(parent, struct ceph_mon_statfs_request, node);
+		req = rb_entry(parent, struct ceph_mon_generic_request, node);
 		if (new->tid < req->tid)
 			p = &(*p)->rb_left;
 		else if (new->tid > req->tid)
@@ -390,13 +390,13 @@ static void __insert_statfs(struct ceph_mon_client *monc,
 	}
 
 	rb_link_node(&new->node, parent, p);
-	rb_insert_color(&new->node, &monc->statfs_request_tree);
+	rb_insert_color(&new->node, &monc->generic_request_tree);
 }
 
-static void release_statfs_request(struct kref *kref)
+static void release_generic_request(struct kref *kref)
 {
-	struct ceph_mon_statfs_request *req =
-		container_of(kref, struct ceph_mon_statfs_request, kref);
+	struct ceph_mon_generic_request *req =
+		container_of(kref, struct ceph_mon_generic_request, kref);
 
 	if (req->reply)
 		ceph_msg_put(req->reply);
@@ -404,33 +404,33 @@ static void release_statfs_request(struct kref *kref)
 		ceph_msg_put(req->request);
 }
 
-static void put_statfs_request(struct ceph_mon_statfs_request *req)
+static void put_generic_request(struct ceph_mon_generic_request *req)
 {
-	kref_put(&req->kref, release_statfs_request);
+	kref_put(&req->kref, release_generic_request);
 }
 
-static void get_statfs_request(struct ceph_mon_statfs_request *req)
+static void get_generic_request(struct ceph_mon_generic_request *req)
 {
 	kref_get(&req->kref);
 }
 
-static struct ceph_msg *get_statfs_reply(struct ceph_connection *con,
+static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
 					 struct ceph_msg_header *hdr,
 					 int *skip)
 {
 	struct ceph_mon_client *monc = con->private;
-	struct ceph_mon_statfs_request *req;
+	struct ceph_mon_generic_request *req;
 	u64 tid = le64_to_cpu(hdr->tid);
 	struct ceph_msg *m;
 
 	mutex_lock(&monc->mutex);
-	req = __lookup_statfs(monc, tid);
+	req = __lookup_generic_req(monc, tid);
 	if (!req) {
-		dout("get_statfs_reply %lld dne\n", tid);
+		dout("get_generic_reply %lld dne\n", tid);
 		*skip = 1;
 		m = NULL;
 	} else {
-		dout("get_statfs_reply %lld got %p\n", tid, req->reply);
+		dout("get_generic_reply %lld got %p\n", tid, req->reply);
 		m = ceph_msg_get(req->reply);
 		/*
 		 * we don't need to track the connection reading into
@@ -445,7 +445,7 @@ static struct ceph_msg *get_statfs_reply(struct ceph_connection *con,
 static void handle_statfs_reply(struct ceph_mon_client *monc,
 				struct ceph_msg *msg)
 {
-	struct ceph_mon_statfs_request *req;
+	struct ceph_mon_generic_request *req;
 	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
 	u64 tid = le64_to_cpu(msg->hdr.tid);
 
@@ -454,21 +454,21 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
 	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
 
 	mutex_lock(&monc->mutex);
-	req = __lookup_statfs(monc, tid);
+	req = __lookup_generic_req(monc, tid);
 	if (req) {
-		*req->buf = reply->st;
+		*(struct ceph_statfs *)req->buf = reply->st;
 		req->result = 0;
-		get_statfs_request(req);
+		get_generic_request(req);
 	}
 	mutex_unlock(&monc->mutex);
 	if (req) {
 		complete(&req->completion);
-		put_statfs_request(req);
+		put_generic_request(req);
 	}
 	return;
 
 bad:
-	pr_err("corrupt statfs reply, no tid\n");
+	pr_err("corrupt generic reply, no tid\n");
 	ceph_msg_dump(msg);
 }
 
@@ -477,7 +477,7 @@ bad:
  */
 int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 {
-	struct ceph_mon_statfs_request *req;
+	struct ceph_mon_generic_request *req;
 	struct ceph_mon_statfs *h;
 	int err;
 
@@ -509,8 +509,8 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 	mutex_lock(&monc->mutex);
 	req->tid = ++monc->last_tid;
 	req->request->hdr.tid = cpu_to_le64(req->tid);
-	__insert_statfs(monc, req);
-	monc->num_statfs_requests++;
+	__insert_generic_request(monc, req);
+	monc->num_generic_requests++;
 	mutex_unlock(&monc->mutex);
 
 	/* send request and wait */
@@ -518,28 +518,28 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 	err = wait_for_completion_interruptible(&req->completion);
 
 	mutex_lock(&monc->mutex);
-	rb_erase(&req->node, &monc->statfs_request_tree);
-	monc->num_statfs_requests--;
+	rb_erase(&req->node, &monc->generic_request_tree);
+	monc->num_generic_requests--;
 	mutex_unlock(&monc->mutex);
 
 	if (!err)
 		err = req->result;
 
 out:
-	kref_put(&req->kref, release_statfs_request);
+	kref_put(&req->kref, release_generic_request);
 	return err;
 }
 
 /*
  * Resend pending statfs requests.
  */
-static void __resend_statfs(struct ceph_mon_client *monc)
+static void __resend_generic_request(struct ceph_mon_client *monc)
 {
-	struct ceph_mon_statfs_request *req;
+	struct ceph_mon_generic_request *req;
 	struct rb_node *p;
 
-	for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
-		req = rb_entry(p, struct ceph_mon_statfs_request, node);
+	for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_mon_generic_request, node);
 		ceph_con_send(monc->con, ceph_msg_get(req->request));
 	}
 }
@@ -652,8 +652,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	monc->sub_sent = 0;
 
 	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
-	monc->statfs_request_tree = RB_ROOT;
-	monc->num_statfs_requests = 0;
+	monc->generic_request_tree = RB_ROOT;
+	monc->num_generic_requests = 0;
 	monc->last_tid = 0;
 
 	monc->have_mdsmap = 0;
@@ -717,7 +717,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
 		monc->client->msgr->inst.name.num = monc->auth->global_id;
 
 		__send_subscribe(monc);
-		__resend_statfs(monc);
+		__resend_generic_request(monc);
 	}
 	mutex_unlock(&monc->mutex);
 }
@@ -809,7 +809,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 		m = ceph_msg_get(monc->m_subscribe_ack);
 		break;
 	case CEPH_MSG_STATFS_REPLY:
-		return get_statfs_reply(con, hdr, skip);
+		return get_generic_reply(con, hdr, skip);
 	case CEPH_MSG_AUTH_REPLY:
 		m = ceph_msg_get(monc->m_auth_reply);
 		break;
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index 0046deed0740..76887785a4d7 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -22,7 +22,7 @@ struct ceph_monmap {
 };
 
 struct ceph_mon_client;
-struct ceph_mon_statfs_request;
+struct ceph_mon_generic_request;
 
 
 /*
@@ -40,15 +40,16 @@ struct ceph_mon_request {
 };
 
 /*
- * statfs() is done a bit differently because we need to get data back
+ * ceph_mon_generic_request is being used for the statfs and poolop requests
+ * which are bening done a bit differently because we need to get data back
  * to the caller
  */
-struct ceph_mon_statfs_request {
+struct ceph_mon_generic_request {
 	struct kref kref;
 	u64 tid;
 	struct rb_node node;
 	int result;
-	struct ceph_statfs *buf;
+	void *buf;
 	struct completion completion;
 	struct ceph_msg *request;  /* original request */
 	struct ceph_msg *reply;    /* and reply */
@@ -71,9 +72,9 @@ struct ceph_mon_client {
 	struct ceph_connection *con;
 	bool have_fsid;
 
-	/* pending statfs requests */
-	struct rb_root statfs_request_tree;
-	int num_statfs_requests;
+	/* pending generic requests */
+	struct rb_root generic_request_tree;
+	int num_generic_requests;
 	u64 last_tid;
 
 	/* mds/osd map */

From e01a594646ebbf964b6058e3bf28125379063439 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 10 May 2010 15:36:44 -0700
Subject: [PATCH 42/59] ceph: dicard cap releases on mds restart

If the MDS restarts, the expire caps state is no longer shared, and can be
thrown out.  Caps state will be rebuilt on the MDS during the reconnect
process that follows.  Zero out any release messages and adjust the
release counter accordingly.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 824a9b2cde71..a4d9e5b0fd3d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1157,6 +1157,44 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
 	spin_unlock(&session->s_cap_lock);
 }
 
+static void discard_cap_releases(struct ceph_mds_client *mdsc,
+				 struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_cap_release *head;
+	unsigned num;
+
+	dout("discard_cap_releases mds%d\n", session->s_mds);
+	spin_lock(&session->s_cap_lock);
+
+	/* zero out the in-progress message */
+	msg = list_first_entry(&session->s_cap_releases,
+			       struct ceph_msg, list_head);
+	head = msg->front.iov_base;
+	num = le32_to_cpu(head->num);
+	dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
+	head->num = cpu_to_le32(0);
+	session->s_num_cap_releases += num;
+
+	/* requeue completed messages */
+	while (!list_empty(&session->s_cap_releases_done)) {
+		msg = list_first_entry(&session->s_cap_releases_done,
+				 struct ceph_msg, list_head);
+		list_del_init(&msg->list_head);
+
+		head = msg->front.iov_base;
+		num = le32_to_cpu(head->num);
+		dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
+		     num);
+		session->s_num_cap_releases += num;
+		head->num = cpu_to_le32(0);
+		msg->front.iov_len = sizeof(*head);
+		list_add(&msg->list_head, &session->s_cap_releases);
+	}
+
+	spin_unlock(&session->s_cap_lock);
+}
+
 /*
  * requests
  */
@@ -2183,6 +2221,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 	dout("session %p state %s\n", session,
 	     session_state_name(session->s_state));
 
+	/* drop old cap expires; we're about to reestablish that state */
+	discard_cap_releases(mdsc, session);
+
 	/* traverse this session's caps */
 	err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
 	if (err)

From aab53dd9e81ccefa7b8d88eec5138dd73639a783 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 17 Mar 2010 16:30:21 -0700
Subject: [PATCH 43/59] ceph: only send cap releases when mds is OPEN|HUNG

On OPENING we shouldn't have any caps (or releases).
On CLOSING, we should wait until we succeed (and throw it all out), or
don't (and are OPEN again).
On RECONNECTING we can wait until we are OPEN.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a4d9e5b0fd3d..d45787470fb5 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2647,7 +2647,9 @@ static void delayed_work(struct work_struct *work)
 		else
 			ceph_con_keepalive(&s->s_con);
 		add_cap_releases(mdsc, s, -1);
-		send_cap_releases(mdsc, s);
+		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
+		    s->s_state == CEPH_MDS_SESSION_HUNG)
+			send_cap_releases(mdsc, s);
 		mutex_unlock(&s->s_mutex);
 		ceph_put_mds_session(s);
 

From 29790f26ab3e63b2a083f0811b80e2f086e4fcb2 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 18 Mar 2010 14:45:05 -0700
Subject: [PATCH 44/59] ceph: wait for mds OPEN reply to indicate reconnect
 success

We used to infer reconnect success by watching the MDS state, essentially
assuming that hearing nothing meant things were ok.  That wasn't
particularly reliable.  Instead, the MDS replies with an explicit OPEN
message to indicate success.

Strictly speaking, this is a protocol change, but it is a backwards
compatible one that does not break new clients + old servers or old
clients + new servers.  At least not yet.

Drop unused @all argument from kick_requests while we're at it.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d45787470fb5..972ec5ae2ac4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1660,10 +1660,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
 
 /*
  * Wake up threads with requests pending for @mds, so that they can
- * resubmit their requests to a possibly different mds.  If @all is set,
- * wake up if their requests has been forwarded to @mds, too.
+ * resubmit their requests to a possibly different mds.
  */
-static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
+static void kick_requests(struct ceph_mds_client *mdsc, int mds)
 {
 	struct ceph_mds_request *req;
 	struct rb_node *p;
@@ -2026,6 +2025,8 @@ static void handle_session(struct ceph_mds_session *session,
 
 	switch (op) {
 	case CEPH_SESSION_OPEN:
+		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+			pr_info("mds%d reconnect success\n", session->s_mds);
 		session->s_state = CEPH_MDS_SESSION_OPEN;
 		renewed_caps(mdsc, session, 0);
 		wake = 1;
@@ -2039,10 +2040,12 @@ static void handle_session(struct ceph_mds_session *session,
 		break;
 
 	case CEPH_SESSION_CLOSE:
+		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+			pr_info("mds%d reconnect denied\n", session->s_mds);
 		remove_session_caps(session);
 		wake = 1; /* for good measure */
 		complete(&mdsc->session_close_waiters);
-		kick_requests(mdsc, mds, 0);      /* cur only */
+		kick_requests(mdsc, mds);
 		break;
 
 	case CEPH_SESSION_STALE:
@@ -2258,7 +2261,6 @@ send:
 	reply->nr_pages = calc_pages_for(0, pagelist->length);
 	ceph_con_send(&session->s_con, reply);
 
-	session->s_state = CEPH_MDS_SESSION_OPEN;
 	mutex_unlock(&session->s_mutex);
 
 	mutex_lock(&mdsc->mutex);
@@ -2334,7 +2336,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
 			}
 
 			/* kick any requests waiting on the recovering mds */
-			kick_requests(mdsc, i, 1);
+			kick_requests(mdsc, i);
 		} else if (oldstate == newstate) {
 			continue;  /* nothing new with this mds */
 		}
@@ -2347,18 +2349,14 @@ static void check_new_map(struct ceph_mds_client *mdsc,
 			send_mds_reconnect(mdsc, i);
 
 		/*
-		 * kick requests on any mds that has gone active.
-		 *
-		 * kick requests on cur or forwarder: we may have sent
-		 * the request to mds1, mds1 told us it forwarded it
-		 * to mds2, but then we learn mds1 failed and can't be
-		 * sure it successfully forwarded our request before
-		 * it died.
+		 * kick request on any mds that has gone active.
 		 */
 		if (oldstate < CEPH_MDS_STATE_ACTIVE &&
 		    newstate >= CEPH_MDS_STATE_ACTIVE) {
-			pr_info("mds%d reconnect completed\n", s->s_mds);
-			kick_requests(mdsc, i, 1);
+			if (oldstate != CEPH_MDS_STATE_CREATING &&
+			    oldstate != CEPH_MDS_STATE_STARTING)
+				pr_info("mds%d recovery completed\n", s->s_mds);
+			kick_requests(mdsc, i);
 			ceph_kick_flushing_caps(mdsc, s);
 			wake_up_session_caps(s, 1);
 		}

From 34b6c855fafc54ef130649809cd580f98e3f8416 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 10 May 2010 16:31:25 -0700
Subject: [PATCH 45/59] ceph: clean up send_mds_reconnect interface

Pass a ceph_mds_session, since the caller has it.

Remove the dead code for sending empty reconnects.  It used to be used
when the MDS contacted _us_ to solicit a reconnect, and we could reply
saying "go away, I have no session."  Now we only send reconnects based
on the mds map, and only when we do in fact have an open session.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 47 +++++++++++++++-----------------------------
 1 file changed, 16 insertions(+), 31 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 972ec5ae2ac4..e310b4be5588 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2177,50 +2177,38 @@ out:
  *
  * called with mdsc->mutex held.
  */
-static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
+static void send_mds_reconnect(struct ceph_mds_client *mdsc,
+			       struct ceph_mds_session *session)
 {
-	struct ceph_mds_session *session = NULL;
 	struct ceph_msg *reply;
 	struct rb_node *p;
+	int mds = session->s_mds;
 	int err = -ENOMEM;
 	struct ceph_pagelist *pagelist;
 
-	pr_info("reconnect to recovering mds%d\n", mds);
+	pr_info("mds%d reconnect start\n", mds);
 
 	pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
 	if (!pagelist)
 		goto fail_nopagelist;
 	ceph_pagelist_init(pagelist);
 
-	err = -ENOMEM;
 	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0);
 	if (!reply)
 		goto fail_nomsg;
 
-	/* find session */
-	session = __ceph_lookup_mds_session(mdsc, mds);
-	mutex_unlock(&mdsc->mutex);    /* drop lock for duration */
+	mutex_lock(&session->s_mutex);
+	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
+	session->s_seq = 0;
 
-	if (session) {
-		mutex_lock(&session->s_mutex);
+	ceph_con_open(&session->s_con,
+		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
 
-		session->s_state = CEPH_MDS_SESSION_RECONNECTING;
-		session->s_seq = 0;
-
-		ceph_con_open(&session->s_con,
-			      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
-
-		/* replay unsafe requests */
-		replay_unsafe_requests(mdsc, session);
-	} else {
-		dout("no session for mds%d, will send short reconnect\n",
-		     mds);
-	}
+	/* replay unsafe requests */
+	replay_unsafe_requests(mdsc, session);
 
 	down_read(&mdsc->snap_rwsem);
 
-	if (!session)
-		goto send;
 	dout("session %p state %s\n", session,
 	     session_state_name(session->s_state));
 
@@ -2255,7 +2243,6 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 			goto fail;
 	}
 
-send:
 	reply->pagelist = pagelist;
 	reply->hdr.data_len = cpu_to_le32(pagelist->length);
 	reply->nr_pages = calc_pages_for(0, pagelist->length);
@@ -2267,23 +2254,18 @@ send:
 	__wake_requests(mdsc, &session->s_waiting);
 	mutex_unlock(&mdsc->mutex);
 
-	ceph_put_mds_session(session);
-
 	up_read(&mdsc->snap_rwsem);
-	mutex_lock(&mdsc->mutex);
 	return;
 
 fail:
 	ceph_msg_put(reply);
 	up_read(&mdsc->snap_rwsem);
 	mutex_unlock(&session->s_mutex);
-	ceph_put_mds_session(session);
 fail_nomsg:
 	ceph_pagelist_release(pagelist);
 	kfree(pagelist);
 fail_nopagelist:
 	pr_err("error %d preparing reconnect for mds%d\n", err, mds);
-	mutex_lock(&mdsc->mutex);
 	return;
 }
 
@@ -2345,8 +2327,11 @@ static void check_new_map(struct ceph_mds_client *mdsc,
 		 * send reconnect?
 		 */
 		if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
-		    newstate >= CEPH_MDS_STATE_RECONNECT)
-			send_mds_reconnect(mdsc, i);
+		    newstate >= CEPH_MDS_STATE_RECONNECT) {
+			mutex_unlock(&mdsc->mutex);
+			send_mds_reconnect(mdsc, s);
+			mutex_lock(&mdsc->mutex);
+		}
 
 		/*
 		 * kick request on any mds that has gone active.

From 7e70f0ed9f3ee47394576be86c593f66832413e9 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 18 Mar 2010 13:59:12 -0700
Subject: [PATCH 46/59] ceph: attempt mds reconnect if mds closes our session

Currently, if our session is closed (due to a timeout, or explicit close,
or whatever), we just sit there doing nothing unless/until the MDS
restarts, at which point we try to reconnect.

Change client to attempt an immediate reconnect if our session is closed.

Note that currently the MDS doesn't support this, and our attempt will
fail.  We'll get a session CLOSE, our caps and dirty cap state will be
dropped, and the client will be free to attempt to reconnect.  That's
clearly not as nice as a successful reconnect, but it at least allows us
to try to carry on, and in the future the MDS will support a reconnect
and we will fare better.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index e310b4be5588..525085f36db9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2958,9 +2958,10 @@ static void con_put(struct ceph_connection *con)
 static void peer_reset(struct ceph_connection *con)
 {
 	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_client *mdsc = s->s_mdsc;
 
-	pr_err("mds%d gave us the boot.  IMPLEMENT RECONNECT.\n",
-	       s->s_mds);
+	pr_warning("mds%d closed our session\n", s->s_mds);
+	send_mds_reconnect(mdsc, s);
 }
 
 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)

From 6c99f2545dbb9e53afe0d1d037c51ab04ef1ff4e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 10 May 2010 16:12:25 -0700
Subject: [PATCH 47/59] ceph: throw out dirty caps metadata, data on session
 teardown

The remove_session_caps() helper is called when an MDS closes out our
session (either normally, or as a result of a failed reconnect), and when
we tear down state for umount.  If we remove the last cap, and there are
no cap migrations in progress, then there is little hope of us flushing
out that data to the mds (without heroic efforts to reconnect and flush).

So, to avoid leaving inodes pinned (due to dirty state) and crashing after
umount, throw out dirty caps state and unpin the inodes.  Print a warning
to the console so we know something was lost.

NOTE: Although we drop wrbuffer refs, we don't actually mark pages clean;
maybe a truncate should be queued?

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 44 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 525085f36db9..114bada97c16 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -799,12 +799,49 @@ out:
 }
 
 static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
-				   void *arg)
+				  void *arg)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	int drop = 0;
+
 	dout("removing cap %p, ci is %p, inode is %p\n",
 	     cap, ci, &ci->vfs_inode);
-	ceph_remove_cap(cap);
+	spin_lock(&inode->i_lock);
+	__ceph_remove_cap(cap);
+	if (!__ceph_is_any_real_caps(ci)) {
+		struct ceph_mds_client *mdsc =
+			&ceph_sb_to_client(inode->i_sb)->mdsc;
+
+		spin_lock(&mdsc->cap_dirty_lock);
+		if (!list_empty(&ci->i_dirty_item)) {
+			pr_info(" dropping dirty %s state for %p %lld\n",
+				ceph_cap_string(ci->i_dirty_caps),
+				inode, ceph_ino(inode));
+			ci->i_dirty_caps = 0;
+			list_del_init(&ci->i_dirty_item);
+			drop = 1;
+		}
+		if (!list_empty(&ci->i_flushing_item)) {
+			pr_info(" dropping dirty+flushing %s state for %p %lld\n",
+				ceph_cap_string(ci->i_flushing_caps),
+				inode, ceph_ino(inode));
+			ci->i_flushing_caps = 0;
+			list_del_init(&ci->i_flushing_item);
+			mdsc->num_cap_flushing--;
+			drop = 1;
+		}
+		if (drop && ci->i_wrbuffer_ref) {
+			pr_info(" dropping dirty data for %p %lld\n",
+				inode, ceph_ino(inode));
+			ci->i_wrbuffer_ref = 0;
+			ci->i_wrbuffer_ref_head = 0;
+			drop++;
+		}
+		spin_unlock(&mdsc->cap_dirty_lock);
+	}
+	spin_unlock(&inode->i_lock);
+	while (drop--)
+		iput(inode);
 	return 0;
 }
 
@@ -816,6 +853,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
 	dout("remove_session_caps on %p\n", session);
 	iterate_session_caps(session, remove_session_caps_cb, NULL);
 	BUG_ON(session->s_nr_caps > 0);
+	BUG_ON(!list_empty(&session->s_cap_flushing));
 	cleanup_cap_releases(session);
 }
 
@@ -1281,7 +1319,7 @@ retry:
 			len += 1 + temp->d_name.len;
 		temp = temp->d_parent;
 		if (temp == NULL) {
-			pr_err("build_path_dentry corrupt dentry %p\n", dentry);
+			pr_err("build_path corrupt dentry %p\n", dentry);
 			return ERR_PTR(-EINVAL);
 		}
 	}

From 1a75627896fe67d0124eab6fe2f83dd40188c40c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 11 May 2010 11:40:25 -0700
Subject: [PATCH 48/59] ceph: use ceph. prefix for virtual xattrs

Drop the 'user.' prefix and use just 'ceph.' for fs virtual xattrs.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 7185d0794df3..68aeebc69681 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -7,7 +7,8 @@
 
 static bool ceph_is_valid_xattr(const char *name)
 {
-	return !strncmp(name, XATTR_SECURITY_PREFIX,
+	return !strncmp(name, "ceph.", 5) ||
+	       !strncmp(name, XATTR_SECURITY_PREFIX,
 			XATTR_SECURITY_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
@@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
 }
 
 static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
-	{ true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
-	{ true, "user.ceph.dir.files", ceph_vxattrcb_files},
-	{ true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
-	{ true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
-	{ true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
-	{ true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
-	{ true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
-	{ true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
+	{ true, "ceph.dir.entries", ceph_vxattrcb_entries},
+	{ true, "ceph.dir.files", ceph_vxattrcb_files},
+	{ true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
+	{ true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
+	{ true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
+	{ true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
+	{ true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
+	{ true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
 	{ true, NULL, NULL }
 };
 
@@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 }
 
 static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
-	{ true, "user.ceph.layout", ceph_vxattrcb_layout},
+	{ true, "ceph.layout", ceph_vxattrcb_layout},
 	{ NULL, NULL }
 };
 

From ca9d93a292e327bbcddd8f8ea4197397e35097d4 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 12 May 2010 14:48:20 -0700
Subject: [PATCH 49/59] ceph: resync headers with userland

Notable changes include pool op defines and types, FLOCK feature bit, and
new CMPXATTR osd ops.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h      | 50 +++++++++++++++++++++++++++++++++++++++---
 fs/ceph/ceph_strings.c | 23 ++++++++++++++++++-
 fs/ceph/messenger.c    | 12 ----------
 fs/ceph/messenger.h    |  4 +---
 fs/ceph/msgr.h         |  1 -
 fs/ceph/rados.h        | 23 +++++++++++++++++--
 6 files changed, 91 insertions(+), 22 deletions(-)

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 8b194c6ef7d3..715e39ce517a 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -19,7 +19,7 @@
  * Ceph release version
  */
 #define CEPH_VERSION_MAJOR 0
-#define CEPH_VERSION_MINOR 19
+#define CEPH_VERSION_MINOR 20
 #define CEPH_VERSION_PATCH 0
 
 #define _CEPH_STRINGIFY(x) #x
@@ -36,7 +36,7 @@
  * client-facing protocol.
  */
 #define CEPH_OSD_PROTOCOL     8 /* cluster internal */
-#define CEPH_MDS_PROTOCOL     9 /* cluster internal */
+#define CEPH_MDS_PROTOCOL    12 /* cluster internal */
 #define CEPH_MON_PROTOCOL     5 /* cluster internal */
 #define CEPH_OSDC_PROTOCOL   24 /* server/client */
 #define CEPH_MDSC_PROTOCOL   32 /* server/client */
@@ -55,10 +55,11 @@
  */
 #define CEPH_FEATURE_UID        1
 #define CEPH_FEATURE_NOSRCADDR  2
+#define CEPH_FEATURE_FLOCK      4
 
 #define CEPH_FEATURE_SUPPORTED_MON  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
 #define CEPH_FEATURE_REQUIRED_MON   CEPH_FEATURE_UID
-#define CEPH_FEATURE_SUPPORTED_MDS  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_SUPPORTED_MDS  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
 #define CEPH_FEATURE_REQUIRED_MDS   CEPH_FEATURE_UID
 #define CEPH_FEATURE_SUPPORTED_OSD  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
 #define CEPH_FEATURE_REQUIRED_OSD   CEPH_FEATURE_UID
@@ -100,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 #define CEPH_AUTH_NONE	 	0x1
 #define CEPH_AUTH_CEPHX	 	0x2
 
+#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+
 
 /*********************************************
  * message layer
@@ -137,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 #define CEPH_MSG_CLIENT_SNAP            0x312
 #define CEPH_MSG_CLIENT_CAPRELEASE      0x313
 
+/* pool ops */
+#define CEPH_MSG_POOLOP_REPLY           48
+#define CEPH_MSG_POOLOP                 49
+
+
 /* osd */
 #define CEPH_MSG_OSD_MAP          41
 #define CEPH_MSG_OSD_OP           42
 #define CEPH_MSG_OSD_OPREPLY      43
 
+/* pool operations */
+enum {
+  POOL_OP_CREATE			= 0x01,
+  POOL_OP_DELETE			= 0x02,
+  POOL_OP_AUID_CHANGE			= 0x03,
+  POOL_OP_CREATE_SNAP			= 0x11,
+  POOL_OP_DELETE_SNAP			= 0x12,
+  POOL_OP_CREATE_UNMANAGED_SNAP		= 0x21,
+  POOL_OP_DELETE_UNMANAGED_SNAP		= 0x22,
+};
+
 struct ceph_mon_request_header {
 	__le64 have_version;
 	__le16 session_mon;
@@ -164,6 +183,31 @@ struct ceph_mon_statfs_reply {
 	struct ceph_statfs st;
 } __attribute__ ((packed));
 
+const char *ceph_pool_op_name(int op);
+
+struct ceph_mon_poolop {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 pool;
+	__le32 op;
+	__le64 auid;
+	__le64 snapid;
+	__le32 name_len;
+} __attribute__ ((packed));
+
+struct ceph_mon_poolop_reply {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 reply_code;
+	__le32 epoch;
+	char has_data;
+	char data[0];
+} __attribute__ ((packed));
+
+struct ceph_mon_unmanaged_snap {
+	__le64 snapid;
+} __attribute__ ((packed));
+
 struct ceph_osd_getmap {
 	struct ceph_mon_request_header monhdr;
 	struct ceph_fsid fsid;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 8e4be6a80c62..78908301ff17 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type)
 	case CEPH_ENTITY_TYPE_OSD: return "osd";
 	case CEPH_ENTITY_TYPE_MON: return "mon";
 	case CEPH_ENTITY_TYPE_CLIENT: return "client";
-	case CEPH_ENTITY_TYPE_ADMIN: return "admin";
 	case CEPH_ENTITY_TYPE_AUTH: return "auth";
 	default: return "unknown";
 	}
@@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op)
 	case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
 	case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
 	case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+	case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
 
 	case CEPH_OSD_OP_PULL: return "pull";
 	case CEPH_OSD_OP_PUSH: return "push";
@@ -174,3 +174,24 @@ const char *ceph_snap_op_name(int o)
 	}
 	return "???";
 }
+
+const char *ceph_pool_op_name(int op) {
+  switch (op) {
+  case POOL_OP_CREATE:
+    return "create pool";
+  case POOL_OP_DELETE:
+    return "delete pool";
+  case POOL_OP_AUID_CHANGE:
+    return "change auid";
+  case POOL_OP_CREATE_SNAP:
+    return "create snap";
+  case POOL_OP_DELETE_SNAP:
+    return "delete snap";
+  case POOL_OP_CREATE_UNMANAGED_SNAP:
+    return "create unmanaged snap";
+  case POOL_OP_DELETE_UNMANAGED_SNAP:
+    return "delete unmanaged snap";
+  default:
+    return "unknown";
+  }
+}
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index fe75ec7d8402..d18a544490f6 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -39,18 +39,6 @@ static void queue_con(struct ceph_connection *con);
 static void con_work(struct work_struct *);
 static void ceph_fault(struct ceph_connection *con);
 
-const char *ceph_name_type_str(int t)
-{
-	switch (t) {
-	case CEPH_ENTITY_TYPE_MON: return "mon";
-	case CEPH_ENTITY_TYPE_MDS: return "mds";
-	case CEPH_ENTITY_TYPE_OSD: return "osd";
-	case CEPH_ENTITY_TYPE_CLIENT: return "client";
-	case CEPH_ENTITY_TYPE_ADMIN: return "admin";
-	default: return "???";
-	}
-}
-
 /*
  * nicely render a sockaddr as a string.
  */
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index e56564f7e71c..4e5764c7fbc2 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -49,10 +49,8 @@ struct ceph_connection_operations {
 					int *skip);
 };
 
-extern const char *ceph_name_type_str(int t);
-
 /* use format string %s%d */
-#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
+#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
 
 struct ceph_messenger {
 	struct ceph_entity_inst inst;    /* my name+address */
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 6baa8e4f8e7f..892a0298dfdf 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -50,7 +50,6 @@ struct ceph_entity_name {
 #define CEPH_ENTITY_TYPE_MDS    0x02
 #define CEPH_ENTITY_TYPE_OSD    0x04
 #define CEPH_ENTITY_TYPE_CLIENT 0x08
-#define CEPH_ENTITY_TYPE_ADMIN  0x10
 #define CEPH_ENTITY_TYPE_AUTH   0x20
 
 #define CEPH_ENTITY_TYPE_ANY    0xFF
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index fd56451a871f..8fcc023056c7 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -101,8 +101,8 @@ struct ceph_pg_pool {
 	__le64 snap_seq;          /* seq for per-pool snapshot */
 	__le32 snap_epoch;        /* epoch of last snap */
 	__le32 num_snaps;
-	__le32 num_removed_snap_intervals;
-	__le64 uid;
+	__le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
+	__le64 auid;               /* who owns the pg */
 } __attribute__ ((packed));
 
 /*
@@ -208,6 +208,7 @@ enum {
 	/* read */
 	CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
 	CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
+	CEPH_OSD_OP_CMPXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
 
 	/* write */
 	CEPH_OSD_OP_SETXATTR  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -305,6 +306,22 @@ enum {
 #define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
 #define EBLACKLISTED ESHUTDOWN /* blacklisted */
 
+/* xattr comparison */
+enum {
+	CEPH_OSD_CMPXATTR_OP_NOP = 0,
+	CEPH_OSD_CMPXATTR_OP_EQ  = 1,
+	CEPH_OSD_CMPXATTR_OP_NE  = 2,
+	CEPH_OSD_CMPXATTR_OP_GT  = 3,
+	CEPH_OSD_CMPXATTR_OP_GTE = 4,
+	CEPH_OSD_CMPXATTR_OP_LT  = 5,
+	CEPH_OSD_CMPXATTR_OP_LTE = 6
+};
+
+enum {
+	CEPH_OSD_CMPXATTR_MODE_STRING = 1,
+	CEPH_OSD_CMPXATTR_MODE_U64    = 2
+};
+
 /*
  * an individual object operation.  each may be accompanied by some data
  * payload
@@ -321,6 +338,8 @@ struct ceph_osd_op {
 		struct {
 			__le32 name_len;
 			__le32 value_len;
+			__u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
+			__u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
 		} __attribute__ ((packed)) xattr;
 		struct {
 			__u8 class_len;

From aba558e28ac40a598542d995c09efa8439ee3ed4 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 12 May 2010 15:23:30 -0700
Subject: [PATCH 50/59] ceph: save peer feature bits in connection structure

These are used for adjusting behavior, such as conditionally encoding a
newer message format.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 1 +
 fs/ceph/messenger.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index d18a544490f6..50c5f24086fd 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1223,6 +1223,7 @@ static int process_connect(struct ceph_connection *con)
 		clear_bit(CONNECTING, &con->state);
 		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
 		con->connect_seq++;
+		con->peer_features = server_feat;
 		dout("process_connect got READY gseq %d cseq %d (%d)\n",
 		     con->peer_global_seq,
 		     le32_to_cpu(con->in_reply.connect_seq),
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 4e5764c7fbc2..889f81f093c9 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -142,6 +142,7 @@ struct ceph_connection {
 	struct ceph_entity_addr peer_addr; /* peer address */
 	struct ceph_entity_name peer_name; /* peer name */
 	struct ceph_entity_addr peer_addr_for_me;
+	unsigned peer_features;
 	u32 connect_seq;      /* identify the most recent connection
 				 attempt for this connection, client */
 	u32 peer_global_seq;  /* peer's global seq for this connection */

From 85792d0dd6e7a7a18fba55c97e49871211b28fe0 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 13 May 2010 09:06:02 -0700
Subject: [PATCH 51/59] ceph: cope with out of order (unsafe after safe) mds
 reply

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 114bada97c16..40dd437a26a9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1871,6 +1871,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 		mutex_unlock(&mdsc->mutex);
 		goto out;
 	}
+	if (req->r_got_safe && !head->safe) {
+		pr_warning("got unsafe after safe on %llu from mds%d\n",
+			   tid, mds);
+		mutex_unlock(&mdsc->mutex);
+		goto out;
+	}
 
 	result = le32_to_cpu(head->result);
 

From 167c9e352deb7e25568c926c49c3eafad69cbe76 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 14 May 2010 10:02:57 -0700
Subject: [PATCH 52/59] ceph: use common helper for aborted dir request
 invalidation

We invalidate I_COMPLETE and dentry leases in two places: on aborted mds
request and on request replay.  Use common helper to avoid duplicate code.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c      | 17 ++---------------
 fs/ceph/mds_client.c | 39 +++++++++++++++++++++++----------------
 fs/ceph/mds_client.h |  2 ++
 3 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 1bcf98bd0255..a81b8b662c7b 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -941,21 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 
 	if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
 		dout("fill_trace reply is empty!\n");
-		if (rinfo->head->result == 0 && req->r_locked_dir) {
-			struct ceph_inode_info *ci =
-				ceph_inode(req->r_locked_dir);
-			dout(" clearing %p complete (empty trace)\n",
-			     req->r_locked_dir);
-			spin_lock(&req->r_locked_dir->i_lock);
-			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
-			ci->i_release_count++;
-			spin_unlock(&req->r_locked_dir->i_lock);
-
-			if (req->r_dentry)
-				ceph_invalidate_dentry_lease(req->r_dentry);
-			if (req->r_old_dentry)
-				ceph_invalidate_dentry_lease(req->r_old_dentry);
-		}
+		if (rinfo->head->result == 0 && req->r_locked_dir)
+			ceph_invalidate_dir_request(req);
 		return 0;
 	}
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 40dd437a26a9..5c17ab44d02d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1794,22 +1794,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 		mutex_unlock(&req->r_fill_mutex);
 
 		if (req->r_locked_dir &&
-		    (req->r_op & CEPH_MDS_OP_WRITE)) {
-			struct ceph_inode_info *ci =
-				ceph_inode(req->r_locked_dir);
-
-			dout("aborted, clearing I_COMPLETE on %p, leases\n",
-			     req->r_locked_dir);
-			spin_lock(&req->r_locked_dir->i_lock);
-			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
-			ci->i_release_count++;
-			spin_unlock(&req->r_locked_dir->i_lock);
-
-			if (req->r_dentry)
-				ceph_invalidate_dentry_lease(req->r_dentry);
-			if (req->r_old_dentry)
-				ceph_invalidate_dentry_lease(req->r_old_dentry);
-		}
+		    (req->r_op & CEPH_MDS_OP_WRITE))
+			ceph_invalidate_dir_request(req);
 	} else {
 		err = req->r_err;
 	}
@@ -1820,6 +1806,27 @@ out:
 	return err;
 }
 
+/*
+ * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
+ * namespace request.
+ */
+void ceph_invalidate_dir_request(struct ceph_mds_request *req)
+{
+	struct inode *inode = req->r_locked_dir;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
+	spin_lock(&inode->i_lock);
+	ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+	ci->i_release_count++;
+	spin_unlock(&inode->i_lock);
+
+	if (req->r_dentry)
+		ceph_invalidate_dentry_lease(req->r_dentry);
+	if (req->r_old_dentry)
+		ceph_invalidate_dentry_lease(req->r_old_dentry);
+}
+
 /*
  * Handle mds reply.
  *
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 141a265bda75..d9936c4f1212 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -303,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
 				    struct inode *inode,
 				    struct dentry *dn, int mask);
 
+extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
+
 extern struct ceph_mds_request *
 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
 extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,

From cffe7b6d8cc029d13524acedf4917210dc0102ab Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Thu, 13 May 2010 22:07:29 +0200
Subject: [PATCH 53/59] ceph: Use kzalloc

Use kzalloc rather than the combination of kmalloc and memset.

The semantic patch that makes this change is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@@
expression x,size,flags;
statement S;
@@

-x = kmalloc(size,flags);
+x = kzalloc(size,flags);
 if (x == NULL) S
-memset(x, 0, size);
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mon_client.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 7df2f8c157f1..7062b889ab2f 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -481,11 +481,10 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 	struct ceph_mon_statfs *h;
 	int err;
 
-	req = kmalloc(sizeof(*req), GFP_NOFS);
+	req = kzalloc(sizeof(*req), GFP_NOFS);
 	if (!req)
 		return -ENOMEM;
 
-	memset(req, 0, sizeof(*req));
 	kref_init(&req->kref);
 	req->buf = buf;
 	init_completion(&req->completion);

From 366837706bae00abc2edd75add2579c1be18b2b8 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 14 May 2010 11:36:48 -0700
Subject: [PATCH 54/59] ceph: cleanup pool op strings

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_strings.c | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 78908301ff17..7503aee828ce 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -175,23 +175,16 @@ const char *ceph_snap_op_name(int o)
 	return "???";
 }
 
-const char *ceph_pool_op_name(int op) {
-  switch (op) {
-  case POOL_OP_CREATE:
-    return "create pool";
-  case POOL_OP_DELETE:
-    return "delete pool";
-  case POOL_OP_AUID_CHANGE:
-    return "change auid";
-  case POOL_OP_CREATE_SNAP:
-    return "create snap";
-  case POOL_OP_DELETE_SNAP:
-    return "delete snap";
-  case POOL_OP_CREATE_UNMANAGED_SNAP:
-    return "create unmanaged snap";
-  case POOL_OP_DELETE_UNMANAGED_SNAP:
-    return "delete unmanaged snap";
-  default:
-    return "unknown";
-  }
+const char *ceph_pool_op_name(int op)
+{
+	switch (op) {
+	case POOL_OP_CREATE: return "create";
+	case POOL_OP_DELETE: return "delete";
+	case POOL_OP_AUID_CHANGE: return "auid change";
+	case POOL_OP_CREATE_SNAP: return "create snap";
+	case POOL_OP_DELETE_SNAP: return "delete snap";
+	case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
+	case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
+	}
+	return "???";
 }

From 23804d91f112df09b832cd091b71af4dc2831aa8 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 14 May 2010 13:06:30 -0700
Subject: [PATCH 55/59] ceph: specify max_bytes on readdir replies

Specify max bytes in request to bound size of reply.  Add associated
mount option with default value of 512 KB.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h | 1 +
 fs/ceph/dir.c     | 2 ++
 fs/ceph/super.c   | 8 ++++++++
 fs/ceph/super.h   | 4 +++-
 4 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 715e39ce517a..3b9eeed097b3 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -361,6 +361,7 @@ union ceph_mds_request_args {
 	struct {
 		__le32 frag;                 /* which dir fragment */
 		__le32 max_entries;          /* how many dentries to grab */
+		__le32 max_bytes;
 	} __attribute__ ((packed)) readdir;
 	struct {
 		__le32 mode;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index d3bb8132a1aa..4fd30900eff7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -233,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	u32 ftype;
 	struct ceph_mds_reply_info_parsed *rinfo;
 	const int max_entries = client->mount_args->max_readdir;
+	const int max_bytes = client->mount_args->max_readdir_bytes;
 
 	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
 	if (fi->at_end)
@@ -316,6 +317,7 @@ more:
 		req->r_readdir_offset = fi->next_offset;
 		req->r_args.readdir.frag = cpu_to_le32(frag);
 		req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
+		req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
 		req->r_num_caps = max_entries + 1;
 		err = ceph_mdsc_do_request(mdsc, NULL, req);
 		if (err < 0) {
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7f5b20dc4945..bac13898b943 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -190,6 +190,8 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
 			   args->cap_release_safety);
 	if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
 		seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
+	if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+		seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
 	if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
 		seq_printf(m, ",snapdirname=%s", args->snapdir_name);
 	if (args->name)
@@ -333,6 +335,7 @@ enum {
 	Opt_caps_wanted_delay_max,
 	Opt_cap_release_safety,
 	Opt_readdir_max_entries,
+	Opt_readdir_max_bytes,
 	Opt_congestion_kb,
 	Opt_last_int,
 	/* int args above */
@@ -365,6 +368,7 @@ static match_table_t arg_tokens = {
 	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
 	{Opt_cap_release_safety, "cap_release_safety=%d"},
 	{Opt_readdir_max_entries, "readdir_max_entries=%d"},
+	{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
 	{Opt_congestion_kb, "write_congestion_kb=%d"},
 	/* int args above */
 	{Opt_snapdirname, "snapdirname=%s"},
@@ -415,6 +419,7 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
 	args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
 	args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
 	args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+	args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
 	args->congestion_kb = default_congestion_kb();
 
 	/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
@@ -522,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
 		case Opt_readdir_max_entries:
 			args->max_readdir = intval;
 			break;
+		case Opt_readdir_max_bytes:
+			args->max_readdir_bytes = intval;
+			break;
 		case Opt_congestion_kb:
 			args->congestion_kb = intval;
 			break;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 395adc5fcebb..3725c9ee9d08 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -66,7 +66,8 @@ struct ceph_mount_args {
 	int congestion_kb;    /* max writeback in flight */
 	int caps_wanted_delay_min, caps_wanted_delay_max;
 	int cap_release_safety;
-	int max_readdir;      /* max readdir size */
+	int max_readdir;       /* max readdir result (entires) */
+	int max_readdir_bytes; /* max readdir result (bytes) */
 	char *snapdir_name;   /* default ".snap" */
 	char *name;
 	char *secret;
@@ -81,6 +82,7 @@ struct ceph_mount_args {
 #define CEPH_OSD_IDLE_TTL_DEFAULT    60
 #define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
 #define CEPH_MAX_READDIR_DEFAULT    1024
+#define CEPH_MAX_READDIR_BYTES_DEFAULT    (512*1024)
 
 #define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
 #define CEPH_MSG_MAX_DATA_LEN	(16*1024*1024)

From 34d23762d988b7dcb08390ac72a353df3d60193c Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Tue, 6 Apr 2010 14:33:58 -0700
Subject: [PATCH 56/59] ceph: all allocation functions should get gfp_mask

This is essential, as for the rados block device we'll need
to run in different contexts that would need flags that
are other than GFP_NOFS.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c       |  2 +-
 fs/ceph/file.c       | 10 +++++-----
 fs/ceph/mds_client.c | 11 ++++++-----
 fs/ceph/messenger.c  | 10 +++++-----
 fs/ceph/messenger.h  |  2 +-
 fs/ceph/mon_client.c | 15 ++++++++-------
 fs/ceph/msgpool.c    |  4 ++--
 fs/ceph/osd_client.c |  8 ++++----
 8 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 0e85d3c80790..0dd0b81e64f7 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -938,7 +938,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
 	     seq, issue_seq, mseq, follows, size, max_size,
 	     xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc));
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
 	if (!msg)
 		return -ENOMEM;
 
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index b0426090e8c3..0611ec3698aa 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
 /*
  * allocate a vector new pages
  */
-static struct page **alloc_page_vector(int num_pages)
+struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
 {
 	struct page **pages;
 	int i;
 
-	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+	pages = kmalloc(sizeof(*pages) * num_pages, flags);
 	if (!pages)
 		return ERR_PTR(-ENOMEM);
 	for (i = 0; i < num_pages; i++) {
-		pages[i] = __page_cache_alloc(GFP_NOFS);
+		pages[i] = __page_cache_alloc(flags);
 		if (pages[i] == NULL) {
 			ceph_release_page_vector(pages, i);
 			return ERR_PTR(-ENOMEM);
@@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
 		 * in sequence.
 		 */
 	} else {
-		pages = alloc_page_vector(num_pages);
+		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
 	}
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
@@ -668,7 +668,7 @@ more:
 		truncate_inode_pages_range(inode->i_mapping, pos, 
 					   (pos+len) | (PAGE_CACHE_SIZE-1));
 	} else {
-		pages = alloc_page_vector(num_pages);
+		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
 		if (IS_ERR(pages)) {
 			ret = PTR_ERR(pages);
 			goto out;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 5c17ab44d02d..1b786cf2c7af 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -665,7 +665,7 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
 	struct ceph_msg *msg;
 	struct ceph_mds_session_head *h;
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h));
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
 	if (!msg) {
 		pr_err("create_session_msg ENOMEM creating msg\n");
 		return NULL;
@@ -1089,7 +1089,8 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
 
 	while (session->s_num_cap_releases < session->s_nr_caps + extra) {
 		spin_unlock(&session->s_cap_lock);
-		msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE);
+		msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
+				   GFP_NOFS);
 		if (!msg)
 			goto out_unlocked;
 		dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1492,7 +1493,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 	if (req->r_old_dentry_drop)
 		len += req->r_old_dentry->d_name.len;
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len);
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
 	if (!msg) {
 		msg = ERR_PTR(-ENOMEM);
 		goto out_free2;
@@ -2244,7 +2245,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 		goto fail_nopagelist;
 	ceph_pagelist_init(pagelist);
 
-	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0);
+	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
 	if (!reply)
 		goto fail_nomsg;
 
@@ -2535,7 +2536,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
 	dnamelen = dentry->d_name.len;
 	len += dnamelen;
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len);
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
 	if (!msg)
 		return;
 	lease = msg->front.iov_base;
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 50c5f24086fd..60b74839ebec 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -2070,11 +2070,11 @@ void ceph_con_keepalive(struct ceph_connection *con)
  * construct a new message with given type, size
  * the new msg has a ref count of 1.
  */
-struct ceph_msg *ceph_msg_new(int type, int front_len)
+struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
 {
 	struct ceph_msg *m;
 
-	m = kmalloc(sizeof(*m), GFP_NOFS);
+	m = kmalloc(sizeof(*m), flags);
 	if (m == NULL)
 		goto out;
 	kref_init(&m->kref);
@@ -2101,11 +2101,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len)
 	/* front */
 	if (front_len) {
 		if (front_len > PAGE_CACHE_SIZE) {
-			m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
+			m->front.iov_base = __vmalloc(front_len, flags,
 						      PAGE_KERNEL);
 			m->front_is_vmalloc = true;
 		} else {
-			m->front.iov_base = kmalloc(front_len, GFP_NOFS);
+			m->front.iov_base = kmalloc(front_len, flags);
 		}
 		if (m->front.iov_base == NULL) {
 			pr_err("msg_new can't allocate %d bytes\n",
@@ -2180,7 +2180,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
 	}
 	if (!msg) {
 		*skip = 0;
-		msg = ceph_msg_new(type, front_len);
+		msg = ceph_msg_new(type, front_len, GFP_NOFS);
 		if (!msg) {
 			pr_err("unable to allocate msg type %d len %d\n",
 			       type, front_len);
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 889f81f093c9..00a9430b1ffc 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -232,7 +232,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
 extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
 extern void ceph_con_put(struct ceph_connection *con);
 
-extern struct ceph_msg *ceph_msg_new(int type, int front_len);
+extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
 extern void ceph_msg_kfree(struct ceph_msg *m);
 
 
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 7062b889ab2f..61cb11701f10 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -191,7 +191,7 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 		struct ceph_mon_subscribe_item *i;
 		void *p, *end;
 
-		msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96);
+		msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
 		if (!msg)
 			return;
 
@@ -490,10 +490,10 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 	init_completion(&req->completion);
 
 	err = -ENOMEM;
-	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h));
+	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
 	if (!req->request)
 		goto out;
-	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024);
+	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
 	if (!req->reply)
 		goto out;
 
@@ -632,15 +632,16 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	/* msg pools */
 	err = -ENOMEM;
 	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
-				     sizeof(struct ceph_mon_subscribe_ack));
+				     sizeof(struct ceph_mon_subscribe_ack),
+				     GFP_NOFS);
 	if (!monc->m_subscribe_ack)
 		goto out_monmap;
 
-	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096);
+	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
 	if (!monc->m_auth_reply)
 		goto out_subscribe_ack;
 
-	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096);
+	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
 	monc->pending_auth = 0;
 	if (!monc->m_auth)
 		goto out_auth_reply;
@@ -815,7 +816,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 	case CEPH_MSG_MON_MAP:
 	case CEPH_MSG_MDS_MAP:
 	case CEPH_MSG_OSD_MAP:
-		m = ceph_msg_new(type, front_len);
+		m = ceph_msg_new(type, front_len, GFP_NOFS);
 		break;
 	}
 
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index 50fe5cc5de76..dd65a6438131 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -12,7 +12,7 @@ static void *alloc_fn(gfp_t gfp_mask, void *arg)
 	struct ceph_msgpool *pool = arg;
 	void *p;
 
-	p = ceph_msg_new(0, pool->front_len);
+	p = ceph_msg_new(0, pool->front_len, gfp_mask);
 	if (!p)
 		pr_err("msgpool %s alloc failed\n", pool->name);
 	return p;
@@ -48,7 +48,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
 		WARN_ON(1);
 
 		/* try to alloc a fresh message */
-		return ceph_msg_new(0, front_len);
+		return ceph_msg_new(0, front_len, GFP_NOFS);
 	}
 
 	return mempool_alloc(pool->pool, GFP_NOFS);
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index c0aca732efd3..16141e6e8b73 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -164,7 +164,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
 	else
 		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
-				   OSD_OPREPLY_FRONT_LEN);
+				   OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
 	if (!msg) {
 		ceph_osdc_put_request(req);
 		return NULL;
@@ -178,7 +178,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	if (use_mempool)
 		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
 	else
-		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size);
+		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
 	if (!msg) {
 		ceph_osdc_put_request(req);
 		return NULL;
@@ -1395,7 +1395,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	if (front > req->r_reply->front.iov_len) {
 		pr_warning("get_reply front %d > preallocated %d\n",
 			   front, (int)req->r_reply->front.iov_len);
-		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front);
+		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
 		if (!m)
 			goto out;
 		ceph_msg_put(req->r_reply);
@@ -1438,7 +1438,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 
 	switch (type) {
 	case CEPH_MSG_OSD_MAP:
-		return ceph_msg_new(type, front);
+		return ceph_msg_new(type, front, GFP_NOFS);
 	case CEPH_MSG_OSD_OPREPLY:
 		return get_reply(con, hdr, skip);
 	default:

From 9e32789f63fc5ad91c8b10f68ec23a86856d5af5 Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Thu, 20 May 2010 10:40:19 +0200
Subject: [PATCH 57/59] ceph: Storage class should be before const qualifier

The C99 specification states in section 6.11.5:

The placement of a storage-class specifier other than at the beginning
of the declaration specifiers in a declaration is an obsolescent
feature.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 4 ++--
 fs/ceph/mon_client.c | 4 ++--
 fs/ceph/osd_client.c | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1b786cf2c7af..885aa5710cfd 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -40,7 +40,7 @@
 static void __wake_requests(struct ceph_mds_client *mdsc,
 			    struct list_head *head);
 
-const static struct ceph_connection_operations mds_con_ops;
+static const struct ceph_connection_operations mds_con_ops;
 
 
 /*
@@ -3120,7 +3120,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
 	return ceph_monc_validate_auth(&mdsc->client->monc);
 }
 
-const static struct ceph_connection_operations mds_con_ops = {
+static const struct ceph_connection_operations mds_con_ops = {
 	.get = con_get,
 	.put = con_put,
 	.dispatch = dispatch,
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 61cb11701f10..478729a6ffce 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -28,7 +28,7 @@
  * resend any outstanding requests.
  */
 
-const static struct ceph_connection_operations mon_con_ops;
+static const struct ceph_connection_operations mon_con_ops;
 
 static int __validate_auth(struct ceph_mon_client *monc);
 
@@ -861,7 +861,7 @@ out:
 	mutex_unlock(&monc->mutex);
 }
 
-const static struct ceph_connection_operations mon_con_ops = {
+static const struct ceph_connection_operations mon_con_ops = {
 	.get = ceph_con_get,
 	.put = ceph_con_put,
 	.dispatch = dispatch,
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 16141e6e8b73..afa7bb3895c4 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -16,7 +16,7 @@
 #define OSD_OP_FRONT_LEN	4096
 #define OSD_OPREPLY_FRONT_LEN	512
 
-const static struct ceph_connection_operations osd_con_ops;
+static const struct ceph_connection_operations osd_con_ops;
 static int __kick_requests(struct ceph_osd_client *osdc,
 			  struct ceph_osd *kickosd);
 
@@ -1524,7 +1524,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
 	return ceph_monc_validate_auth(&osdc->client->monc);
 }
 
-const static struct ceph_connection_operations osd_con_ops = {
+static const struct ceph_connection_operations osd_con_ops = {
 	.get = get_osd_con,
 	.put = put_osd_con,
 	.dispatch = dispatch,

From 970690012c572fc3b7be532080564b730f6a9c02 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 21 May 2010 12:31:49 -0700
Subject: [PATCH 58/59] ceph: avoid resending queued message to monitor

The auth_reply handler will (re)send any pending requests.  For the
initial mon authenticate phase, that's correct, but when a auth ticket
renewal races with an in-flight request, we may resend a request message
that is already in flight.  Avoid this by revoking the message before
sending it.

We should also avoid resending requests at all during ticket renewal; that
will come soon.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mon_client.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 478729a6ffce..12d94f24ee97 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
 	monc->pending_auth = 1;
 	monc->m_auth->front.iov_len = len;
 	monc->m_auth->hdr.front_len = cpu_to_le32(len);
+	ceph_con_revoke(monc->con, monc->m_auth);
 	ceph_msg_get(monc->m_auth);  /* keep our ref */
 	ceph_con_send(monc->con, monc->m_auth);
 }
@@ -539,6 +540,7 @@ static void __resend_generic_request(struct ceph_mon_client *monc)
 
 	for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
 		req = rb_entry(p, struct ceph_mon_generic_request, node);
+		ceph_con_revoke(monc->con, req->request);
 		ceph_con_send(monc->con, ceph_msg_get(req->request));
 	}
 }

From 240ed68eb567d80dd6bab739341999a5ab0ad55d Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 21 May 2010 14:57:25 -0700
Subject: [PATCH 59/59] ceph: reuse mon subscribe message instead of allocated
 anew

Use the same message, allocated during startup.  No need to reallocate a
new one each time around (and potentially ENOMEM).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mon_client.c | 22 +++++++++++++---------
 fs/ceph/mon_client.h |  2 +-
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 12d94f24ee97..f6510a476e7e 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -188,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 	     monc->want_next_osdmap);
 	if ((__sub_expired(monc) && !monc->sub_sent) ||
 	    monc->want_next_osdmap == 1) {
-		struct ceph_msg *msg;
+		struct ceph_msg *msg = monc->m_subscribe;
 		struct ceph_mon_subscribe_item *i;
 		void *p, *end;
 
-		msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
-		if (!msg)
-			return;
-
 		p = msg->front.iov_base;
-		end = p + msg->front.iov_len;
+		end = p + msg->front_max;
 
 		dout("__send_subscribe to 'mdsmap' %u+\n",
 		     (unsigned)monc->have_mdsmap);
@@ -227,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 
 		msg->front.iov_len = p - msg->front.iov_base;
 		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-		ceph_con_send(monc->con, msg);
+		ceph_con_revoke(monc->con, msg);
+		ceph_con_send(monc->con, ceph_msg_get(msg));
 
 		monc->sub_sent = jiffies | 1;  /* never 0 */
 	}
@@ -631,7 +628,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
 		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
 
-	/* msg pools */
+	/* msgs */
 	err = -ENOMEM;
 	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
 				     sizeof(struct ceph_mon_subscribe_ack),
@@ -639,9 +636,13 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	if (!monc->m_subscribe_ack)
 		goto out_monmap;
 
+	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
+	if (!monc->m_subscribe)
+		goto out_subscribe_ack;
+
 	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
 	if (!monc->m_auth_reply)
-		goto out_subscribe_ack;
+		goto out_subscribe;
 
 	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
 	monc->pending_auth = 0;
@@ -665,6 +666,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 
 out_auth_reply:
 	ceph_msg_put(monc->m_auth_reply);
+out_subscribe:
+	ceph_msg_put(monc->m_subscribe);
 out_subscribe_ack:
 	ceph_msg_put(monc->m_subscribe_ack);
 out_monmap:
@@ -691,6 +694,7 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 
 	ceph_msg_put(monc->m_auth);
 	ceph_msg_put(monc->m_auth_reply);
+	ceph_msg_put(monc->m_subscribe);
 	ceph_msg_put(monc->m_subscribe_ack);
 
 	kfree(monc->monmap);
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index 76887785a4d7..174d794321d0 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -63,7 +63,7 @@ struct ceph_mon_client {
 	struct delayed_work delayed_work;
 
 	struct ceph_auth_client *auth;
-	struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe_ack;
+	struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
 	int pending_auth;
 
 	bool hunting;