From 28c0254ede13ab575d2df5c6585ed3d4817c3e6b Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Mon, 28 May 2012 14:44:30 +0800
Subject: [PATCH 001/132] ceph: check PG_Private flag before accessing
 page->private

I got lots of NULL pointer dereference Oops when compiling kernel on ceph.
The bug is because the kernel page migration routine replaces some pages
in the page cache with new pages, these new pages' private can be non-zero.

Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/addr.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 173b1d22e59b..8b67304e4b80 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -54,7 +54,12 @@
 	(CONGESTION_ON_THRESH(congestion_kb) -				\
 	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
 
-
+static inline struct ceph_snap_context *page_snap_context(struct page *page)
+{
+	if (PagePrivate(page))
+		return (void *)page->private;
+	return NULL;
+}
 
 /*
  * Dirty a page.  Optimistically adjust accounting, on the assumption
@@ -142,10 +147,9 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
 {
 	struct inode *inode;
 	struct ceph_inode_info *ci;
-	struct ceph_snap_context *snapc = (void *)page->private;
+	struct ceph_snap_context *snapc = page_snap_context(page);
 
 	BUG_ON(!PageLocked(page));
-	BUG_ON(!page->private);
 	BUG_ON(!PagePrivate(page));
 	BUG_ON(!page->mapping);
 
@@ -182,7 +186,6 @@ static int ceph_releasepage(struct page *page, gfp_t g)
 	struct inode *inode = page->mapping ? page->mapping->host : NULL;
 	dout("%p releasepage %p idx %lu\n", inode, page, page->index);
 	WARN_ON(PageDirty(page));
-	WARN_ON(page->private);
 	WARN_ON(PagePrivate(page));
 	return 0;
 }
@@ -443,7 +446,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	osdc = &fsc->client->osdc;
 
 	/* verify this is a writeable snap context */
-	snapc = (void *)page->private;
+	snapc = page_snap_context(page);
 	if (snapc == NULL) {
 		dout("writepage %p page %p not dirty?\n", inode, page);
 		goto out;
@@ -451,7 +454,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	oldest = get_oldest_context(inode, &snap_size);
 	if (snapc->seq > oldest->seq) {
 		dout("writepage %p page %p snapc %p not writeable - noop\n",
-		     inode, page, (void *)page->private);
+		     inode, page, snapc);
 		/* we should only noop if called by kswapd */
 		WARN_ON((current->flags & PF_MEMALLOC) == 0);
 		ceph_put_snap_context(oldest);
@@ -591,7 +594,7 @@ static void writepages_finish(struct ceph_osd_request *req,
 			clear_bdi_congested(&fsc->backing_dev_info,
 					    BLK_RW_ASYNC);
 
-		ceph_put_snap_context((void *)page->private);
+		ceph_put_snap_context(page_snap_context(page));
 		page->private = 0;
 		ClearPagePrivate(page);
 		dout("unlocking %d %p\n", i, page);
@@ -795,7 +798,7 @@ get_more_pages:
 			}
 
 			/* only if matching snap context */
-			pgsnapc = (void *)page->private;
+			pgsnapc = page_snap_context(page);
 			if (pgsnapc->seq > snapc->seq) {
 				dout("page snapc %p %lld > oldest %p %lld\n",
 				     pgsnapc, pgsnapc->seq, snapc, snapc->seq);
@@ -984,7 +987,7 @@ retry_locked:
 	BUG_ON(!ci->i_snap_realm);
 	down_read(&mdsc->snap_rwsem);
 	BUG_ON(!ci->i_snap_realm->cached_context);
-	snapc = (void *)page->private;
+	snapc = page_snap_context(page);
 	if (snapc && snapc != ci->i_head_snapc) {
 		/*
 		 * this page is already dirty in another (older) snap

From e5e372da9a469dfe3ece40277090a7056c566838 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 22 May 2012 11:41:43 -0500
Subject: [PATCH 002/132] libceph: eliminate connection state "DEAD"

The ceph connection state "DEAD" is never set and is therefore not
needed.  Eliminate it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
---
 include/linux/ceph/messenger.h | 1 -
 net/ceph/messenger.c           | 6 ------
 2 files changed, 7 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 2521a95fa6d9..aa506cadea67 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -119,7 +119,6 @@ struct ceph_msg_pos {
 #define CLOSED		10 /* we've closed the connection */
 #define SOCK_CLOSED	11 /* socket state changed to closed */
 #define OPENING         13 /* open connection w/ (possibly new) peer */
-#define DEAD            14 /* dead, about to kfree */
 #define BACKOFF         15
 
 /*
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 1a80907282cc..42ca8aab6dcf 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2087,12 +2087,6 @@ bad_tag:
  */
 static void queue_con(struct ceph_connection *con)
 {
-	if (test_bit(DEAD, &con->state)) {
-		dout("queue_con %p ignoring: DEAD\n",
-		     con);
-		return;
-	}
-
 	if (!con->ops->get(con)) {
 		dout("queue_con %p ref count 0\n", con);
 		return;

From 6384bb8b8e88a9c6bf2ae0d9517c2c0199177c34 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 29 May 2012 21:47:38 -0500
Subject: [PATCH 003/132] libceph: kill bad_proto ceph connection op

No code sets a bad_proto method in its ceph connection operations
vector, so just get rid of it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
---
 include/linux/ceph/messenger.h | 3 ---
 net/ceph/messenger.c           | 5 -----
 2 files changed, 8 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index aa506cadea67..74f6c9bd8074 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -31,9 +31,6 @@ struct ceph_connection_operations {
 	int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
 	int (*invalidate_authorizer)(struct ceph_connection *con);
 
-	/* protocol version mismatch */
-	void (*bad_proto) (struct ceph_connection *con);
-
 	/* there was some error on the socket (disconnect, whatever) */
 	void (*fault) (struct ceph_connection *con);
 
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 42ca8aab6dcf..07af9948e3f7 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1356,11 +1356,6 @@ static void fail_protocol(struct ceph_connection *con)
 {
 	reset_connection(con);
 	set_bit(CLOSED, &con->state);  /* in case there's queued work */
-
-	mutex_unlock(&con->mutex);
-	if (con->ops->bad_proto)
-		con->ops->bad_proto(con);
-	mutex_lock(&con->mutex);
 }
 
 static int process_connect(struct ceph_connection *con)

From 327800bdc2cb9b71f4b458ca07aa9d522668dde0 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 22 May 2012 11:41:43 -0500
Subject: [PATCH 004/132] libceph: rename socket callbacks

Change the names of the three socket callback functions to make it
more obvious they're specifically associated with a connection's
socket (not the ceph connection that uses it).

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 07af9948e3f7..545255823de2 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -153,46 +153,46 @@ EXPORT_SYMBOL(ceph_msgr_flush);
  */
 
 /* data available on socket, or listen socket received a connect */
-static void ceph_data_ready(struct sock *sk, int count_unused)
+static void ceph_sock_data_ready(struct sock *sk, int count_unused)
 {
 	struct ceph_connection *con = sk->sk_user_data;
 
 	if (sk->sk_state != TCP_CLOSE_WAIT) {
-		dout("ceph_data_ready on %p state = %lu, queueing work\n",
+		dout("%s on %p state = %lu, queueing work\n", __func__,
 		     con, con->state);
 		queue_con(con);
 	}
 }
 
 /* socket has buffer space for writing */
-static void ceph_write_space(struct sock *sk)
+static void ceph_sock_write_space(struct sock *sk)
 {
 	struct ceph_connection *con = sk->sk_user_data;
 
 	/* only queue to workqueue if there is data we want to write,
 	 * and there is sufficient space in the socket buffer to accept
-	 * more data.  clear SOCK_NOSPACE so that ceph_write_space()
+	 * more data.  clear SOCK_NOSPACE so that ceph_sock_write_space()
 	 * doesn't get called again until try_write() fills the socket
 	 * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
 	 * and net/core/stream.c:sk_stream_write_space().
 	 */
 	if (test_bit(WRITE_PENDING, &con->state)) {
 		if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
-			dout("ceph_write_space %p queueing write work\n", con);
+			dout("%s %p queueing write work\n", __func__, con);
 			clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 			queue_con(con);
 		}
 	} else {
-		dout("ceph_write_space %p nothing to write\n", con);
+		dout("%s %p nothing to write\n", __func__, con);
 	}
 }
 
 /* socket's state has changed */
-static void ceph_state_change(struct sock *sk)
+static void ceph_sock_state_change(struct sock *sk)
 {
 	struct ceph_connection *con = sk->sk_user_data;
 
-	dout("ceph_state_change %p state = %lu sk_state = %u\n",
+	dout("%s %p state = %lu sk_state = %u\n", __func__,
 	     con, con->state, sk->sk_state);
 
 	if (test_bit(CLOSED, &con->state))
@@ -200,9 +200,9 @@ static void ceph_state_change(struct sock *sk)
 
 	switch (sk->sk_state) {
 	case TCP_CLOSE:
-		dout("ceph_state_change TCP_CLOSE\n");
+		dout("%s TCP_CLOSE\n", __func__);
 	case TCP_CLOSE_WAIT:
-		dout("ceph_state_change TCP_CLOSE_WAIT\n");
+		dout("%s TCP_CLOSE_WAIT\n", __func__);
 		if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
 			if (test_bit(CONNECTING, &con->state))
 				con->error_msg = "connection failed";
@@ -212,7 +212,7 @@ static void ceph_state_change(struct sock *sk)
 		}
 		break;
 	case TCP_ESTABLISHED:
-		dout("ceph_state_change TCP_ESTABLISHED\n");
+		dout("%s TCP_ESTABLISHED\n", __func__);
 		queue_con(con);
 		break;
 	default:	/* Everything else is uninteresting */
@@ -228,9 +228,9 @@ static void set_sock_callbacks(struct socket *sock,
 {
 	struct sock *sk = sock->sk;
 	sk->sk_user_data = con;
-	sk->sk_data_ready = ceph_data_ready;
-	sk->sk_write_space = ceph_write_space;
-	sk->sk_state_change = ceph_state_change;
+	sk->sk_data_ready = ceph_sock_data_ready;
+	sk->sk_write_space = ceph_sock_write_space;
+	sk->sk_state_change = ceph_sock_state_change;
 }
 
 
From e22004235a900213625acd6583ac913d5a30c155 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 23 May 2012 14:35:23 -0500
Subject: [PATCH 005/132] libceph: rename kvec_reset and kvec_add functions

The functions ceph_con_out_kvec_reset() and ceph_con_out_kvec_add()
are entirely private functions, so drop the "ceph_" prefix in their
name to make them slightly more wieldy.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 48 ++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 545255823de2..2ca491fc50e2 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -486,14 +486,14 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
 	return ret;
 }
 
-static void ceph_con_out_kvec_reset(struct ceph_connection *con)
+static void con_out_kvec_reset(struct ceph_connection *con)
 {
 	con->out_kvec_left = 0;
 	con->out_kvec_bytes = 0;
 	con->out_kvec_cur = &con->out_kvec[0];
 }
 
-static void ceph_con_out_kvec_add(struct ceph_connection *con,
+static void con_out_kvec_add(struct ceph_connection *con,
 				size_t size, void *data)
 {
 	int index;
@@ -534,7 +534,7 @@ static void prepare_write_message(struct ceph_connection *con)
 	struct ceph_msg *m;
 	u32 crc;
 
-	ceph_con_out_kvec_reset(con);
+	con_out_kvec_reset(con);
 	con->out_kvec_is_msg = true;
 	con->out_msg_done = false;
 
@@ -542,9 +542,9 @@ static void prepare_write_message(struct ceph_connection *con)
 	 * TCP packet that's a good thing. */
 	if (con->in_seq > con->in_seq_acked) {
 		con->in_seq_acked = con->in_seq;
-		ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+		con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
 		con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-		ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack),
+		con_out_kvec_add(con, sizeof (con->out_temp_ack),
 			&con->out_temp_ack);
 	}
 
@@ -572,12 +572,12 @@ static void prepare_write_message(struct ceph_connection *con)
 	BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
 
 	/* tag + hdr + front + middle */
-	ceph_con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
-	ceph_con_out_kvec_add(con, sizeof (m->hdr), &m->hdr);
-	ceph_con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
+	con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
+	con_out_kvec_add(con, sizeof (m->hdr), &m->hdr);
+	con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
 
 	if (m->middle)
-		ceph_con_out_kvec_add(con, m->middle->vec.iov_len,
+		con_out_kvec_add(con, m->middle->vec.iov_len,
 			m->middle->vec.iov_base);
 
 	/* fill in crc (except data pages), footer */
@@ -626,12 +626,12 @@ static void prepare_write_ack(struct ceph_connection *con)
 	     con->in_seq_acked, con->in_seq);
 	con->in_seq_acked = con->in_seq;
 
-	ceph_con_out_kvec_reset(con);
+	con_out_kvec_reset(con);
 
-	ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+	con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
 
 	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-	ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack),
+	con_out_kvec_add(con, sizeof (con->out_temp_ack),
 				&con->out_temp_ack);
 
 	con->out_more = 1;  /* more will follow.. eventually.. */
@@ -644,8 +644,8 @@ static void prepare_write_ack(struct ceph_connection *con)
 static void prepare_write_keepalive(struct ceph_connection *con)
 {
 	dout("prepare_write_keepalive %p\n", con);
-	ceph_con_out_kvec_reset(con);
-	ceph_con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
+	con_out_kvec_reset(con);
+	con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
 	set_bit(WRITE_PENDING, &con->state);
 }
 
@@ -690,8 +690,8 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection
  */
 static void prepare_write_banner(struct ceph_connection *con)
 {
-	ceph_con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
-	ceph_con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
+	con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
+	con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
 					&con->msgr->my_enc_addr);
 
 	con->out_more = 0;
@@ -738,10 +738,10 @@ static int prepare_write_connect(struct ceph_connection *con)
 	con->out_connect.authorizer_len = auth ?
 		cpu_to_le32(auth->authorizer_buf_len) : 0;
 
-	ceph_con_out_kvec_add(con, sizeof (con->out_connect),
+	con_out_kvec_add(con, sizeof (con->out_connect),
 					&con->out_connect);
 	if (auth && auth->authorizer_buf_len)
-		ceph_con_out_kvec_add(con, auth->authorizer_buf_len,
+		con_out_kvec_add(con, auth->authorizer_buf_len,
 					auth->authorizer_buf);
 
 	con->out_more = 0;
@@ -935,7 +935,7 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 	/* prepare and queue up footer, too */
 	if (!do_datacrc)
 		con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
-	ceph_con_out_kvec_reset(con);
+	con_out_kvec_reset(con);
 	prepare_write_message_footer(con);
 	ret = 1;
 out:
@@ -1398,7 +1398,7 @@ static int process_connect(struct ceph_connection *con)
 			return -1;
 		}
 		con->auth_retry = 1;
-		ceph_con_out_kvec_reset(con);
+		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
 			return ret;
@@ -1419,7 +1419,7 @@ static int process_connect(struct ceph_connection *con)
 		       ENTITY_NAME(con->peer_name),
 		       ceph_pr_addr(&con->peer_addr.in_addr));
 		reset_connection(con);
-		ceph_con_out_kvec_reset(con);
+		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
 			return ret;
@@ -1445,7 +1445,7 @@ static int process_connect(struct ceph_connection *con)
 		     le32_to_cpu(con->out_connect.connect_seq),
 		     le32_to_cpu(con->in_connect.connect_seq));
 		con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
-		ceph_con_out_kvec_reset(con);
+		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
 			return ret;
@@ -1462,7 +1462,7 @@ static int process_connect(struct ceph_connection *con)
 		     le32_to_cpu(con->in_connect.global_seq));
 		get_global_seq(con->msgr,
 			       le32_to_cpu(con->in_connect.global_seq));
-		ceph_con_out_kvec_reset(con);
+		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
 			return ret;
@@ -1869,7 +1869,7 @@ more:
 
 	/* open the socket first? */
 	if (con->sock == NULL) {
-		ceph_con_out_kvec_reset(con);
+		con_out_kvec_reset(con);
 		prepare_write_banner(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)

From 15d9882c336db2db73ccf9871ae2398e452f694c Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Sat, 26 May 2012 23:26:43 -0500
Subject: [PATCH 006/132] libceph: embed ceph messenger structure in
 ceph_client

A ceph client has a pointer to a ceph messenger structure in it.
There is always exactly one ceph messenger for a ceph client, so
there is no need to allocate it separate from the ceph client
structure.

Switch the ceph_client structure to embed its ceph_messenger
structure.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/mds_client.c           |  2 +-
 include/linux/ceph/libceph.h   |  2 +-
 include/linux/ceph/messenger.h |  9 +++++----
 net/ceph/ceph_common.c         | 18 +++++-------------
 net/ceph/messenger.c           | 30 +++++++++---------------------
 net/ceph/mon_client.c          |  6 +++---
 net/ceph/osd_client.c          |  4 ++--
 7 files changed, 26 insertions(+), 45 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 200bc87eceb1..ad30261cd4c0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -394,7 +394,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	s->s_seq = 0;
 	mutex_init(&s->s_mutex);
 
-	ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
+	ceph_con_init(&mdsc->fsc->client->msgr, &s->s_con);
 	s->s_con.private = s;
 	s->s_con.ops = &mds_con_ops;
 	s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 92eef7c3d3c5..927361c4b0a8 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -131,7 +131,7 @@ struct ceph_client {
 	u32 supported_features;
 	u32 required_features;
 
-	struct ceph_messenger *msgr;   /* messenger instance */
+	struct ceph_messenger msgr;   /* messenger instance */
 	struct ceph_mon_client monc;
 	struct ceph_osd_client osdc;
 
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 74f6c9bd8074..3fbd4be804ed 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -211,10 +211,11 @@ extern int ceph_msgr_init(void);
 extern void ceph_msgr_exit(void);
 extern void ceph_msgr_flush(void);
 
-extern struct ceph_messenger *ceph_messenger_create(
-	struct ceph_entity_addr *myaddr,
-	u32 features, u32 required);
-extern void ceph_messenger_destroy(struct ceph_messenger *);
+extern void ceph_messenger_init(struct ceph_messenger *msgr,
+			struct ceph_entity_addr *myaddr,
+			u32 supported_features,
+			u32 required_features,
+			bool nocrc);
 
 extern void ceph_con_init(struct ceph_messenger *msgr,
 			  struct ceph_connection *con);
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index cc913193d992..2de3ea1bbd64 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -468,19 +468,15 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
 	/* msgr */
 	if (ceph_test_opt(client, MYIP))
 		myaddr = &client->options->my_addr;
-	client->msgr = ceph_messenger_create(myaddr,
-					     client->supported_features,
-					     client->required_features);
-	if (IS_ERR(client->msgr)) {
-		err = PTR_ERR(client->msgr);
-		goto fail;
-	}
-	client->msgr->nocrc = ceph_test_opt(client, NOCRC);
+	ceph_messenger_init(&client->msgr, myaddr,
+		client->supported_features,
+		client->required_features,
+		ceph_test_opt(client, NOCRC));
 
 	/* subsystems */
 	err = ceph_monc_init(&client->monc, client);
 	if (err < 0)
-		goto fail_msgr;
+		goto fail;
 	err = ceph_osdc_init(&client->osdc, client);
 	if (err < 0)
 		goto fail_monc;
@@ -489,8 +485,6 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
 
 fail_monc:
 	ceph_monc_stop(&client->monc);
-fail_msgr:
-	ceph_messenger_destroy(client->msgr);
 fail:
 	kfree(client);
 	return ERR_PTR(err);
@@ -515,8 +509,6 @@ void ceph_destroy_client(struct ceph_client *client)
 
 	ceph_debugfs_client_cleanup(client);
 
-	ceph_messenger_destroy(client->msgr);
-
 	ceph_destroy_options(client->options);
 
 	kfree(client);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 2ca491fc50e2..d8423a3f6698 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2245,18 +2245,14 @@ out:
 
 
 /*
- * create a new messenger instance
+ * initialize a new messenger instance
  */
-struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
-					     u32 supported_features,
-					     u32 required_features)
+void ceph_messenger_init(struct ceph_messenger *msgr,
+			struct ceph_entity_addr *myaddr,
+			u32 supported_features,
+			u32 required_features,
+			bool nocrc)
 {
-	struct ceph_messenger *msgr;
-
-	msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
-	if (msgr == NULL)
-		return ERR_PTR(-ENOMEM);
-
 	msgr->supported_features = supported_features;
 	msgr->required_features = required_features;
 
@@ -2269,19 +2265,11 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
 	msgr->inst.addr.type = 0;
 	get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
 	encode_my_addr(msgr);
+	msgr->nocrc = nocrc;
 
-	dout("messenger_create %p\n", msgr);
-	return msgr;
+	dout("%s %p\n", __func__, msgr);
 }
-EXPORT_SYMBOL(ceph_messenger_create);
-
-void ceph_messenger_destroy(struct ceph_messenger *msgr)
-{
-	dout("destroy %p\n", msgr);
-	kfree(msgr);
-	dout("destroyed messenger %p\n", msgr);
-}
-EXPORT_SYMBOL(ceph_messenger_destroy);
+EXPORT_SYMBOL(ceph_messenger_init);
 
 static void clear_standby(struct ceph_connection *con)
 {
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 1845cde26227..704dc95dc620 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -763,7 +763,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
 	if (!monc->con)
 		goto out_monmap;
-	ceph_con_init(monc->client->msgr, monc->con);
+	ceph_con_init(&monc->client->msgr, monc->con);
 	monc->con->private = monc;
 	monc->con->ops = &mon_con_ops;
 
@@ -880,8 +880,8 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
 	} else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
 		dout("authenticated, starting session\n");
 
-		monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
-		monc->client->msgr->inst.name.num =
+		monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+		monc->client->msgr.inst.name.num =
 					cpu_to_le64(monc->auth->global_id);
 
 		__send_subscribe(monc);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b098e7b591f0..cca4c7f1c780 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -639,7 +639,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
 	INIT_LIST_HEAD(&osd->o_osd_lru);
 	osd->o_incarnation = 1;
 
-	ceph_con_init(osdc->client->msgr, &osd->o_con);
+	ceph_con_init(&osdc->client->msgr, &osd->o_con);
 	osd->o_con.private = osd;
 	osd->o_con.ops = &osd_con_ops;
 	osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
@@ -1391,7 +1391,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 			     epoch, maplen);
 			newmap = osdmap_apply_incremental(&p, next,
 							  osdc->osdmap,
-							  osdc->client->msgr);
+							  &osdc->client->msgr);
 			if (IS_ERR(newmap)) {
 				err = PTR_ERR(newmap);
 				goto bad;

From 928443cd9644e7cfd46f687dbeffda2d1a357ff9 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 22 May 2012 11:41:43 -0500
Subject: [PATCH 007/132] libceph: start separating connection flags from state

A ceph_connection holds a mixture of connection state (as in "state
machine" state) and connection flags in a single "state" field.  To
make the distinction more clear, define a new "flags" field and use
it rather than the "state" field to hold Boolean flag values.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil<sage@inktank.com>
---
 include/linux/ceph/messenger.h | 18 ++++++++----
 net/ceph/messenger.c           | 50 +++++++++++++++++-----------------
 2 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 3fbd4be804ed..920235e114ad 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -103,20 +103,25 @@ struct ceph_msg_pos {
 #define MAX_DELAY_INTERVAL	(5 * 60 * HZ)
 
 /*
- * ceph_connection state bit flags
+ * ceph_connection flag bits
  */
+
 #define LOSSYTX         0  /* we can close channel or drop messages on errors */
-#define CONNECTING	1
-#define NEGOTIATING	2
 #define KEEPALIVE_PENDING      3
 #define WRITE_PENDING	4  /* we have data ready to send */
+#define SOCK_CLOSED	11 /* socket state changed to closed */
+#define BACKOFF         15
+
+/*
+ * ceph_connection states
+ */
+#define CONNECTING	1
+#define NEGOTIATING	2
 #define STANDBY		8  /* no outgoing messages, socket closed.  we keep
 			    * the ceph_connection around to maintain shared
 			    * state with the peer. */
 #define CLOSED		10 /* we've closed the connection */
-#define SOCK_CLOSED	11 /* socket state changed to closed */
 #define OPENING         13 /* open connection w/ (possibly new) peer */
-#define BACKOFF         15
 
 /*
  * A single connection with another host.
@@ -133,7 +138,8 @@ struct ceph_connection {
 
 	struct ceph_messenger *msgr;
 	struct socket *sock;
-	unsigned long state;	/* connection state (see flags above) */
+	unsigned long flags;
+	unsigned long state;
 	const char *error_msg;  /* error message, if any */
 
 	struct ceph_entity_addr peer_addr; /* peer address */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index d8423a3f6698..e84e4fd86bb7 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -176,7 +176,7 @@ static void ceph_sock_write_space(struct sock *sk)
 	 * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
 	 * and net/core/stream.c:sk_stream_write_space().
 	 */
-	if (test_bit(WRITE_PENDING, &con->state)) {
+	if (test_bit(WRITE_PENDING, &con->flags)) {
 		if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
 			dout("%s %p queueing write work\n", __func__, con);
 			clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
@@ -203,7 +203,7 @@ static void ceph_sock_state_change(struct sock *sk)
 		dout("%s TCP_CLOSE\n", __func__);
 	case TCP_CLOSE_WAIT:
 		dout("%s TCP_CLOSE_WAIT\n", __func__);
-		if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
+		if (test_and_set_bit(SOCK_CLOSED, &con->flags) == 0) {
 			if (test_bit(CONNECTING, &con->state))
 				con->error_msg = "connection failed";
 			else
@@ -395,9 +395,9 @@ void ceph_con_close(struct ceph_connection *con)
 	     ceph_pr_addr(&con->peer_addr.in_addr));
 	set_bit(CLOSED, &con->state);  /* in case there's queued work */
 	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
-	clear_bit(LOSSYTX, &con->state);  /* so we retry next connect */
-	clear_bit(KEEPALIVE_PENDING, &con->state);
-	clear_bit(WRITE_PENDING, &con->state);
+	clear_bit(LOSSYTX, &con->flags);  /* so we retry next connect */
+	clear_bit(KEEPALIVE_PENDING, &con->flags);
+	clear_bit(WRITE_PENDING, &con->flags);
 	mutex_lock(&con->mutex);
 	reset_connection(con);
 	con->peer_global_seq = 0;
@@ -614,7 +614,7 @@ static void prepare_write_message(struct ceph_connection *con)
 		prepare_write_message_footer(con);
 	}
 
-	set_bit(WRITE_PENDING, &con->state);
+	set_bit(WRITE_PENDING, &con->flags);
 }
 
 /*
@@ -635,7 +635,7 @@ static void prepare_write_ack(struct ceph_connection *con)
 				&con->out_temp_ack);
 
 	con->out_more = 1;  /* more will follow.. eventually.. */
-	set_bit(WRITE_PENDING, &con->state);
+	set_bit(WRITE_PENDING, &con->flags);
 }
 
 /*
@@ -646,7 +646,7 @@ static void prepare_write_keepalive(struct ceph_connection *con)
 	dout("prepare_write_keepalive %p\n", con);
 	con_out_kvec_reset(con);
 	con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
-	set_bit(WRITE_PENDING, &con->state);
+	set_bit(WRITE_PENDING, &con->flags);
 }
 
 /*
@@ -675,7 +675,7 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection
 
 	if (IS_ERR(auth))
 		return auth;
-	if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state))
+	if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->flags))
 		return ERR_PTR(-EAGAIN);
 
 	con->auth_reply_buf = auth->authorizer_reply_buf;
@@ -695,7 +695,7 @@ static void prepare_write_banner(struct ceph_connection *con)
 					&con->msgr->my_enc_addr);
 
 	con->out_more = 0;
-	set_bit(WRITE_PENDING, &con->state);
+	set_bit(WRITE_PENDING, &con->flags);
 }
 
 static int prepare_write_connect(struct ceph_connection *con)
@@ -745,7 +745,7 @@ static int prepare_write_connect(struct ceph_connection *con)
 					auth->authorizer_buf);
 
 	con->out_more = 0;
-	set_bit(WRITE_PENDING, &con->state);
+	set_bit(WRITE_PENDING, &con->flags);
 
 	return 0;
 }
@@ -1492,7 +1492,7 @@ static int process_connect(struct ceph_connection *con)
 			le32_to_cpu(con->in_reply.connect_seq));
 
 		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
-			set_bit(LOSSYTX, &con->state);
+			set_bit(LOSSYTX, &con->flags);
 
 		prepare_read_tag(con);
 		break;
@@ -1933,14 +1933,14 @@ do_next:
 			prepare_write_ack(con);
 			goto more;
 		}
-		if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
+		if (test_and_clear_bit(KEEPALIVE_PENDING, &con->flags)) {
 			prepare_write_keepalive(con);
 			goto more;
 		}
 	}
 
 	/* Nothing to do! */
-	clear_bit(WRITE_PENDING, &con->state);
+	clear_bit(WRITE_PENDING, &con->flags);
 	dout("try_write nothing else to write.\n");
 	ret = 0;
 out:
@@ -2106,7 +2106,7 @@ static void con_work(struct work_struct *work)
 
 	mutex_lock(&con->mutex);
 restart:
-	if (test_and_clear_bit(BACKOFF, &con->state)) {
+	if (test_and_clear_bit(BACKOFF, &con->flags)) {
 		dout("con_work %p backing off\n", con);
 		if (queue_delayed_work(ceph_msgr_wq, &con->work,
 				       round_jiffies_relative(con->delay))) {
@@ -2135,7 +2135,7 @@ restart:
 		con_close_socket(con);
 	}
 
-	if (test_and_clear_bit(SOCK_CLOSED, &con->state))
+	if (test_and_clear_bit(SOCK_CLOSED, &con->flags))
 		goto fault;
 
 	ret = try_read(con);
@@ -2174,7 +2174,7 @@ static void ceph_fault(struct ceph_connection *con)
 	dout("fault %p state %lu to peer %s\n",
 	     con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
 
-	if (test_bit(LOSSYTX, &con->state)) {
+	if (test_bit(LOSSYTX, &con->flags)) {
 		dout("fault on LOSSYTX channel\n");
 		goto out;
 	}
@@ -2196,9 +2196,9 @@ static void ceph_fault(struct ceph_connection *con)
 	/* If there are no messages queued or keepalive pending, place
 	 * the connection in a STANDBY state */
 	if (list_empty(&con->out_queue) &&
-	    !test_bit(KEEPALIVE_PENDING, &con->state)) {
+	    !test_bit(KEEPALIVE_PENDING, &con->flags)) {
 		dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
-		clear_bit(WRITE_PENDING, &con->state);
+		clear_bit(WRITE_PENDING, &con->flags);
 		set_bit(STANDBY, &con->state);
 	} else {
 		/* retry after a delay. */
@@ -2222,7 +2222,7 @@ static void ceph_fault(struct ceph_connection *con)
 			 * that when con_work restarts we schedule the
 			 * delay then.
 			 */
-			set_bit(BACKOFF, &con->state);
+			set_bit(BACKOFF, &con->flags);
 		}
 	}
 
@@ -2278,8 +2278,8 @@ static void clear_standby(struct ceph_connection *con)
 		mutex_lock(&con->mutex);
 		dout("clear_standby %p and ++connect_seq\n", con);
 		con->connect_seq++;
-		WARN_ON(test_bit(WRITE_PENDING, &con->state));
-		WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state));
+		WARN_ON(test_bit(WRITE_PENDING, &con->flags));
+		WARN_ON(test_bit(KEEPALIVE_PENDING, &con->flags));
 		mutex_unlock(&con->mutex);
 	}
 }
@@ -2317,7 +2317,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 	/* if there wasn't anything waiting to send before, queue
 	 * new work */
 	clear_standby(con);
-	if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+	if (test_and_set_bit(WRITE_PENDING, &con->flags) == 0)
 		queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_send);
@@ -2384,8 +2384,8 @@ void ceph_con_keepalive(struct ceph_connection *con)
 {
 	dout("con_keepalive %p\n", con);
 	clear_standby(con);
-	if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
-	    test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+	if (test_and_set_bit(KEEPALIVE_PENDING, &con->flags) == 0 &&
+	    test_and_set_bit(WRITE_PENDING, &con->flags) == 0)
 		queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_keepalive);

From ce2c8903e76e690846a00a0284e4bd9ee954d680 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 22 May 2012 22:15:49 -0500
Subject: [PATCH 008/132] libceph: start tracking connection socket state

Start explicitly keeping track of the state of a ceph connection's
socket, separate from the state of the connection itself.  Create
placeholder functions to encapsulate the state transitions.

    --------
    | NEW* |  transient initial state
    --------
        | con_sock_state_init()
        v
    ----------
    | CLOSED |  initialized, but no socket (and no
    ----------  TCP connection)
     ^      \
     |       \ con_sock_state_connecting()
     |        ----------------------
     |                              \
     + con_sock_state_closed()       \
     |\                               \
     | \                               \
     |  -----------                     \
     |  | CLOSING |  socket event;       \
     |  -----------  await close          \
     |       ^                            |
     |       |                            |
     |       + con_sock_state_closing()   |
     |      / \                           |
     |     /   ---------------            |
     |    /                   \           v
     |   /                    --------------
     |  /    -----------------| CONNECTING |  socket created, TCP
     |  |   /                 --------------  connect initiated
     |  |   | con_sock_state_connected()
     |  |   v
    -------------
    | CONNECTED |  TCP connection established
    -------------

Make the socket state an atomic variable, reinforcing that it's a
distinct transtion with no possible "intermediate/both" states.
This is almost certainly overkill at this point, though the
transitions into CONNECTED and CLOSING state do get called via
socket callback (the rest of the transitions occur with the
connection mutex held).  We can back out the atomicity later.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil<sage@inktank.com>
---
 include/linux/ceph/messenger.h |  8 +++--
 net/ceph/messenger.c           | 64 ++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 920235e114ad..5e852f444f68 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -137,14 +137,18 @@ struct ceph_connection {
 	const struct ceph_connection_operations *ops;
 
 	struct ceph_messenger *msgr;
+
+	atomic_t sock_state;
 	struct socket *sock;
+	struct ceph_entity_addr peer_addr; /* peer address */
+	struct ceph_entity_addr peer_addr_for_me;
+
 	unsigned long flags;
 	unsigned long state;
 	const char *error_msg;  /* error message, if any */
 
-	struct ceph_entity_addr peer_addr; /* peer address */
 	struct ceph_entity_name peer_name; /* peer name */
-	struct ceph_entity_addr peer_addr_for_me;
+
 	unsigned peer_features;
 	u32 connect_seq;      /* identify the most recent connection
 				 attempt for this connection, client */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index e84e4fd86bb7..a4ac3deec161 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -29,6 +29,14 @@
  * the sender.
  */
 
+/* State values for ceph_connection->sock_state; NEW is assumed to be 0 */
+
+#define CON_SOCK_STATE_NEW		0	/* -> CLOSED */
+#define CON_SOCK_STATE_CLOSED		1	/* -> CONNECTING */
+#define CON_SOCK_STATE_CONNECTING	2	/* -> CONNECTED or -> CLOSING */
+#define CON_SOCK_STATE_CONNECTED	3	/* -> CLOSING or -> CLOSED */
+#define CON_SOCK_STATE_CLOSING		4	/* -> CLOSED */
+
 /* static tag bytes (protocol control messages) */
 static char tag_msg = CEPH_MSGR_TAG_MSG;
 static char tag_ack = CEPH_MSGR_TAG_ACK;
@@ -147,6 +155,55 @@ void ceph_msgr_flush(void)
 }
 EXPORT_SYMBOL(ceph_msgr_flush);
 
+/* Connection socket state transition functions */
+
+static void con_sock_state_init(struct ceph_connection *con)
+{
+	int old_state;
+
+	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
+	if (WARN_ON(old_state != CON_SOCK_STATE_NEW))
+		printk("%s: unexpected old state %d\n", __func__, old_state);
+}
+
+static void con_sock_state_connecting(struct ceph_connection *con)
+{
+	int old_state;
+
+	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING);
+	if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED))
+		printk("%s: unexpected old state %d\n", __func__, old_state);
+}
+
+static void con_sock_state_connected(struct ceph_connection *con)
+{
+	int old_state;
+
+	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED);
+	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING))
+		printk("%s: unexpected old state %d\n", __func__, old_state);
+}
+
+static void con_sock_state_closing(struct ceph_connection *con)
+{
+	int old_state;
+
+	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING);
+	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING &&
+			old_state != CON_SOCK_STATE_CONNECTED &&
+			old_state != CON_SOCK_STATE_CLOSING))
+		printk("%s: unexpected old state %d\n", __func__, old_state);
+}
+
+static void con_sock_state_closed(struct ceph_connection *con)
+{
+	int old_state;
+
+	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
+	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED &&
+			old_state != CON_SOCK_STATE_CLOSING))
+		printk("%s: unexpected old state %d\n", __func__, old_state);
+}
 
 /*
  * socket callback functions
@@ -203,6 +260,7 @@ static void ceph_sock_state_change(struct sock *sk)
 		dout("%s TCP_CLOSE\n", __func__);
 	case TCP_CLOSE_WAIT:
 		dout("%s TCP_CLOSE_WAIT\n", __func__);
+		con_sock_state_closing(con);
 		if (test_and_set_bit(SOCK_CLOSED, &con->flags) == 0) {
 			if (test_bit(CONNECTING, &con->state))
 				con->error_msg = "connection failed";
@@ -213,6 +271,7 @@ static void ceph_sock_state_change(struct sock *sk)
 		break;
 	case TCP_ESTABLISHED:
 		dout("%s TCP_ESTABLISHED\n", __func__);
+		con_sock_state_connected(con);
 		queue_con(con);
 		break;
 	default:	/* Everything else is uninteresting */
@@ -277,6 +336,7 @@ static int ceph_tcp_connect(struct ceph_connection *con)
 		return ret;
 	}
 	con->sock = sock;
+	con_sock_state_connecting(con);
 
 	return 0;
 }
@@ -343,6 +403,7 @@ static int con_close_socket(struct ceph_connection *con)
 	sock_release(con->sock);
 	con->sock = NULL;
 	clear_bit(SOCK_CLOSED, &con->state);
+	con_sock_state_closed(con);
 	return rc;
 }
 
@@ -462,6 +523,9 @@ void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
 	memset(con, 0, sizeof(*con));
 	atomic_set(&con->nref, 1);
 	con->msgr = msgr;
+
+	con_sock_state_init(con);
+
 	mutex_init(&con->mutex);
 	INIT_LIST_HEAD(&con->out_queue);
 	INIT_LIST_HEAD(&con->out_sent);

From e10006f807ffc4d5b1d861305d18d9e8145891ca Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Sat, 26 May 2012 23:26:43 -0500
Subject: [PATCH 009/132] libceph: provide osd number when creating osd

Pass the osd number to the create_osd() routine, and move the
initialization of fields that depend on it therein.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index cca4c7f1c780..e30efbcc6388 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -624,7 +624,7 @@ static void osd_reset(struct ceph_connection *con)
 /*
  * Track open sessions with osds.
  */
-static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
+static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
 {
 	struct ceph_osd *osd;
 
@@ -634,6 +634,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
 
 	atomic_set(&osd->o_ref, 1);
 	osd->o_osdc = osdc;
+	osd->o_osd = onum;
 	INIT_LIST_HEAD(&osd->o_requests);
 	INIT_LIST_HEAD(&osd->o_linger_requests);
 	INIT_LIST_HEAD(&osd->o_osd_lru);
@@ -643,6 +644,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
 	osd->o_con.private = osd;
 	osd->o_con.ops = &osd_con_ops;
 	osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
+	osd->o_con.peer_name.num = cpu_to_le64(onum);
 
 	INIT_LIST_HEAD(&osd->o_keepalive_item);
 	return osd;
@@ -998,15 +1000,13 @@ static int __map_request(struct ceph_osd_client *osdc,
 	req->r_osd = __lookup_osd(osdc, o);
 	if (!req->r_osd && o >= 0) {
 		err = -ENOMEM;
-		req->r_osd = create_osd(osdc);
+		req->r_osd = create_osd(osdc, o);
 		if (!req->r_osd) {
 			list_move(&req->r_req_lru_item, &osdc->req_notarget);
 			goto out;
 		}
 
 		dout("map_request osd %p is osd%d\n", req->r_osd, o);
-		req->r_osd->o_osd = o;
-		req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
 		__insert_osd(osdc, req->r_osd);
 
 		ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);

From a5988c490ef66cb04ea2f610681949b25c773b3c Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 29 May 2012 11:04:58 -0500
Subject: [PATCH 010/132] libceph: set CLOSED state bit in con_init

Once a connection is fully initialized, it is really in a CLOSED
state, so make that explicit by setting the bit in its state field.

It is possible for a connection in NEGOTIATING state to get a
failure, leading to ceph_fault() and ultimately ceph_con_close().
Clear that bits if it is set in that case, to reflect that the
connection truly is closed and is no longer participating in a
connect sequence.

Issue a warning if ceph_con_open() is called on a connection that
is not in CLOSED state.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index a4ac3deec161..36b440a00cc2 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -454,11 +454,14 @@ void ceph_con_close(struct ceph_connection *con)
 {
 	dout("con_close %p peer %s\n", con,
 	     ceph_pr_addr(&con->peer_addr.in_addr));
-	set_bit(CLOSED, &con->state);  /* in case there's queued work */
+	clear_bit(NEGOTIATING, &con->state);
 	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
+	set_bit(CLOSED, &con->state);
+
 	clear_bit(LOSSYTX, &con->flags);  /* so we retry next connect */
 	clear_bit(KEEPALIVE_PENDING, &con->flags);
 	clear_bit(WRITE_PENDING, &con->flags);
+
 	mutex_lock(&con->mutex);
 	reset_connection(con);
 	con->peer_global_seq = 0;
@@ -475,7 +478,8 @@ void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
 {
 	dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
 	set_bit(OPENING, &con->state);
-	clear_bit(CLOSED, &con->state);
+	WARN_ON(!test_and_clear_bit(CLOSED, &con->state));
+
 	memcpy(&con->peer_addr, addr, sizeof(*addr));
 	con->delay = 0;      /* reset backoff memory */
 	queue_con(con);
@@ -530,6 +534,8 @@ void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
 	INIT_LIST_HEAD(&con->out_queue);
 	INIT_LIST_HEAD(&con->out_sent);
 	INIT_DELAYED_WORK(&con->work, con_work);
+
+	set_bit(CLOSED, &con->state);
 }
 EXPORT_SYMBOL(ceph_con_init);
 
@@ -1933,14 +1939,15 @@ more:
 
 	/* open the socket first? */
 	if (con->sock == NULL) {
+		clear_bit(NEGOTIATING, &con->state);
+		set_bit(CONNECTING, &con->state);
+
 		con_out_kvec_reset(con);
 		prepare_write_banner(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
 			goto out;
 		prepare_read_banner(con);
-		set_bit(CONNECTING, &con->state);
-		clear_bit(NEGOTIATING, &con->state);
 
 		BUG_ON(con->in_msg);
 		con->in_tag = CEPH_MSGR_TAG_READY;

From f9f9a1904467816452fc70740165030e84c2c659 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Wed, 6 Jun 2012 09:15:33 -0500
Subject: [PATCH 011/132] rbd: Fix ceph_snap_context size calculation

ceph_snap_context->snaps is an u64 array

Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 drivers/block/rbd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 65665c9c42c6..8b9c1734d807 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -499,7 +499,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 			 / sizeof (*ondisk))
 		return -EINVAL;
 	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
-				snap_count * sizeof (*ondisk),
+				snap_count * sizeof(u64),
 				gfp_flags);
 	if (!header->snapc)
 		return -ENOMEM;

From 895cfcc810e53d7d36639969c71efb9087221167 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 6 Jun 2012 09:15:33 -0500
Subject: [PATCH 012/132] rbd: endian bug in rbd_req_cb()

Sparse complains about this because:
drivers/block/rbd.c:996:20: warning: cast to restricted __le32
drivers/block/rbd.c:996:20: warning: cast from restricted __le16

These are set in osd_req_encode_op() and they are le16.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 drivers/block/rbd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8b9c1734d807..8f428a8ab003 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -977,7 +977,7 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
 	op = (void *)(replyhead + 1);
 	rc = le32_to_cpu(replyhead->result);
 	bytes = le64_to_cpu(op->extent.length);
-	read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
+	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
 
 	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
 

From ab8cb34a4b2f60281a4b18b1f1ad23bc2313d91b Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 4 Jun 2012 14:43:32 -0500
Subject: [PATCH 013/132] libceph: osd_client: don't drop reply reference too
 early

In ceph_osdc_release_request(), a reference to the r_reply message
is dropped.  But just after that, that same message is revoked if it
was in use to receive an incoming reply.  Reorder these so we are
sure we hold a reference until we're actually done with the message.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index e30efbcc6388..d8b6d319499a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -139,8 +139,6 @@ void ceph_osdc_release_request(struct kref *kref)
 
 	if (req->r_request)
 		ceph_msg_put(req->r_request);
-	if (req->r_reply)
-		ceph_msg_put(req->r_reply);
 	if (req->r_con_filling_msg) {
 		dout("release_request revoking pages %p from con %p\n",
 		     req->r_pages, req->r_con_filling_msg);
@@ -148,6 +146,8 @@ void ceph_osdc_release_request(struct kref *kref)
 				      req->r_reply);
 		ceph_con_put(req->r_con_filling_msg);
 	}
+	if (req->r_reply)
+		ceph_msg_put(req->r_reply);
 	if (req->r_own_pages)
 		ceph_release_page_vector(req->r_pages,
 					 req->r_num_pages);

From 0d47766f14211a73eaf54cab234db134ece79f49 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 31 May 2012 20:22:18 -0700
Subject: [PATCH 014/132] libceph: use con get/put ops from osd_client

There were a few direct calls to ceph_con_{get,put}() instead of the con
ops from osd_client.c.  This is a bug since those ops aren't defined to
be ceph_con_get/put.

This breaks refcounting on the ceph_osd structs that contain the
ceph_connections, and could lead to all manner of strangeness.

The purpose of the ->get and ->put methods in a ceph connection are
to allow the connection to indicate it has a reference to something
external to the messaging system, *not* to indicate something
external has a reference to the connection.

[elder@inktank.com: added that last sentence]

Signed-off-by: Sage Weil <sage@newdream.net>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/osd_client.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index d8b6d319499a..5b41a6929cd9 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -144,7 +144,7 @@ void ceph_osdc_release_request(struct kref *kref)
 		     req->r_pages, req->r_con_filling_msg);
 		ceph_con_revoke_message(req->r_con_filling_msg,
 				      req->r_reply);
-		ceph_con_put(req->r_con_filling_msg);
+		req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
 	}
 	if (req->r_reply)
 		ceph_msg_put(req->r_reply);
@@ -1216,7 +1216,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 	if (req->r_con_filling_msg == con && req->r_reply == msg) {
 		dout(" dropping con_filling_msg ref %p\n", con);
 		req->r_con_filling_msg = NULL;
-		ceph_con_put(con);
+		con->ops->put(con);
 	}
 
 	if (!req->r_got_reply) {
@@ -2028,7 +2028,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 		dout("get_reply revoking msg %p from old con %p\n",
 		     req->r_reply, req->r_con_filling_msg);
 		ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
-		ceph_con_put(req->r_con_filling_msg);
+		req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
 		req->r_con_filling_msg = NULL;
 	}
 
@@ -2063,7 +2063,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 #endif
 	}
 	*skip = 0;
-	req->r_con_filling_msg = ceph_con_get(con);
+	req->r_con_filling_msg = con->ops->get(con);
 	dout("get_reply tid %lld %p\n", tid, m);
 
 out:

From 67130934fb579fdf0f2f6d745960264378b57dc8 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Sat, 26 May 2012 23:26:43 -0500
Subject: [PATCH 015/132] libceph: embed ceph connection structure in
 mon_client

A monitor client has a pointer to a ceph connection structure in it.
This is the only one of the three ceph client types that do it this
way; the OSD and MDS clients embed the connection into their main
structures.  There is always exactly one ceph connection for a
monitor client, so there is no need to allocate it separate from the
monitor client structure.

So switch the ceph_mon_client structure to embed its
ceph_connection structure.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/mon_client.h |  2 +-
 net/ceph/mon_client.c           | 47 ++++++++++++++-------------------
 2 files changed, 21 insertions(+), 28 deletions(-)

diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index 545f85917780..2113e3850a4e 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -70,7 +70,7 @@ struct ceph_mon_client {
 	bool hunting;
 	int cur_mon;                       /* last monitor i contacted */
 	unsigned long sub_sent, sub_renew_after;
-	struct ceph_connection *con;
+	struct ceph_connection con;
 	bool have_fsid;
 
 	/* pending generic requests */
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 704dc95dc620..ac4d6b100730 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -106,9 +106,9 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
 	monc->pending_auth = 1;
 	monc->m_auth->front.iov_len = len;
 	monc->m_auth->hdr.front_len = cpu_to_le32(len);
-	ceph_con_revoke(monc->con, monc->m_auth);
+	ceph_con_revoke(&monc->con, monc->m_auth);
 	ceph_msg_get(monc->m_auth);  /* keep our ref */
-	ceph_con_send(monc->con, monc->m_auth);
+	ceph_con_send(&monc->con, monc->m_auth);
 }
 
 /*
@@ -117,8 +117,8 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
 static void __close_session(struct ceph_mon_client *monc)
 {
 	dout("__close_session closing mon%d\n", monc->cur_mon);
-	ceph_con_revoke(monc->con, monc->m_auth);
-	ceph_con_close(monc->con);
+	ceph_con_revoke(&monc->con, monc->m_auth);
+	ceph_con_close(&monc->con);
 	monc->cur_mon = -1;
 	monc->pending_auth = 0;
 	ceph_auth_reset(monc->auth);
@@ -142,9 +142,9 @@ static int __open_session(struct ceph_mon_client *monc)
 		monc->want_next_osdmap = !!monc->want_next_osdmap;
 
 		dout("open_session mon%d opening\n", monc->cur_mon);
-		monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
-		monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
-		ceph_con_open(monc->con,
+		monc->con.peer_name.type = CEPH_ENTITY_TYPE_MON;
+		monc->con.peer_name.num = cpu_to_le64(monc->cur_mon);
+		ceph_con_open(&monc->con,
 			      &monc->monmap->mon_inst[monc->cur_mon].addr);
 
 		/* initiatiate authentication handshake */
@@ -226,8 +226,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 
 		msg->front.iov_len = p - msg->front.iov_base;
 		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-		ceph_con_revoke(monc->con, msg);
-		ceph_con_send(monc->con, ceph_msg_get(msg));
+		ceph_con_revoke(&monc->con, msg);
+		ceph_con_send(&monc->con, ceph_msg_get(msg));
 
 		monc->sub_sent = jiffies | 1;  /* never 0 */
 	}
@@ -247,7 +247,7 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
 	if (monc->hunting) {
 		pr_info("mon%d %s session established\n",
 			monc->cur_mon,
-			ceph_pr_addr(&monc->con->peer_addr.in_addr));
+			ceph_pr_addr(&monc->con.peer_addr.in_addr));
 		monc->hunting = false;
 	}
 	dout("handle_subscribe_ack after %d seconds\n", seconds);
@@ -461,7 +461,7 @@ static int do_generic_request(struct ceph_mon_client *monc,
 	req->request->hdr.tid = cpu_to_le64(req->tid);
 	__insert_generic_request(monc, req);
 	monc->num_generic_requests++;
-	ceph_con_send(monc->con, ceph_msg_get(req->request));
+	ceph_con_send(&monc->con, ceph_msg_get(req->request));
 	mutex_unlock(&monc->mutex);
 
 	err = wait_for_completion_interruptible(&req->completion);
@@ -684,8 +684,8 @@ static void __resend_generic_request(struct ceph_mon_client *monc)
 
 	for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
 		req = rb_entry(p, struct ceph_mon_generic_request, node);
-		ceph_con_revoke(monc->con, req->request);
-		ceph_con_send(monc->con, ceph_msg_get(req->request));
+		ceph_con_revoke(&monc->con, req->request);
+		ceph_con_send(&monc->con, ceph_msg_get(req->request));
 	}
 }
 
@@ -705,7 +705,7 @@ static void delayed_work(struct work_struct *work)
 		__close_session(monc);
 		__open_session(monc);  /* continue hunting */
 	} else {
-		ceph_con_keepalive(monc->con);
+		ceph_con_keepalive(&monc->con);
 
 		__validate_auth(monc);
 
@@ -760,19 +760,16 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 		goto out;
 
 	/* connection */
-	monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
-	if (!monc->con)
-		goto out_monmap;
-	ceph_con_init(&monc->client->msgr, monc->con);
-	monc->con->private = monc;
-	monc->con->ops = &mon_con_ops;
+	ceph_con_init(&monc->client->msgr, &monc->con);
+	monc->con.private = monc;
+	monc->con.ops = &mon_con_ops;
 
 	/* authentication */
 	monc->auth = ceph_auth_init(cl->options->name,
 				    cl->options->key);
 	if (IS_ERR(monc->auth)) {
 		err = PTR_ERR(monc->auth);
-		goto out_con;
+		goto out_monmap;
 	}
 	monc->auth->want_keys =
 		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
@@ -824,8 +821,6 @@ out_subscribe_ack:
 	ceph_msg_put(monc->m_subscribe_ack);
 out_auth:
 	ceph_auth_destroy(monc->auth);
-out_con:
-	monc->con->ops->put(monc->con);
 out_monmap:
 	kfree(monc->monmap);
 out:
@@ -841,9 +836,7 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 	mutex_lock(&monc->mutex);
 	__close_session(monc);
 
-	monc->con->private = NULL;
-	monc->con->ops->put(monc->con);
-	monc->con = NULL;
+	monc->con.private = NULL;
 
 	mutex_unlock(&monc->mutex);
 
@@ -1021,7 +1014,7 @@ static void mon_fault(struct ceph_connection *con)
 	if (!monc->hunting)
 		pr_info("mon%d %s session lost, "
 			"hunting for new mon\n", monc->cur_mon,
-			ceph_pr_addr(&monc->con->peer_addr.in_addr));
+			ceph_pr_addr(&monc->con.peer_addr.in_addr));
 
 	__close_session(monc);
 	if (!monc->hunting) {

From ec87ef4309d33bd9c87a53bb5152a86ae7a65f25 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 31 May 2012 20:27:50 -0700
Subject: [PATCH 016/132] libceph: drop connection refcounting for mon_client

All references to the embedded ceph_connection come from the msgr
workqueue, which is drained prior to mon_client destruction.  That
means we can ignore con refcounting entirely.

Signed-off-by: Sage Weil <sage@newdream.net>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/mon_client.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index ac4d6b100730..062b72478077 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1029,9 +1029,23 @@ out:
 	mutex_unlock(&monc->mutex);
 }
 
+/*
+ * We can ignore refcounting on the connection struct, as all references
+ * will come from the messenger workqueue, which is drained prior to
+ * mon_client destruction.
+ */
+static struct ceph_connection *con_get(struct ceph_connection *con)
+{
+	return con;
+}
+
+static void con_put(struct ceph_connection *con)
+{
+}
+
 static const struct ceph_connection_operations mon_con_ops = {
-	.get = ceph_con_get,
-	.put = ceph_con_put,
+	.get = con_get,
+	.put = con_put,
 	.dispatch = dispatch,
 	.fault = mon_fault,
 	.alloc_msg = mon_alloc_msg,

From 20581c1faf7b15ae1f8b80c0ec757877b0b53151 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Sat, 26 May 2012 23:26:43 -0500
Subject: [PATCH 017/132] libceph: init monitor connection when opening

Hold off initializing a monitor client's connection until just
before it gets opened for use.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/mon_client.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 062b72478077..6adbea78b168 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -119,6 +119,7 @@ static void __close_session(struct ceph_mon_client *monc)
 	dout("__close_session closing mon%d\n", monc->cur_mon);
 	ceph_con_revoke(&monc->con, monc->m_auth);
 	ceph_con_close(&monc->con);
+	monc->con.private = NULL;
 	monc->cur_mon = -1;
 	monc->pending_auth = 0;
 	ceph_auth_reset(monc->auth);
@@ -141,9 +142,13 @@ static int __open_session(struct ceph_mon_client *monc)
 		monc->sub_renew_after = jiffies;  /* i.e., expired */
 		monc->want_next_osdmap = !!monc->want_next_osdmap;
 
-		dout("open_session mon%d opening\n", monc->cur_mon);
+		ceph_con_init(&monc->client->msgr, &monc->con);
+		monc->con.private = monc;
+		monc->con.ops = &mon_con_ops;
 		monc->con.peer_name.type = CEPH_ENTITY_TYPE_MON;
 		monc->con.peer_name.num = cpu_to_le64(monc->cur_mon);
+
+		dout("open_session mon%d opening\n", monc->cur_mon);
 		ceph_con_open(&monc->con,
 			      &monc->monmap->mon_inst[monc->cur_mon].addr);
 
@@ -760,10 +765,6 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 		goto out;
 
 	/* connection */
-	ceph_con_init(&monc->client->msgr, &monc->con);
-	monc->con.private = monc;
-	monc->con.ops = &mon_con_ops;
-
 	/* authentication */
 	monc->auth = ceph_auth_init(cl->options->name,
 				    cl->options->key);
@@ -836,8 +837,6 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 	mutex_lock(&monc->mutex);
 	__close_session(monc);
 
-	monc->con.private = NULL;
-
 	mutex_unlock(&monc->mutex);
 
 	ceph_auth_destroy(monc->auth);

From 1bfd89f4e6e1adc6a782d94aa5d4c53be1e404d7 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Sat, 26 May 2012 23:26:43 -0500
Subject: [PATCH 018/132] libceph: fully initialize connection in con_init()

Move the initialization of a ceph connection's private pointer,
operations vector pointer, and peer name information into
ceph_con_init().  Rearrange the arguments so the connection pointer
is first.  Hide the byte-swapping of the peer entity number inside
ceph_con_init()

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/mds_client.c           | 7 ++-----
 include/linux/ceph/messenger.h | 6 ++++--
 net/ceph/messenger.c           | 9 ++++++++-
 net/ceph/mon_client.c          | 8 +++-----
 net/ceph/osd_client.c          | 7 ++-----
 5 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index ad30261cd4c0..ecd7f15741c1 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -394,11 +394,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	s->s_seq = 0;
 	mutex_init(&s->s_mutex);
 
-	ceph_con_init(&mdsc->fsc->client->msgr, &s->s_con);
-	s->s_con.private = s;
-	s->s_con.ops = &mds_con_ops;
-	s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
-	s->s_con.peer_name.num = cpu_to_le64(mds);
+	ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr,
+		CEPH_ENTITY_TYPE_MDS, mds);
 
 	spin_lock_init(&s->s_gen_ttl_lock);
 	s->s_cap_gen = 0;
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 5e852f444f68..dd27837f79ac 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -227,8 +227,10 @@ extern void ceph_messenger_init(struct ceph_messenger *msgr,
 			u32 required_features,
 			bool nocrc);
 
-extern void ceph_con_init(struct ceph_messenger *msgr,
-			  struct ceph_connection *con);
+extern void ceph_con_init(struct ceph_connection *con, void *private,
+			const struct ceph_connection_operations *ops,
+			struct ceph_messenger *msgr, __u8 entity_type,
+			__u64 entity_num);
 extern void ceph_con_open(struct ceph_connection *con,
 			  struct ceph_entity_addr *addr);
 extern bool ceph_con_opened(struct ceph_connection *con);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 36b440a00cc2..3b65f6e6911b 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -521,15 +521,22 @@ void ceph_con_put(struct ceph_connection *con)
 /*
  * initialize a new connection.
  */
-void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
+void ceph_con_init(struct ceph_connection *con, void *private,
+	const struct ceph_connection_operations *ops,
+	struct ceph_messenger *msgr, __u8 entity_type, __u64 entity_num)
 {
 	dout("con_init %p\n", con);
 	memset(con, 0, sizeof(*con));
+	con->private = private;
+	con->ops = ops;
 	atomic_set(&con->nref, 1);
 	con->msgr = msgr;
 
 	con_sock_state_init(con);
 
+	con->peer_name.type = (__u8) entity_type;
+	con->peer_name.num = cpu_to_le64(entity_num);
+
 	mutex_init(&con->mutex);
 	INIT_LIST_HEAD(&con->out_queue);
 	INIT_LIST_HEAD(&con->out_sent);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 6adbea78b168..ab6b24a5169e 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -142,11 +142,9 @@ static int __open_session(struct ceph_mon_client *monc)
 		monc->sub_renew_after = jiffies;  /* i.e., expired */
 		monc->want_next_osdmap = !!monc->want_next_osdmap;
 
-		ceph_con_init(&monc->client->msgr, &monc->con);
-		monc->con.private = monc;
-		monc->con.ops = &mon_con_ops;
-		monc->con.peer_name.type = CEPH_ENTITY_TYPE_MON;
-		monc->con.peer_name.num = cpu_to_le64(monc->cur_mon);
+		ceph_con_init(&monc->con, monc, &mon_con_ops,
+			&monc->client->msgr,
+			CEPH_ENTITY_TYPE_MON, monc->cur_mon);
 
 		dout("open_session mon%d opening\n", monc->cur_mon);
 		ceph_con_open(&monc->con,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 5b41a6929cd9..448c9da8beff 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -640,11 +640,8 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
 	INIT_LIST_HEAD(&osd->o_osd_lru);
 	osd->o_incarnation = 1;
 
-	ceph_con_init(&osdc->client->msgr, &osd->o_con);
-	osd->o_con.private = osd;
-	osd->o_con.ops = &osd_con_ops;
-	osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
-	osd->o_con.peer_name.num = cpu_to_le64(onum);
+	ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr,
+		CEPH_ENTITY_TYPE_OSD, onum);
 
 	INIT_LIST_HEAD(&osd->o_keepalive_item);
 	return osd;

From 1c20f2d26795803fc4f5155fe4fca5717a5944b6 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 4 Jun 2012 14:43:32 -0500
Subject: [PATCH 019/132] libceph: tweak ceph_alloc_msg()

The function ceph_alloc_msg() is only used to allocate a message
that will be assigned to a connection's in_msg pointer.  Rename the
function so this implied usage is more clear.

In addition, make that assignment inside the function (again, since
that's precisely what it's intended to be used for).  This allows us
to return what is now provided via the passed-in address of a "skip"
variable.  The return type is now Boolean to be explicit that there
are only two possible outcomes.

Make sure the result of an ->alloc_msg method call always sets the
value of *skip properly.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c  | 61 ++++++++++++++++++++++++-------------------
 net/ceph/mon_client.c |  3 +++
 net/ceph/osd_client.c |  1 +
 3 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 3b65f6e6911b..98ca23726ea6 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1655,9 +1655,8 @@ static int read_partial_message_section(struct ceph_connection *con,
 	return 1;
 }
 
-static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
-				struct ceph_msg_header *hdr,
-				int *skip);
+static bool ceph_con_in_msg_alloc(struct ceph_connection *con,
+				struct ceph_msg_header *hdr);
 
 
 static int read_partial_message_pages(struct ceph_connection *con,
@@ -1740,7 +1739,6 @@ static int read_partial_message(struct ceph_connection *con)
 	int ret;
 	unsigned front_len, middle_len, data_len;
 	bool do_datacrc = !con->msgr->nocrc;
-	int skip;
 	u64 seq;
 	u32 crc;
 
@@ -1793,9 +1791,7 @@ static int read_partial_message(struct ceph_connection *con)
 	if (!con->in_msg) {
 		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
 		     con->in_hdr.front_len, con->in_hdr.data_len);
-		skip = 0;
-		con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
-		if (skip) {
+		if (ceph_con_in_msg_alloc(con, &con->in_hdr)) {
 			/* skip this message */
 			dout("alloc_msg said skip message\n");
 			BUG_ON(con->in_msg);
@@ -2577,46 +2573,57 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
 }
 
 /*
- * Generic message allocator, for incoming messages.
+ * Allocate a message for receiving an incoming message on a
+ * connection, and save the result in con->in_msg.  Uses the
+ * connection's private alloc_msg op if available.
+ *
+ * Returns true if the message should be skipped, false otherwise.
+ * If true is returned (skip message), con->in_msg will be NULL.
+ * If false is returned, con->in_msg will contain a pointer to the
+ * newly-allocated message, or NULL in case of memory exhaustion.
  */
-static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
-				struct ceph_msg_header *hdr,
-				int *skip)
+static bool ceph_con_in_msg_alloc(struct ceph_connection *con,
+				struct ceph_msg_header *hdr)
 {
 	int type = le16_to_cpu(hdr->type);
 	int front_len = le32_to_cpu(hdr->front_len);
 	int middle_len = le32_to_cpu(hdr->middle_len);
-	struct ceph_msg *msg = NULL;
 	int ret;
 
+	BUG_ON(con->in_msg != NULL);
+
 	if (con->ops->alloc_msg) {
+		int skip = 0;
+
 		mutex_unlock(&con->mutex);
-		msg = con->ops->alloc_msg(con, hdr, skip);
+		con->in_msg = con->ops->alloc_msg(con, hdr, &skip);
 		mutex_lock(&con->mutex);
-		if (!msg || *skip)
-			return NULL;
+		if (skip)
+			con->in_msg = NULL;
+
+		if (!con->in_msg)
+			return skip != 0;
 	}
-	if (!msg) {
-		*skip = 0;
-		msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
-		if (!msg) {
+	if (!con->in_msg) {
+		con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
+		if (!con->in_msg) {
 			pr_err("unable to allocate msg type %d len %d\n",
 			       type, front_len);
-			return NULL;
+			return false;
 		}
-		msg->page_alignment = le16_to_cpu(hdr->data_off);
+		con->in_msg->page_alignment = le16_to_cpu(hdr->data_off);
 	}
-	memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
+	memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
 
-	if (middle_len && !msg->middle) {
-		ret = ceph_alloc_middle(con, msg);
+	if (middle_len && !con->in_msg->middle) {
+		ret = ceph_alloc_middle(con, con->in_msg);
 		if (ret < 0) {
-			ceph_msg_put(msg);
-			return NULL;
+			ceph_msg_put(con->in_msg);
+			con->in_msg = NULL;
 		}
 	}
 
-	return msg;
+	return false;
 }
 
 
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index ab6b24a5169e..8462ccec6333 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -442,6 +442,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
 		m = NULL;
 	} else {
 		dout("get_generic_reply %lld got %p\n", tid, req->reply);
+		*skip = 0;
 		m = ceph_msg_get(req->reply);
 		/*
 		 * we don't need to track the connection reading into
@@ -982,6 +983,8 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 	case CEPH_MSG_MDS_MAP:
 	case CEPH_MSG_OSD_MAP:
 		m = ceph_msg_new(type, front_len, GFP_NOFS, false);
+		if (!m)
+			return NULL;	/* ENOMEM--return skip == 0 */
 		break;
 	}
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 448c9da8beff..24b427b1eca4 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2077,6 +2077,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 	int type = le16_to_cpu(hdr->type);
 	int front = le32_to_cpu(hdr->front_len);
 
+	*skip = 0;
 	switch (type) {
 	case CEPH_MSG_OSD_MAP:
 	case CEPH_MSG_WATCH_NOTIFY:

From 38941f8031bf042dba3ced6394ba3a3b16c244ea Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 1 Jun 2012 14:56:43 -0500
Subject: [PATCH 020/132] libceph: have messages point to their connection

When a ceph message is queued for sending it is placed on a list of
pending messages (ceph_connection->out_queue).  When they are
actually sent over the wire, they are moved from that list to
another (ceph_connection->out_sent).  When acknowledgement for the
message is received, it is removed from the sent messages list.

During that entire time the message is "in the possession" of a
single ceph connection.  Keep track of that connection in the
message.  This will be used in the next patch (and is a helpful
bit of information for debugging anyway).

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/messenger.h |  3 +++
 net/ceph/messenger.c           | 27 +++++++++++++++++++++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index dd27837f79ac..6df837f72761 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -77,7 +77,10 @@ struct ceph_msg {
 	unsigned nr_pages;              /* size of page array */
 	unsigned page_alignment;        /* io offset in first page */
 	struct ceph_pagelist *pagelist; /* instead of pages */
+
+	struct ceph_connection *con;
 	struct list_head list_head;
+
 	struct kref kref;
 	struct bio  *bio;		/* instead of pages/pagelist */
 	struct bio  *bio_iter;		/* bio iterator */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 98ca23726ea6..68b49b5b8e86 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -414,6 +414,9 @@ static int con_close_socket(struct ceph_connection *con)
 static void ceph_msg_remove(struct ceph_msg *msg)
 {
 	list_del_init(&msg->list_head);
+	BUG_ON(msg->con == NULL);
+	msg->con = NULL;
+
 	ceph_msg_put(msg);
 }
 static void ceph_msg_remove_list(struct list_head *head)
@@ -433,6 +436,8 @@ static void reset_connection(struct ceph_connection *con)
 	ceph_msg_remove_list(&con->out_sent);
 
 	if (con->in_msg) {
+		BUG_ON(con->in_msg->con != con);
+		con->in_msg->con = NULL;
 		ceph_msg_put(con->in_msg);
 		con->in_msg = NULL;
 	}
@@ -625,8 +630,10 @@ static void prepare_write_message(struct ceph_connection *con)
 			&con->out_temp_ack);
 	}
 
+	BUG_ON(list_empty(&con->out_queue));
 	m = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
 	con->out_msg = m;
+	BUG_ON(m->con != con);
 
 	/* put message on sent list */
 	ceph_msg_get(m);
@@ -1806,6 +1813,8 @@ static int read_partial_message(struct ceph_connection *con)
 				"error allocating memory for incoming message";
 			return -ENOMEM;
 		}
+
+		BUG_ON(con->in_msg->con != con);
 		m = con->in_msg;
 		m->front.iov_len = 0;    /* haven't read it yet */
 		if (m->middle)
@@ -1901,6 +1910,8 @@ static void process_message(struct ceph_connection *con)
 {
 	struct ceph_msg *msg;
 
+	BUG_ON(con->in_msg->con != con);
+	con->in_msg->con = NULL;
 	msg = con->in_msg;
 	con->in_msg = NULL;
 
@@ -2260,6 +2271,8 @@ static void ceph_fault(struct ceph_connection *con)
 	con_close_socket(con);
 
 	if (con->in_msg) {
+		BUG_ON(con->in_msg->con != con);
+		con->in_msg->con = NULL;
 		ceph_msg_put(con->in_msg);
 		con->in_msg = NULL;
 	}
@@ -2378,6 +2391,8 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 
 	/* queue */
 	mutex_lock(&con->mutex);
+	BUG_ON(msg->con != NULL);
+	msg->con = con;
 	BUG_ON(!list_empty(&msg->list_head));
 	list_add_tail(&msg->list_head, &con->out_queue);
 	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
@@ -2403,13 +2418,16 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
 {
 	mutex_lock(&con->mutex);
 	if (!list_empty(&msg->list_head)) {
-		dout("con_revoke %p msg %p - was on queue\n", con, msg);
+		dout("%s %p msg %p - was on queue\n", __func__, con, msg);
 		list_del_init(&msg->list_head);
+		BUG_ON(msg->con == NULL);
+		msg->con = NULL;
+
 		ceph_msg_put(msg);
 		msg->hdr.seq = 0;
 	}
 	if (con->out_msg == msg) {
-		dout("con_revoke %p msg %p - was sending\n", con, msg);
+		dout("%s %p msg %p - was sending\n", __func__, con, msg);
 		con->out_msg = NULL;
 		if (con->out_kvec_is_msg) {
 			con->out_skip = con->out_kvec_bytes;
@@ -2478,6 +2496,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
 	if (m == NULL)
 		goto out;
 	kref_init(&m->kref);
+
+	m->con = NULL;
 	INIT_LIST_HEAD(&m->list_head);
 
 	m->hdr.tid = 0;
@@ -2598,6 +2618,8 @@ static bool ceph_con_in_msg_alloc(struct ceph_connection *con,
 		mutex_unlock(&con->mutex);
 		con->in_msg = con->ops->alloc_msg(con, hdr, &skip);
 		mutex_lock(&con->mutex);
+		if (con->in_msg)
+			con->in_msg->con = con;
 		if (skip)
 			con->in_msg = NULL;
 
@@ -2611,6 +2633,7 @@ static bool ceph_con_in_msg_alloc(struct ceph_connection *con,
 			       type, front_len);
 			return false;
 		}
+		con->in_msg->con = con;
 		con->in_msg->page_alignment = le16_to_cpu(hdr->data_off);
 	}
 	memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));

From 92ce034b5a740046cc643a21ea21eaad589e0043 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 4 Jun 2012 14:43:33 -0500
Subject: [PATCH 021/132] libceph: have messages take a connection reference

There are essentially two types of ceph messages: incoming and
outgoing.  Outgoing messages are always allocated via ceph_msg_new(),
and at the time of their allocation they are not associated with any
particular connection.  Incoming messages are always allocated via
ceph_con_in_msg_alloc(), and they are initially associated with the
connection from which incoming data will be placed into the message.

When an outgoing message gets sent, it becomes associated with a
connection and remains that way until the message is successfully
sent.  The association of an incoming message goes away at the point
it is sent to an upper layer via a con->ops->dispatch method.

This patch implements reference counting for all ceph messages, such
that every message holds a reference (and a pointer) to a connection
if and only if it is associated with that connection (as described
above).


For background, here is an explanation of the ceph message
lifecycle, emphasizing when an association exists between a message
and a connection.

Outgoing Messages
An outgoing message is "owned" by its allocator, from the time it is
allocated in ceph_msg_new() up to the point it gets queued for
sending in ceph_con_send().  Prior to that point the message's
msg->con pointer is null; at the point it is queued for sending its
message pointer is assigned to refer to the connection.  At that
time the message is inserted into a connection's out_queue list.

When a message on the out_queue list has been sent to the socket
layer to be put on the wire, it is transferred out of that list and
into the connection's out_sent list.  At that point it is still owned
by the connection, and will remain so until an acknowledgement is
received from the recipient that indicates the message was
successfully transferred.  When such an acknowledgement is received
(in process_ack()), the message is removed from its list (in
ceph_msg_remove()), at which point it is no longer associated with
the connection.

So basically, any time a message is on one of a connection's lists,
it is associated with that connection.  Reference counting outgoing
messages can thus be done at the points a message is added to the
out_queue (in ceph_con_send()) and the point it is removed from
either its two lists (in ceph_msg_remove())--at which point its
connection pointer becomes null.

Incoming Messages
When an incoming message on a connection is getting read (in
read_partial_message()) and there is no message in con->in_msg,
a new one is allocated using ceph_con_in_msg_alloc().  At that
point the message is associated with the connection.  Once that
message has been completely and successfully read, it is passed to
upper layer code using the connection's con->ops->dispatch method.
At that point the association between the message and the connection
no longer exists.

Reference counting of connections for incoming messages can be done
by taking a reference to the connection when the message gets
allocated, and releasing that reference when it gets handed off
using the dispatch method.

We should never fail to get a connection reference for a
message--the since the caller should already hold one.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 68b49b5b8e86..88ac083bb995 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -415,6 +415,7 @@ static void ceph_msg_remove(struct ceph_msg *msg)
 {
 	list_del_init(&msg->list_head);
 	BUG_ON(msg->con == NULL);
+	ceph_con_put(msg->con);
 	msg->con = NULL;
 
 	ceph_msg_put(msg);
@@ -440,6 +441,7 @@ static void reset_connection(struct ceph_connection *con)
 		con->in_msg->con = NULL;
 		ceph_msg_put(con->in_msg);
 		con->in_msg = NULL;
+		ceph_con_put(con->in_msg->con);
 	}
 
 	con->connect_seq = 0;
@@ -1914,6 +1916,7 @@ static void process_message(struct ceph_connection *con)
 	con->in_msg->con = NULL;
 	msg = con->in_msg;
 	con->in_msg = NULL;
+	ceph_con_put(con);
 
 	/* if first message, set peer_name */
 	if (con->peer_name.type == 0)
@@ -2275,6 +2278,7 @@ static void ceph_fault(struct ceph_connection *con)
 		con->in_msg->con = NULL;
 		ceph_msg_put(con->in_msg);
 		con->in_msg = NULL;
+		ceph_con_put(con);
 	}
 
 	/* Requeue anything that hasn't been acked */
@@ -2391,8 +2395,11 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 
 	/* queue */
 	mutex_lock(&con->mutex);
+
 	BUG_ON(msg->con != NULL);
-	msg->con = con;
+	msg->con = ceph_con_get(con);
+	BUG_ON(msg->con == NULL);
+
 	BUG_ON(!list_empty(&msg->list_head));
 	list_add_tail(&msg->list_head, &con->out_queue);
 	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
@@ -2421,10 +2428,11 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
 		dout("%s %p msg %p - was on queue\n", __func__, con, msg);
 		list_del_init(&msg->list_head);
 		BUG_ON(msg->con == NULL);
+		ceph_con_put(msg->con);
 		msg->con = NULL;
+		msg->hdr.seq = 0;
 
 		ceph_msg_put(msg);
-		msg->hdr.seq = 0;
 	}
 	if (con->out_msg == msg) {
 		dout("%s %p msg %p - was sending\n", __func__, con, msg);
@@ -2433,8 +2441,9 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
 			con->out_skip = con->out_kvec_bytes;
 			con->out_kvec_is_msg = false;
 		}
-		ceph_msg_put(msg);
 		msg->hdr.seq = 0;
+
+		ceph_msg_put(msg);
 	}
 	mutex_unlock(&con->mutex);
 }
@@ -2618,8 +2627,10 @@ static bool ceph_con_in_msg_alloc(struct ceph_connection *con,
 		mutex_unlock(&con->mutex);
 		con->in_msg = con->ops->alloc_msg(con, hdr, &skip);
 		mutex_lock(&con->mutex);
-		if (con->in_msg)
-			con->in_msg->con = con;
+		if (con->in_msg) {
+			con->in_msg->con = ceph_con_get(con);
+			BUG_ON(con->in_msg->con == NULL);
+		}
 		if (skip)
 			con->in_msg = NULL;
 
@@ -2633,7 +2644,8 @@ static bool ceph_con_in_msg_alloc(struct ceph_connection *con,
 			       type, front_len);
 			return false;
 		}
-		con->in_msg->con = con;
+		con->in_msg->con = ceph_con_get(con);
+		BUG_ON(con->in_msg->con == NULL);
 		con->in_msg->page_alignment = le16_to_cpu(hdr->data_off);
 	}
 	memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));

From 6740a845b2543cc46e1902ba21bac743fbadd0dc Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 1 Jun 2012 14:56:43 -0500
Subject: [PATCH 022/132] libceph: make ceph_con_revoke() a msg operation

ceph_con_revoke() is passed both a message and a ceph connection.
Now that any message associated with a connection holds a pointer
to that connection, there's no need to provide the connection when
revoking a message.

This has the added benefit of precluding the possibility of the
providing the wrong connection pointer.  If the message's connection
pointer is null, it is not being tracked by any connection, so
revoking it is a no-op.  This is supported as a convenience for
upper layers, so they can revoke a message that is not actually
"in flight."

Rename the function ceph_msg_revoke() to reflect that it is really
an operation on a message, not a connection.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/messenger.h | 3 ++-
 net/ceph/messenger.c           | 7 ++++++-
 net/ceph/mon_client.c          | 8 ++++----
 net/ceph/osd_client.c          | 4 ++--
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 6df837f72761..9008f81c20cd 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -239,7 +239,8 @@ extern void ceph_con_open(struct ceph_connection *con,
 extern bool ceph_con_opened(struct ceph_connection *con);
 extern void ceph_con_close(struct ceph_connection *con);
 extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
-extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
+
+extern void ceph_msg_revoke(struct ceph_msg *msg);
 extern void ceph_con_revoke_message(struct ceph_connection *con,
 				  struct ceph_msg *msg);
 extern void ceph_con_keepalive(struct ceph_connection *con);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 88ac083bb995..d636903ad4b2 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2421,8 +2421,13 @@ EXPORT_SYMBOL(ceph_con_send);
 /*
  * Revoke a message that was previously queued for send
  */
-void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
+void ceph_msg_revoke(struct ceph_msg *msg)
 {
+	struct ceph_connection *con = msg->con;
+
+	if (!con)
+		return;		/* Message not in our possession */
+
 	mutex_lock(&con->mutex);
 	if (!list_empty(&msg->list_head)) {
 		dout("%s %p msg %p - was on queue\n", __func__, con, msg);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 8462ccec6333..7a16750d62a6 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -106,7 +106,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
 	monc->pending_auth = 1;
 	monc->m_auth->front.iov_len = len;
 	monc->m_auth->hdr.front_len = cpu_to_le32(len);
-	ceph_con_revoke(&monc->con, monc->m_auth);
+	ceph_msg_revoke(monc->m_auth);
 	ceph_msg_get(monc->m_auth);  /* keep our ref */
 	ceph_con_send(&monc->con, monc->m_auth);
 }
@@ -117,7 +117,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
 static void __close_session(struct ceph_mon_client *monc)
 {
 	dout("__close_session closing mon%d\n", monc->cur_mon);
-	ceph_con_revoke(&monc->con, monc->m_auth);
+	ceph_msg_revoke(monc->m_auth);
 	ceph_con_close(&monc->con);
 	monc->con.private = NULL;
 	monc->cur_mon = -1;
@@ -229,7 +229,7 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 
 		msg->front.iov_len = p - msg->front.iov_base;
 		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-		ceph_con_revoke(&monc->con, msg);
+		ceph_msg_revoke(msg);
 		ceph_con_send(&monc->con, ceph_msg_get(msg));
 
 		monc->sub_sent = jiffies | 1;  /* never 0 */
@@ -688,7 +688,7 @@ static void __resend_generic_request(struct ceph_mon_client *monc)
 
 	for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
 		req = rb_entry(p, struct ceph_mon_generic_request, node);
-		ceph_con_revoke(&monc->con, req->request);
+		ceph_msg_revoke(req->request);
 		ceph_con_send(&monc->con, ceph_msg_get(req->request));
 	}
 }
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 24b427b1eca4..ad78705a4aff 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -852,7 +852,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 
 	if (req->r_osd) {
 		/* make sure the original request isn't in flight. */
-		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+		ceph_msg_revoke(req->r_request);
 
 		list_del_init(&req->r_osd_item);
 		if (list_empty(&req->r_osd->o_requests) &&
@@ -879,7 +879,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 static void __cancel_request(struct ceph_osd_request *req)
 {
 	if (req->r_sent && req->r_osd) {
-		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+		ceph_msg_revoke(req->r_request);
 		req->r_sent = 0;
 	}
 }

From 8921d114f5574c6da2cdd00749d185633ecf88f3 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 1 Jun 2012 14:56:43 -0500
Subject: [PATCH 023/132] libceph: make ceph_con_revoke_message() a msg op

ceph_con_revoke_message() is passed both a message and a ceph
connection.  A ceph_msg allocated for incoming messages on a
connection always has a pointer to that connection, so there's no
need to provide the connection when revoking such a message.

Note that the existing logic does not preclude the message supplied
being a null/bogus message pointer.  The only user of this interface
is the OSD client, and the only value an osd client passes is a
request's r_reply field.  That is always non-null (except briefly in
an error path in ceph_osdc_alloc_request(), and that drops the
only reference so the request won't ever have a reply to revoke).
So we can safely assume the passed-in message is non-null, but add a
BUG_ON() to make it very obvious we are imposing this restriction.

Rename the function ceph_msg_revoke_incoming() to reflect that it is
really an operation on an incoming message.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/messenger.h |  4 ++--
 net/ceph/messenger.c           | 22 ++++++++++++++++------
 net/ceph/osd_client.c          |  9 ++++-----
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 9008f81c20cd..a334dbd1b324 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -241,8 +241,8 @@ extern void ceph_con_close(struct ceph_connection *con);
 extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
 
 extern void ceph_msg_revoke(struct ceph_msg *msg);
-extern void ceph_con_revoke_message(struct ceph_connection *con,
-				  struct ceph_msg *msg);
+extern void ceph_msg_revoke_incoming(struct ceph_msg *msg);
+
 extern void ceph_con_keepalive(struct ceph_connection *con);
 extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
 extern void ceph_con_put(struct ceph_connection *con);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index d636903ad4b2..3857f815c035 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2456,17 +2456,27 @@ void ceph_msg_revoke(struct ceph_msg *msg)
 /*
  * Revoke a message that we may be reading data into
  */
-void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
+void ceph_msg_revoke_incoming(struct ceph_msg *msg)
 {
+	struct ceph_connection *con;
+
+	BUG_ON(msg == NULL);
+	if (!msg->con) {
+		dout("%s msg %p null con\n", __func__, msg);
+
+		return;		/* Message not in our possession */
+	}
+
+	con = msg->con;
 	mutex_lock(&con->mutex);
-	if (con->in_msg && con->in_msg == msg) {
+	if (con->in_msg == msg) {
 		unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
 		unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
 		unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
 
 		/* skip rest of message */
-		dout("con_revoke_pages %p msg %p revoked\n", con, msg);
-			con->in_base_pos = con->in_base_pos -
+		dout("%s %p msg %p revoked\n", __func__, con, msg);
+		con->in_base_pos = con->in_base_pos -
 				sizeof(struct ceph_msg_header) -
 				front_len -
 				middle_len -
@@ -2477,8 +2487,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
 		con->in_tag = CEPH_MSGR_TAG_READY;
 		con->in_seq++;
 	} else {
-		dout("con_revoke_pages %p msg %p pages %p no-op\n",
-		     con, con->in_msg, msg);
+		dout("%s %p in_msg %p msg %p no-op\n",
+		     __func__, con, con->in_msg, msg);
 	}
 	mutex_unlock(&con->mutex);
 }
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ad78705a4aff..c178c770acb4 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -140,10 +140,9 @@ void ceph_osdc_release_request(struct kref *kref)
 	if (req->r_request)
 		ceph_msg_put(req->r_request);
 	if (req->r_con_filling_msg) {
-		dout("release_request revoking pages %p from con %p\n",
+		dout("%s revoking pages %p from con %p\n", __func__,
 		     req->r_pages, req->r_con_filling_msg);
-		ceph_con_revoke_message(req->r_con_filling_msg,
-				      req->r_reply);
+		ceph_msg_revoke_incoming(req->r_reply);
 		req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
 	}
 	if (req->r_reply)
@@ -2022,9 +2021,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	}
 
 	if (req->r_con_filling_msg) {
-		dout("get_reply revoking msg %p from old con %p\n",
+		dout("%s revoking msg %p from old con %p\n", __func__,
 		     req->r_reply, req->r_con_filling_msg);
-		ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
+		ceph_msg_revoke_incoming(req->r_reply);
 		req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
 		req->r_con_filling_msg = NULL;
 	}

From 43643528cce60ca184fe8197efa8e8da7c89a037 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Wed, 6 Jun 2012 19:35:55 -0500
Subject: [PATCH 024/132] rbd: Clear ceph_msg->bio_iter for retransmitted
 message

The bug can cause NULL pointer dereference in write_partial_msg_pages

Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/messenger.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 3857f815c035..769a2c9fe1af 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -649,6 +649,10 @@ static void prepare_write_message(struct ceph_connection *con)
 		m->hdr.seq = cpu_to_le64(++con->out_seq);
 		m->needs_out_seq = false;
 	}
+#ifdef CONFIG_BLOCK
+	else
+		m->bio_iter = NULL;
+#endif
 
 	dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
 	     m, con->out_seq, le16_to_cpu(m->hdr.type),

From ad3b904c07dfa88603689bf9a67bffbb9b99beb5 Mon Sep 17 00:00:00 2001
From: Xi Wang <xi.wang@gmail.com>
Date: Wed, 6 Jun 2012 19:35:55 -0500
Subject: [PATCH 025/132] libceph: fix overflow in __decode_pool_names()

`len' is read from network and thus needs validation.  Otherwise a
large `len' would cause out-of-bounds access via the memcpy() call.
In addition, len = 0xffffffff would overflow the kmalloc() size,
leading to out-of-bounds write.

This patch adds a check of `len' via ceph_decode_need().  Also use
kstrndup rather than kmalloc/memcpy.

[elder@inktank.com: added -ENOMEM return for null kstrndup() result]

Signed-off-by: Xi Wang <xi.wang@gmail.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/osdmap.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 1892c523c43c..df47871b8389 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -488,15 +488,16 @@ static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
 		ceph_decode_32_safe(p, end, pool, bad);
 		ceph_decode_32_safe(p, end, len, bad);
 		dout("  pool %d len %d\n", pool, len);
+		ceph_decode_need(p, end, len, bad);
 		pi = __lookup_pg_pool(&map->pg_pools, pool);
 		if (pi) {
+			char *name = kstrndup(*p, len, GFP_NOFS);
+
+			if (!name)
+				return -ENOMEM;
 			kfree(pi->name);
-			pi->name = kmalloc(len + 1, GFP_NOFS);
-			if (pi->name) {
-				memcpy(pi->name, *p, len);
-				pi->name[len] = '\0';
-				dout("  name is %s\n", pi->name);
-			}
+			pi->name = name;
+			dout("  name is %s\n", pi->name);
 		}
 		*p += len;
 	}

From e91a9b639a691e0982088b5954eaafb5a25c8f1c Mon Sep 17 00:00:00 2001
From: Xi Wang <xi.wang@gmail.com>
Date: Wed, 6 Jun 2012 19:35:55 -0500
Subject: [PATCH 026/132] libceph: fix overflow in osdmap_decode()

On 32-bit systems, a large `n' would overflow `n * sizeof(u32)' and bypass
the check ceph_decode_need(p, end, n * sizeof(u32), bad).  It would also
overflow the subsequent kmalloc() size, leading to out-of-bounds write.

Signed-off-by: Xi Wang <xi.wang@gmail.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/osdmap.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index df47871b8389..d70aaca4a45d 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -667,6 +667,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
 		ceph_decode_copy(p, &pgid, sizeof(pgid));
 		n = ceph_decode_32(p);
+		err = -EINVAL;
+		if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
+			goto bad;
 		ceph_decode_need(p, end, n * sizeof(u32), bad);
 		err = -ENOMEM;
 		pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);

From a5506049500b30dbc5edb4d07a3577477c1f3643 Mon Sep 17 00:00:00 2001
From: Xi Wang <xi.wang@gmail.com>
Date: Wed, 6 Jun 2012 19:35:55 -0500
Subject: [PATCH 027/132] libceph: fix overflow in osdmap_apply_incremental()

On 32-bit systems, a large `pglen' would overflow `pglen*sizeof(u32)'
and bypass the check ceph_decode_need(p, end, pglen*sizeof(u32), bad).
It would also overflow the subsequent kmalloc() size, leading to
out-of-bounds write.

Signed-off-by: Xi Wang <xi.wang@gmail.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/osdmap.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index d70aaca4a45d..d3de09f519b2 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -893,6 +893,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 			(void) __remove_pg_mapping(&map->pg_temp, pgid);
 
 			/* insert */
+			if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) {
+				err = -EINVAL;
+				goto bad;
+			}
 			pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
 			if (!pg) {
 				err = -ENOMEM;

From 89a86be0ce20022f6ede8bccec078dbb3d63caaa Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sat, 9 Jun 2012 14:19:21 -0700
Subject: [PATCH 028/132] libceph: transition socket state prior to actual
 connect

Once we call ->connect(), we are racing against the actual
connection, and a subsequent transition from CONNECTING ->
CONNECTED.  Set the state to CONNECTING before that, under the
protection of the mutex, to avoid the race.

This was introduced in 928443cd9644e7cfd46f687dbeffda2d1a357ff9,
with the original socket state code.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/messenger.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 769a2c9fe1af..bdbecac2d69d 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -321,6 +321,7 @@ static int ceph_tcp_connect(struct ceph_connection *con)
 
 	dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
 
+	con_sock_state_connecting(con);
 	ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
 				 O_NONBLOCK);
 	if (ret == -EINPROGRESS) {
@@ -336,8 +337,6 @@ static int ceph_tcp_connect(struct ceph_connection *con)
 		return ret;
 	}
 	con->sock = sock;
-	con_sock_state_connecting(con);
-
 	return 0;
 }
 

From f3dea7edd3d449fe7a6d402c1ce56a294b985261 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sun, 10 Jun 2012 20:43:56 -0700
Subject: [PATCH 029/132] libceph: flush msgr queue during mon_client shutdown

We need to flush the msgr workqueue during mon_client shutdown to
ensure that any work affecting our embedded ceph_connection is
finished so that we can be safely destroyed.

Previously, we were flushing the work queue after osd_client
shutdown and before mon_client shutdown to ensure that any osd
connection refs to authorizers are flushed.  Remove the redundant
flush, and document in the comment that the mon_client flush is
needed to cover that case as well.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/ceph_common.c | 7 -------
 net/ceph/mon_client.c  | 8 ++++++++
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 2de3ea1bbd64..c815f31a1a3f 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -498,13 +498,6 @@ void ceph_destroy_client(struct ceph_client *client)
 	/* unmount */
 	ceph_osdc_stop(&client->osdc);
 
-	/*
-	 * make sure osd connections close out before destroying the
-	 * auth module, which is needed to free those connections'
-	 * ceph_authorizers.
-	 */
-	ceph_msgr_flush();
-
 	ceph_monc_stop(&client->monc);
 
 	ceph_debugfs_client_cleanup(client);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 7a16750d62a6..dc16595d6885 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -838,6 +838,14 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 
 	mutex_unlock(&monc->mutex);
 
+	/*
+	 * flush msgr queue before we destroy ourselves to ensure that:
+	 *  - any work that references our embedded con is finished.
+	 *  - any osd_client or other work that may reference an authorizer
+	 *    finishes before we shut down the auth subsystem.
+	 */
+	ceph_msgr_flush();
+
 	ceph_auth_destroy(monc->auth);
 
 	ceph_msg_put(monc->m_auth);

From 26ce171915f348abd1f41da1ed139d93750d987f Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 19 Jun 2012 08:52:33 -0500
Subject: [PATCH 030/132] libceph: fix NULL dereference in reset_connection()

We dereference "con->in_msg" on the line after it was set to NULL.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/messenger.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 5e9f61d6d234..23073cff6481 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -440,7 +440,7 @@ static void reset_connection(struct ceph_connection *con)
 		con->in_msg->con = NULL;
 		ceph_msg_put(con->in_msg);
 		con->in_msg = NULL;
-		ceph_con_put(con->in_msg->con);
+		ceph_con_put(con);
 	}
 
 	con->connect_seq = 0;

From 36eb71aa57e6a33d61fd90a2fd87f00c6844bc86 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 21 Jun 2012 12:47:08 -0700
Subject: [PATCH 031/132] libceph: use con get/put methods

The ceph_con_get/put() helpers manipulate the embedded con ref
count, which isn't used now that ceph_connections are embedded in
other structures.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/messenger.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 23073cff6481..fc0cee7c9aa2 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -414,7 +414,7 @@ static void ceph_msg_remove(struct ceph_msg *msg)
 {
 	list_del_init(&msg->list_head);
 	BUG_ON(msg->con == NULL);
-	ceph_con_put(msg->con);
+	msg->con->ops->put(msg->con);
 	msg->con = NULL;
 
 	ceph_msg_put(msg);
@@ -440,7 +440,7 @@ static void reset_connection(struct ceph_connection *con)
 		con->in_msg->con = NULL;
 		ceph_msg_put(con->in_msg);
 		con->in_msg = NULL;
-		ceph_con_put(con);
+		con->ops->put(con);
 	}
 
 	con->connect_seq = 0;
@@ -1919,7 +1919,7 @@ static void process_message(struct ceph_connection *con)
 	con->in_msg->con = NULL;
 	msg = con->in_msg;
 	con->in_msg = NULL;
-	ceph_con_put(con);
+	con->ops->put(con);
 
 	/* if first message, set peer_name */
 	if (con->peer_name.type == 0)
@@ -2281,7 +2281,7 @@ static void ceph_fault(struct ceph_connection *con)
 		con->in_msg->con = NULL;
 		ceph_msg_put(con->in_msg);
 		con->in_msg = NULL;
-		ceph_con_put(con);
+		con->ops->put(con);
 	}
 
 	/* Requeue anything that hasn't been acked */
@@ -2400,7 +2400,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 	mutex_lock(&con->mutex);
 
 	BUG_ON(msg->con != NULL);
-	msg->con = ceph_con_get(con);
+	msg->con = con->ops->get(con);
 	BUG_ON(msg->con == NULL);
 
 	BUG_ON(!list_empty(&msg->list_head));
@@ -2436,7 +2436,7 @@ void ceph_msg_revoke(struct ceph_msg *msg)
 		dout("%s %p msg %p - was on queue\n", __func__, con, msg);
 		list_del_init(&msg->list_head);
 		BUG_ON(msg->con == NULL);
-		ceph_con_put(msg->con);
+		msg->con->ops->put(msg->con);
 		msg->con = NULL;
 		msg->hdr.seq = 0;
 
@@ -2646,7 +2646,7 @@ static bool ceph_con_in_msg_alloc(struct ceph_connection *con,
 		con->in_msg = con->ops->alloc_msg(con, hdr, &skip);
 		mutex_lock(&con->mutex);
 		if (con->in_msg) {
-			con->in_msg->con = ceph_con_get(con);
+			con->in_msg->con = con->ops->get(con);
 			BUG_ON(con->in_msg->con == NULL);
 		}
 		if (skip)
@@ -2662,7 +2662,7 @@ static bool ceph_con_in_msg_alloc(struct ceph_connection *con,
 			       type, front_len);
 			return false;
 		}
-		con->in_msg->con = ceph_con_get(con);
+		con->in_msg->con = con->ops->get(con);
 		BUG_ON(con->in_msg->con == NULL);
 		con->in_msg->page_alignment = le16_to_cpu(hdr->data_off);
 	}

From d59315ca8c0de00df9b363f94a2641a30961ca1c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 21 Jun 2012 12:49:23 -0700
Subject: [PATCH 032/132] libceph: drop ceph_con_get/put helpers and nref
 member

These are no longer used.  Every ceph_connection instance is embedded in
another structure, and refcounts manipulated via the get/put ops.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/messenger.h |  1 -
 net/ceph/messenger.c           | 28 +---------------------------
 2 files changed, 1 insertion(+), 28 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index a334dbd1b324..cc6f9bdcf466 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -135,7 +135,6 @@ struct ceph_msg_pos {
  */
 struct ceph_connection {
 	void *private;
-	atomic_t nref;
 
 	const struct ceph_connection_operations *ops;
 
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index fc0cee7c9aa2..ab690e2e1206 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -500,30 +500,6 @@ bool ceph_con_opened(struct ceph_connection *con)
 	return con->connect_seq > 0;
 }
 
-/*
- * generic get/put
- */
-struct ceph_connection *ceph_con_get(struct ceph_connection *con)
-{
-	int nref = __atomic_add_unless(&con->nref, 1, 0);
-
-	dout("con_get %p nref = %d -> %d\n", con, nref, nref + 1);
-
-	return nref ? con : NULL;
-}
-
-void ceph_con_put(struct ceph_connection *con)
-{
-	int nref = atomic_dec_return(&con->nref);
-
-	BUG_ON(nref < 0);
-	if (nref == 0) {
-		BUG_ON(con->sock);
-		kfree(con);
-	}
-	dout("con_put %p nref = %d -> %d\n", con, nref + 1, nref);
-}
-
 /*
  * initialize a new connection.
  */
@@ -535,7 +511,6 @@ void ceph_con_init(struct ceph_connection *con, void *private,
 	memset(con, 0, sizeof(*con));
 	con->private = private;
 	con->ops = ops;
-	atomic_set(&con->nref, 1);
 	con->msgr = msgr;
 
 	con_sock_state_init(con);
@@ -1951,8 +1926,7 @@ static int try_write(struct ceph_connection *con)
 {
 	int ret = 1;
 
-	dout("try_write start %p state %lu nref %d\n", con, con->state,
-	     atomic_read(&con->nref));
+	dout("try_write start %p state %lu\n", con, con->state);
 
 more:
 	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);

From 739c905baa018c99003564ebc367d93aa44d4861 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 11 Jun 2012 14:57:13 -0500
Subject: [PATCH 033/132] libceph: encapsulate out message data setup

Move the code that prepares to write the data portion of a message
into its own function.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index ab690e2e1206..56448660883d 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -565,6 +565,24 @@ static void con_out_kvec_add(struct ceph_connection *con,
 	con->out_kvec_bytes += size;
 }
 
+static void prepare_write_message_data(struct ceph_connection *con)
+{
+	struct ceph_msg *msg = con->out_msg;
+
+	BUG_ON(!msg);
+	BUG_ON(!msg->hdr.data_len);
+
+	/* initialize page iterator */
+	con->out_msg_pos.page = 0;
+	if (msg->pages)
+		con->out_msg_pos.page_pos = msg->page_alignment;
+	else
+		con->out_msg_pos.page_pos = 0;
+	con->out_msg_pos.data_pos = 0;
+	con->out_msg_pos.did_page_crc = false;
+	con->out_more = 1;  /* data + footer will follow */
+}
+
 /*
  * Prepare footer for currently outgoing message, and finish things
  * off.  Assumes out_kvec* are already valid.. we just add on to the end.
@@ -657,26 +675,17 @@ static void prepare_write_message(struct ceph_connection *con)
 		con->out_msg->footer.middle_crc = cpu_to_le32(crc);
 	} else
 		con->out_msg->footer.middle_crc = 0;
-	con->out_msg->footer.data_crc = 0;
-	dout("prepare_write_message front_crc %u data_crc %u\n",
+	dout("%s front_crc %u middle_crc %u\n", __func__,
 	     le32_to_cpu(con->out_msg->footer.front_crc),
 	     le32_to_cpu(con->out_msg->footer.middle_crc));
 
 	/* is there a data payload? */
-	if (le32_to_cpu(m->hdr.data_len) > 0) {
-		/* initialize page iterator */
-		con->out_msg_pos.page = 0;
-		if (m->pages)
-			con->out_msg_pos.page_pos = m->page_alignment;
-		else
-			con->out_msg_pos.page_pos = 0;
-		con->out_msg_pos.data_pos = 0;
-		con->out_msg_pos.did_page_crc = false;
-		con->out_more = 1;  /* data + footer will follow */
-	} else {
+	con->out_msg->footer.data_crc = 0;
+	if (m->hdr.data_len)
+		prepare_write_message_data(con);
+	else
 		/* no, queue up footer too and be done */
 		prepare_write_message_footer(con);
-	}
 
 	set_bit(WRITE_PENDING, &con->flags);
 }

From 84ca8fc87fcf4ab97bb8acdb59bf97bb4820cb14 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 11 Jun 2012 14:57:13 -0500
Subject: [PATCH 034/132] libceph: encapsulate advancing msg page

In write_partial_msg_pages(), once all the data from a page has been
sent we advance to the next one.  Put the code that takes care of
this into its own function.

While modifying write_partial_msg_pages(), make its local variable
"in_trail" be Boolean, and use the local variable "msg" (which is
just the connection's current out_msg pointer) consistently.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 58 ++++++++++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 24 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 56448660883d..1b92e3b16c0d 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -891,6 +891,33 @@ static void iter_bio_next(struct bio **bio_iter, int *seg)
 }
 #endif
 
+static void out_msg_pos_next(struct ceph_connection *con, struct page *page,
+			size_t len, size_t sent, bool in_trail)
+{
+	struct ceph_msg *msg = con->out_msg;
+
+	BUG_ON(!msg);
+	BUG_ON(!sent);
+
+	con->out_msg_pos.data_pos += sent;
+	con->out_msg_pos.page_pos += sent;
+	if (sent == len) {
+		con->out_msg_pos.page_pos = 0;
+		con->out_msg_pos.page++;
+		con->out_msg_pos.did_page_crc = false;
+		if (in_trail)
+			list_move_tail(&page->lru,
+				       &msg->trail->head);
+		else if (msg->pagelist)
+			list_move_tail(&page->lru,
+				       &msg->pagelist->head);
+#ifdef CONFIG_BLOCK
+		else if (msg->bio)
+			iter_bio_next(&msg->bio_iter, &msg->bio_seg);
+#endif
+	}
+}
+
 /*
  * Write as much message data payload as we can.  If we finish, queue
  * up the footer.
@@ -906,11 +933,11 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 	bool do_datacrc = !con->msgr->nocrc;
 	int ret;
 	int total_max_write;
-	int in_trail = 0;
+	bool in_trail = false;
 	size_t trail_len = (msg->trail ? msg->trail->length : 0);
 
 	dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
-	     con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
+	     con, msg, con->out_msg_pos.page, msg->nr_pages,
 	     con->out_msg_pos.page_pos);
 
 #ifdef CONFIG_BLOCK
@@ -934,13 +961,12 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 
 		/* have we reached the trail part of the data? */
 		if (con->out_msg_pos.data_pos >= data_len - trail_len) {
-			in_trail = 1;
+			in_trail = true;
 
 			total_max_write = data_len - con->out_msg_pos.data_pos;
 
 			page = list_first_entry(&msg->trail->head,
 						struct page, lru);
-			max_write = PAGE_SIZE;
 		} else if (msg->pages) {
 			page = msg->pages[con->out_msg_pos.page];
 		} else if (msg->pagelist) {
@@ -964,14 +990,14 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 		if (do_datacrc && !con->out_msg_pos.did_page_crc) {
 			void *base;
 			u32 crc;
-			u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
+			u32 tmpcrc = le32_to_cpu(msg->footer.data_crc);
 			char *kaddr;
 
 			kaddr = kmap(page);
 			BUG_ON(kaddr == NULL);
 			base = kaddr + con->out_msg_pos.page_pos + bio_offset;
 			crc = crc32c(tmpcrc, base, len);
-			con->out_msg->footer.data_crc = cpu_to_le32(crc);
+			msg->footer.data_crc = cpu_to_le32(crc);
 			con->out_msg_pos.did_page_crc = true;
 		}
 		ret = ceph_tcp_sendpage(con->sock, page,
@@ -984,30 +1010,14 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 		if (ret <= 0)
 			goto out;
 
-		con->out_msg_pos.data_pos += ret;
-		con->out_msg_pos.page_pos += ret;
-		if (ret == len) {
-			con->out_msg_pos.page_pos = 0;
-			con->out_msg_pos.page++;
-			con->out_msg_pos.did_page_crc = false;
-			if (in_trail)
-				list_move_tail(&page->lru,
-					       &msg->trail->head);
-			else if (msg->pagelist)
-				list_move_tail(&page->lru,
-					       &msg->pagelist->head);
-#ifdef CONFIG_BLOCK
-			else if (msg->bio)
-				iter_bio_next(&msg->bio_iter, &msg->bio_seg);
-#endif
-		}
+		out_msg_pos_next(con, page, len, (size_t) ret, in_trail);
 	}
 
 	dout("write_partial_msg_pages %p msg %p done\n", con, msg);
 
 	/* prepare and queue up footer, too */
 	if (!do_datacrc)
-		con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+		msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
 	con_out_kvec_reset(con);
 	prepare_write_message_footer(con);
 	ret = 1;

From fd154f3c75465abd83b7a395033e3755908a1e6e Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 11 Jun 2012 14:57:13 -0500
Subject: [PATCH 035/132] libceph: don't mark footer complete before it is

This is a nit, but prepare_write_message() sets the FOOTER_COMPLETE
flag before the CRC for the data portion (recorded in the footer)
has been completely computed.  Hold off setting the complete flag
until we've decided it's ready to send.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 1b92e3b16c0d..5354d59ba8b9 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -592,6 +592,8 @@ static void prepare_write_message_footer(struct ceph_connection *con)
 	struct ceph_msg *m = con->out_msg;
 	int v = con->out_kvec_left;
 
+	m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
+
 	dout("prepare_write_message_footer %p\n", con);
 	con->out_kvec_is_msg = true;
 	con->out_kvec[v].iov_base = &m->footer;
@@ -665,7 +667,7 @@ static void prepare_write_message(struct ceph_connection *con)
 	/* fill in crc (except data pages), footer */
 	crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
 	con->out_msg->hdr.crc = cpu_to_le32(crc);
-	con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
+	con->out_msg->footer.flags = 0;
 
 	crc = crc32c(0, m->front.iov_base, m->front.iov_len);
 	con->out_msg->footer.front_crc = cpu_to_le32(crc);

From df6ad1f97342ebc4270128222e896541405eecdb Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 11 Jun 2012 14:57:13 -0500
Subject: [PATCH 036/132] libceph: move init_bio_*() functions up

Move init_bio_iter() and iter_bio_next() up in their source file so
the'll be defined before they're needed.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 50 ++++++++++++++++++++++----------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 5354d59ba8b9..7b5ff4545bf3 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -565,6 +565,31 @@ static void con_out_kvec_add(struct ceph_connection *con,
 	con->out_kvec_bytes += size;
 }
 
+#ifdef CONFIG_BLOCK
+static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
+{
+	if (!bio) {
+		*iter = NULL;
+		*seg = 0;
+		return;
+	}
+	*iter = bio;
+	*seg = bio->bi_idx;
+}
+
+static void iter_bio_next(struct bio **bio_iter, int *seg)
+{
+	if (*bio_iter == NULL)
+		return;
+
+	BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
+
+	(*seg)++;
+	if (*seg == (*bio_iter)->bi_vcnt)
+		init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
+}
+#endif
+
 static void prepare_write_message_data(struct ceph_connection *con)
 {
 	struct ceph_msg *msg = con->out_msg;
@@ -868,31 +893,6 @@ out:
 	return ret;  /* done! */
 }
 
-#ifdef CONFIG_BLOCK
-static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
-{
-	if (!bio) {
-		*iter = NULL;
-		*seg = 0;
-		return;
-	}
-	*iter = bio;
-	*seg = bio->bi_idx;
-}
-
-static void iter_bio_next(struct bio **bio_iter, int *seg)
-{
-	if (*bio_iter == NULL)
-		return;
-
-	BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
-
-	(*seg)++;
-	if (*seg == (*bio_iter)->bi_vcnt)
-		init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
-}
-#endif
-
 static void out_msg_pos_next(struct ceph_connection *con, struct page *page,
 			size_t len, size_t sent, bool in_trail)
 {

From 572c588edadaa3da3992bd8a0fed830bbcc861f8 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 11 Jun 2012 14:57:13 -0500
Subject: [PATCH 037/132] libceph: move init of bio_iter

If a message has a non-null bio pointer, its bio_iter field is
initialized in write_partial_msg_pages() if this has not been done
already.  This is really a one-time setup operation for sending a
message's (bio) data, so move that initialization code into
prepare_write_message_data() which serves that purpose.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 7b5ff4545bf3..fedad914b238 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -603,6 +603,10 @@ static void prepare_write_message_data(struct ceph_connection *con)
 		con->out_msg_pos.page_pos = msg->page_alignment;
 	else
 		con->out_msg_pos.page_pos = 0;
+#ifdef CONFIG_BLOCK
+	if (msg->bio && !msg->bio_iter)
+		init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
+#endif
 	con->out_msg_pos.data_pos = 0;
 	con->out_msg_pos.did_page_crc = false;
 	con->out_more = 1;  /* data + footer will follow */
@@ -942,11 +946,6 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 	     con, msg, con->out_msg_pos.page, msg->nr_pages,
 	     con->out_msg_pos.page_pos);
 
-#ifdef CONFIG_BLOCK
-	if (msg->bio && !msg->bio_iter)
-		init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
-#endif
-
 	while (data_len > con->out_msg_pos.data_pos) {
 		struct page *page = NULL;
 		int max_write = PAGE_SIZE;

From abdaa6a849af1d63153682c11f5bbb22dacb1f6b Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 11 Jun 2012 14:57:13 -0500
Subject: [PATCH 038/132] libceph: don't use bio_iter as a flag

Recently a bug was fixed in which the bio_iter field in a ceph
message was not being properly re-initialized when a message got
re-transmitted:
    commit 43643528cce60ca184fe8197efa8e8da7c89a037
    Author: Yan, Zheng <zheng.z.yan@intel.com>
    rbd: Clear ceph_msg->bio_iter for retransmitted message

We are now only initializing the bio_iter field when we are about to
start to write message data (in prepare_write_message_data()),
rather than every time we are attempting to write any portion of the
message data (in write_partial_msg_pages()).  This means we no
longer need to use the msg->bio_iter field as a flag.

So just don't do that any more.  Trust prepare_write_message_data()
to ensure msg->bio_iter is properly initialized, every time we are
about to begin writing (or re-writing) a message's bio data.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index fedad914b238..3a4330371d88 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -604,7 +604,7 @@ static void prepare_write_message_data(struct ceph_connection *con)
 	else
 		con->out_msg_pos.page_pos = 0;
 #ifdef CONFIG_BLOCK
-	if (msg->bio && !msg->bio_iter)
+	if (msg->bio)
 		init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
 #endif
 	con->out_msg_pos.data_pos = 0;
@@ -672,10 +672,6 @@ static void prepare_write_message(struct ceph_connection *con)
 		m->hdr.seq = cpu_to_le64(++con->out_seq);
 		m->needs_out_seq = false;
 	}
-#ifdef CONFIG_BLOCK
-	else
-		m->bio_iter = NULL;
-#endif
 
 	dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
 	     m, con->out_seq, le16_to_cpu(m->hdr.type),

From a8d00e3cdef4c1c4f194414b72b24cd995439a05 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 20 Jun 2012 21:53:53 -0500
Subject: [PATCH 039/132] libceph: SOCK_CLOSED is a flag, not a state

The following commit changed it so SOCK_CLOSED bit was stored in
a connection's new "flags" field rather than its "state" field.

    libceph: start separating connection flags from state
    commit 928443cd

That bit is used in con_close_socket() to protect against setting an
error message more than once in the socket event handler function.

Unfortunately, the field being operated on in that function was not
updated to be "flags" as it should have been.  This fixes that
error.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 3a4330371d88..39653944f21b 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -397,11 +397,11 @@ static int con_close_socket(struct ceph_connection *con)
 	dout("con_close_socket on %p sock %p\n", con, con->sock);
 	if (!con->sock)
 		return 0;
-	set_bit(SOCK_CLOSED, &con->state);
+	set_bit(SOCK_CLOSED, &con->flags);
 	rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
 	sock_release(con->sock);
 	con->sock = NULL;
-	clear_bit(SOCK_CLOSED, &con->state);
+	clear_bit(SOCK_CLOSED, &con->flags);
 	con_sock_state_closed(con);
 	return rc;
 }

From 188048bce311ee41e5178bc3255415d0eae28423 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 20 Jun 2012 21:53:53 -0500
Subject: [PATCH 040/132] libceph: don't change socket state on sock event

Currently the socket state change event handler records an error
message on a connection to distinguish a close while connecting from
a close while a connection was already established.

Changing connection information during handling of a socket event is
not very clean, so instead move this assignment inside con_work(),
where it can be done during normal connection-level processing (and
under protection of the connection mutex as well).

Move the handling of a socket closed event up to the top of the
processing loop in con_work(); there's no point in handling backoff
etc. if we have a newly-closed socket to take care of.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 39653944f21b..56381b973d02 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -261,13 +261,8 @@ static void ceph_sock_state_change(struct sock *sk)
 	case TCP_CLOSE_WAIT:
 		dout("%s TCP_CLOSE_WAIT\n", __func__);
 		con_sock_state_closing(con);
-		if (test_and_set_bit(SOCK_CLOSED, &con->flags) == 0) {
-			if (test_bit(CONNECTING, &con->state))
-				con->error_msg = "connection failed";
-			else
-				con->error_msg = "socket closed";
+		if (!test_and_set_bit(SOCK_CLOSED, &con->flags))
 			queue_con(con);
-		}
 		break;
 	case TCP_ESTABLISHED:
 		dout("%s TCP_ESTABLISHED\n", __func__);
@@ -2187,6 +2182,14 @@ static void con_work(struct work_struct *work)
 
 	mutex_lock(&con->mutex);
 restart:
+	if (test_and_clear_bit(SOCK_CLOSED, &con->flags)) {
+		if (test_bit(CONNECTING, &con->state))
+			con->error_msg = "connection failed";
+		else
+			con->error_msg = "socket closed";
+		goto fault;
+	}
+
 	if (test_and_clear_bit(BACKOFF, &con->flags)) {
 		dout("con_work %p backing off\n", con);
 		if (queue_delayed_work(ceph_msgr_wq, &con->work,
@@ -2216,9 +2219,6 @@ restart:
 		con_close_socket(con);
 	}
 
-	if (test_and_clear_bit(SOCK_CLOSED, &con->flags))
-		goto fault;
-
 	ret = try_read(con);
 	if (ret == -EAGAIN)
 		goto restart;

From d65c9e0b9eb43d14ece9dd843506ccba06162ee7 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 20 Jun 2012 21:53:53 -0500
Subject: [PATCH 041/132] libceph: just set SOCK_CLOSED when state changes

When a TCP_CLOSE or TCP_CLOSE_WAIT event occurs, the SOCK_CLOSED
connection flag bit is set, and if it had not been previously set
queue_con() is called to ensure con_work() will get a chance to
handle the changed state.

con_work() atomically checks--and if set, clears--the SOCK_CLOSED
bit if it was set.  This means that even if the bit were set
repeatedly, the related processing in con_work() only gets called
once per transition of the bit from 0 to 1.

What's important then is that we ensure con_work() gets called *at
least* once when a socket close event occurs, not that it gets
called *exactly* once.

The work queue mechanism already takes care of queueing work
only if it is not already queued, so there's no need for us
to call queue_con() conditionally.

So this patch just makes it so the SOCK_CLOSED flag gets set
unconditionally in ceph_sock_state_change().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 56381b973d02..cebef8560586 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -261,8 +261,8 @@ static void ceph_sock_state_change(struct sock *sk)
 	case TCP_CLOSE_WAIT:
 		dout("%s TCP_CLOSE_WAIT\n", __func__);
 		con_sock_state_closing(con);
-		if (!test_and_set_bit(SOCK_CLOSED, &con->flags))
-			queue_con(con);
+		set_bit(SOCK_CLOSED, &con->flags);
+		queue_con(con);
 		break;
 	case TCP_ESTABLISHED:
 		dout("%s TCP_ESTABLISHED\n", __func__);

From 456ea46865787283088b23a8a7f69244513b95f0 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 20 Jun 2012 21:53:53 -0500
Subject: [PATCH 042/132] libceph: don't touch con state in con_close_socket()

In con_close_socket(), a connection's SOCK_CLOSED flag gets set and
then cleared while its shutdown method is called and its reference
gets dropped.

Previously, that flag got set only if it had not already been set,
so setting it in con_close_socket() might have prevented additional
processing being done on a socket being shut down.  We no longer set
SOCK_CLOSED in the socket event routine conditionally, so setting
that bit here no longer provides whatever benefit it might have
provided before.

A race condition could still leave the SOCK_CLOSED bit set even
after we've issued the call to con_close_socket(), so we still clear
that bit after shutting the socket down.  Add a comment explaining
the reason for this.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index cebef8560586..cfcca1f5be67 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -392,10 +392,16 @@ static int con_close_socket(struct ceph_connection *con)
 	dout("con_close_socket on %p sock %p\n", con, con->sock);
 	if (!con->sock)
 		return 0;
-	set_bit(SOCK_CLOSED, &con->flags);
 	rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
 	sock_release(con->sock);
 	con->sock = NULL;
+
+	/*
+	 * Forcibly clear the SOCK_CLOSE flag.  It gets set
+	 * independent of the connection mutex, and we could have
+	 * received a socket close event before we had the chance to
+	 * shut the socket down.
+	 */
 	clear_bit(SOCK_CLOSED, &con->flags);
 	con_sock_state_closed(con);
 	return rc;

From bb9e6bba5d8b85b631390f8dbe8a24ae1ff5b48a Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 20 Jun 2012 21:53:53 -0500
Subject: [PATCH 043/132] libceph: clear CONNECTING in ceph_con_close()

A connection that is closed will no longer be connecting.  So
clear the CONNECTING state bit in ceph_con_close().  Similarly,
if the socket has been closed we no longer are in connecting
state (a new connect sequence will need to be initiated).

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index cfcca1f5be67..beee382d784e 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -462,6 +462,7 @@ void ceph_con_close(struct ceph_connection *con)
 	dout("con_close %p peer %s\n", con,
 	     ceph_pr_addr(&con->peer_addr.in_addr));
 	clear_bit(NEGOTIATING, &con->state);
+	clear_bit(CONNECTING, &con->state);
 	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
 	set_bit(CLOSED, &con->state);
 
@@ -2189,7 +2190,7 @@ static void con_work(struct work_struct *work)
 	mutex_lock(&con->mutex);
 restart:
 	if (test_and_clear_bit(SOCK_CLOSED, &con->flags)) {
-		if (test_bit(CONNECTING, &con->state))
+		if (test_and_clear_bit(CONNECTING, &con->state))
 			con->error_msg = "connection failed";
 		else
 			con->error_msg = "socket closed";

From 3ec50d1868a9e0493046400bb1fdd054c7f64ebd Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 23 May 2012 14:35:23 -0500
Subject: [PATCH 044/132] libceph: clear NEGOTIATING when done

A connection state's NEGOTIATING bit gets set while in CONNECTING
state after we have successfully exchanged a ceph banner and IP
addresses with the connection's peer (the server).  But that bit
is not cleared again--at least not until another connection attempt
is initiated.

Instead, clear it as soon as the connection is fully established.
Also, clear it when a socket connection gets prematurely closed
in the midst of establishing a ceph connection (in case we had
reached the point where it was set).

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index beee382d784e..500207bad5d6 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1562,6 +1562,7 @@ static int process_connect(struct ceph_connection *con)
 			fail_protocol(con);
 			return -1;
 		}
+		clear_bit(NEGOTIATING, &con->state);
 		clear_bit(CONNECTING, &con->state);
 		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
 		con->connect_seq++;
@@ -1951,7 +1952,6 @@ more:
 
 	/* open the socket first? */
 	if (con->sock == NULL) {
-		clear_bit(NEGOTIATING, &con->state);
 		set_bit(CONNECTING, &con->state);
 
 		con_out_kvec_reset(con);
@@ -2190,10 +2190,12 @@ static void con_work(struct work_struct *work)
 	mutex_lock(&con->mutex);
 restart:
 	if (test_and_clear_bit(SOCK_CLOSED, &con->flags)) {
-		if (test_and_clear_bit(CONNECTING, &con->state))
+		if (test_and_clear_bit(CONNECTING, &con->state)) {
+			clear_bit(NEGOTIATING, &con->state);
 			con->error_msg = "connection failed";
-		else
+		} else {
 			con->error_msg = "socket closed";
+		}
 		goto fault;
 	}
 

From e27947c767f5bed15048f4e4dad3e2eb69133697 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 23 May 2012 14:35:23 -0500
Subject: [PATCH 045/132] libceph: define and use an explicit CONNECTED state

There is no state explicitly defined when a ceph connection is fully
operational.  So define one.

It's set when the connection sequence completes successfully, and is
cleared when the connection gets closed.

Be a little more careful when examining the old state when a socket
disconnect event is reported.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/messenger.h | 1 +
 net/ceph/messenger.c           | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index cc6f9bdcf466..002d504df3b7 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -120,6 +120,7 @@ struct ceph_msg_pos {
  */
 #define CONNECTING	1
 #define NEGOTIATING	2
+#define CONNECTED	5
 #define STANDBY		8  /* no outgoing messages, socket closed.  we keep
 			    * the ceph_connection around to maintain shared
 			    * state with the peer. */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 500207bad5d6..83bcf977e9b9 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -463,6 +463,7 @@ void ceph_con_close(struct ceph_connection *con)
 	     ceph_pr_addr(&con->peer_addr.in_addr));
 	clear_bit(NEGOTIATING, &con->state);
 	clear_bit(CONNECTING, &con->state);
+	clear_bit(CONNECTED, &con->state);
 	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
 	set_bit(CLOSED, &con->state);
 
@@ -1564,6 +1565,7 @@ static int process_connect(struct ceph_connection *con)
 		}
 		clear_bit(NEGOTIATING, &con->state);
 		clear_bit(CONNECTING, &con->state);
+		set_bit(CONNECTED, &con->state);
 		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
 		con->connect_seq++;
 		con->peer_features = server_feat;
@@ -2114,6 +2116,7 @@ more:
 			prepare_read_ack(con);
 			break;
 		case CEPH_MSGR_TAG_CLOSE:
+			clear_bit(CONNECTED, &con->state);
 			set_bit(CLOSED, &con->state);   /* fixme */
 			goto out;
 		default:
@@ -2190,11 +2193,13 @@ static void con_work(struct work_struct *work)
 	mutex_lock(&con->mutex);
 restart:
 	if (test_and_clear_bit(SOCK_CLOSED, &con->flags)) {
-		if (test_and_clear_bit(CONNECTING, &con->state)) {
+		if (test_and_clear_bit(CONNECTED, &con->state))
+			con->error_msg = "socket closed";
+		else if (test_and_clear_bit(CONNECTING, &con->state)) {
 			clear_bit(NEGOTIATING, &con->state);
 			con->error_msg = "connection failed";
 		} else {
-			con->error_msg = "socket closed";
+			con->error_msg = "unrecognized con state";
 		}
 		goto fault;
 	}

From ab166d5aa3bc036fba7efaca6e4e43a7e9510acf Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 31 May 2012 11:37:29 -0500
Subject: [PATCH 046/132] libceph: separate banner and connect writes

There are two phases in the process of linking together the two ends
of a ceph connection.  The first involves exchanging a banner and
IP addresses, and if that is successful a second phase exchanges
some detail about each side's connection capabilities.

When initiating a connection, the client side now queues to send
its information for both phases of this process at the same time.
This is probably a bit more efficient, but it is slightly messier
from a layering perspective in the code.

So rearrange things so that the client doesn't send the connection
information until it has received and processed the response in the
initial banner phase (in process_banner()).

Move the code (in the (con->sock == NULL) case in try_write()) that
prepares for writing the connection information, delaying doing that
until the banner exchange has completed.  Move the code that begins
the transition to this second "NEGOTIATING" phase out of
process_banner() and into its caller, so preparing to write the
connection information and preparing to read the response are
adjacent to each other.

Finally, preparing to write the connection information now requires
the output kvec to be reset in all cases, so move that into the
prepare_write_connect() and delete it from all callers.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 83bcf977e9b9..5e67be3fa296 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -841,6 +841,7 @@ static int prepare_write_connect(struct ceph_connection *con)
 	con->out_connect.authorizer_len = auth ?
 		cpu_to_le32(auth->authorizer_buf_len) : 0;
 
+	con_out_kvec_reset(con);
 	con_out_kvec_add(con, sizeof (con->out_connect),
 					&con->out_connect);
 	if (auth && auth->authorizer_buf_len)
@@ -1430,8 +1431,6 @@ static int process_banner(struct ceph_connection *con)
 		     ceph_pr_addr(&con->msgr->inst.addr.in_addr));
 	}
 
-	set_bit(NEGOTIATING, &con->state);
-	prepare_read_connect(con);
 	return 0;
 }
 
@@ -1481,7 +1480,6 @@ static int process_connect(struct ceph_connection *con)
 			return -1;
 		}
 		con->auth_retry = 1;
-		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
 			return ret;
@@ -1502,7 +1500,6 @@ static int process_connect(struct ceph_connection *con)
 		       ENTITY_NAME(con->peer_name),
 		       ceph_pr_addr(&con->peer_addr.in_addr));
 		reset_connection(con);
-		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
 			return ret;
@@ -1528,7 +1525,6 @@ static int process_connect(struct ceph_connection *con)
 		     le32_to_cpu(con->out_connect.connect_seq),
 		     le32_to_cpu(con->in_connect.connect_seq));
 		con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
-		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
 			return ret;
@@ -1545,7 +1541,6 @@ static int process_connect(struct ceph_connection *con)
 		     le32_to_cpu(con->in_connect.global_seq));
 		get_global_seq(con->msgr,
 			       le32_to_cpu(con->in_connect.global_seq));
-		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
 			return ret;
@@ -1958,9 +1953,6 @@ more:
 
 		con_out_kvec_reset(con);
 		prepare_write_banner(con);
-		ret = prepare_write_connect(con);
-		if (ret < 0)
-			goto out;
 		prepare_read_banner(con);
 
 		BUG_ON(con->in_msg);
@@ -2073,6 +2065,16 @@ more:
 			ret = process_banner(con);
 			if (ret < 0)
 				goto out;
+
+			/* Banner is good, exchange connection info */
+			ret = prepare_write_connect(con);
+			if (ret < 0)
+				goto out;
+			prepare_read_connect(con);
+			set_bit(NEGOTIATING, &con->state);
+
+			/* Send connection info before awaiting response */
+			goto out;
 		}
 		ret = read_partial_connect(con);
 		if (ret <= 0)

From 7593af920baac37752190a0db703d2732bed4a3b Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 24 May 2012 11:55:03 -0500
Subject: [PATCH 047/132] libceph: distinguish two phases of connect sequence

Currently a ceph connection enters a "CONNECTING" state when it
begins the process of (re-)connecting with its peer.  Once the two
ends have successfully exchanged their banner and addresses, an
additional NEGOTIATING bit is set in the ceph connection's state to
indicate the connection information exhange has begun.  The
CONNECTING bit/state continues to be set during this phase.

Rather than have the CONNECTING state continue while the NEGOTIATING
bit is set, interpret these two phases as distinct states.  In other
words, when NEGOTIATING is set, clear CONNECTING.  That way only
one of them will be active at a time.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 52 ++++++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 5e67be3fa296..32a3a2a72580 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1559,7 +1559,6 @@ static int process_connect(struct ceph_connection *con)
 			return -1;
 		}
 		clear_bit(NEGOTIATING, &con->state);
-		clear_bit(CONNECTING, &con->state);
 		set_bit(CONNECTED, &con->state);
 		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
 		con->connect_seq++;
@@ -2000,7 +1999,8 @@ more_kvec:
 	}
 
 do_next:
-	if (!test_bit(CONNECTING, &con->state)) {
+	if (!test_bit(CONNECTING, &con->state) &&
+			!test_bit(NEGOTIATING, &con->state)) {
 		/* is anything else pending? */
 		if (!list_empty(&con->out_queue)) {
 			prepare_write_message(con);
@@ -2057,25 +2057,29 @@ more:
 	}
 
 	if (test_bit(CONNECTING, &con->state)) {
-		if (!test_bit(NEGOTIATING, &con->state)) {
-			dout("try_read connecting\n");
-			ret = read_partial_banner(con);
-			if (ret <= 0)
-				goto out;
-			ret = process_banner(con);
-			if (ret < 0)
-				goto out;
-
-			/* Banner is good, exchange connection info */
-			ret = prepare_write_connect(con);
-			if (ret < 0)
-				goto out;
-			prepare_read_connect(con);
-			set_bit(NEGOTIATING, &con->state);
-
-			/* Send connection info before awaiting response */
+		dout("try_read connecting\n");
+		ret = read_partial_banner(con);
+		if (ret <= 0)
 			goto out;
-		}
+		ret = process_banner(con);
+		if (ret < 0)
+			goto out;
+
+		clear_bit(CONNECTING, &con->state);
+		set_bit(NEGOTIATING, &con->state);
+
+		/* Banner is good, exchange connection info */
+		ret = prepare_write_connect(con);
+		if (ret < 0)
+			goto out;
+		prepare_read_connect(con);
+
+		/* Send connection info before awaiting response */
+		goto out;
+	}
+
+	if (test_bit(NEGOTIATING, &con->state)) {
+		dout("try_read negotiating\n");
 		ret = read_partial_connect(con);
 		if (ret <= 0)
 			goto out;
@@ -2197,12 +2201,12 @@ restart:
 	if (test_and_clear_bit(SOCK_CLOSED, &con->flags)) {
 		if (test_and_clear_bit(CONNECTED, &con->state))
 			con->error_msg = "socket closed";
-		else if (test_and_clear_bit(CONNECTING, &con->state)) {
-			clear_bit(NEGOTIATING, &con->state);
+		else if (test_and_clear_bit(NEGOTIATING, &con->state))
+			con->error_msg = "negotiation failed";
+		else if (test_and_clear_bit(CONNECTING, &con->state))
 			con->error_msg = "connection failed";
-		} else {
+		else
 			con->error_msg = "unrecognized con state";
-		}
 		goto fault;
 	}
 

From 5821bd8ccdf5d17ab2c391c773756538603838c3 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 11 Jun 2012 14:57:13 -0500
Subject: [PATCH 048/132] libceph: small changes to messenger.c

This patch gathers a few small changes in "net/ceph/messenger.c":
  out_msg_pos_next()
    - small logic change that mostly affects indentation
  write_partial_msg_pages().
    - use a local variable trail_off to represent the offset into
      a message of the trail portion of the data (if present)
    - once we are in the trail portion we will always be there, so we
      don't always need to check against our data position
    - avoid computing len twice after we've reached the trail
    - get rid of the variable tmpcrc, which is not needed
    - trail_off and trail_len never change so mark them const
    - update some comments
  read_partial_message_bio()
    - bio_iovec_idx() will never return an error, so don't bother
      checking for it

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 63 ++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 32 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 32a3a2a72580..4578e99bbef1 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -907,21 +907,23 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page,
 
 	con->out_msg_pos.data_pos += sent;
 	con->out_msg_pos.page_pos += sent;
-	if (sent == len) {
-		con->out_msg_pos.page_pos = 0;
-		con->out_msg_pos.page++;
-		con->out_msg_pos.did_page_crc = false;
-		if (in_trail)
-			list_move_tail(&page->lru,
-				       &msg->trail->head);
-		else if (msg->pagelist)
-			list_move_tail(&page->lru,
-				       &msg->pagelist->head);
+	if (sent < len)
+		return;
+
+	BUG_ON(sent != len);
+	con->out_msg_pos.page_pos = 0;
+	con->out_msg_pos.page++;
+	con->out_msg_pos.did_page_crc = false;
+	if (in_trail)
+		list_move_tail(&page->lru,
+			       &msg->trail->head);
+	else if (msg->pagelist)
+		list_move_tail(&page->lru,
+			       &msg->pagelist->head);
 #ifdef CONFIG_BLOCK
-		else if (msg->bio)
-			iter_bio_next(&msg->bio_iter, &msg->bio_seg);
+	else if (msg->bio)
+		iter_bio_next(&msg->bio_iter, &msg->bio_seg);
 #endif
-	}
 }
 
 /*
@@ -940,30 +942,31 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 	int ret;
 	int total_max_write;
 	bool in_trail = false;
-	size_t trail_len = (msg->trail ? msg->trail->length : 0);
+	const size_t trail_len = (msg->trail ? msg->trail->length : 0);
+	const size_t trail_off = data_len - trail_len;
 
 	dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
 	     con, msg, con->out_msg_pos.page, msg->nr_pages,
 	     con->out_msg_pos.page_pos);
 
+	/*
+	 * Iterate through each page that contains data to be
+	 * written, and send as much as possible for each.
+	 *
+	 * If we are calculating the data crc (the default), we will
+	 * need to map the page.  If we have no pages, they have
+	 * been revoked, so use the zero page.
+	 */
 	while (data_len > con->out_msg_pos.data_pos) {
 		struct page *page = NULL;
 		int max_write = PAGE_SIZE;
 		int bio_offset = 0;
 
-		total_max_write = data_len - trail_len -
-			con->out_msg_pos.data_pos;
-
-		/*
-		 * if we are calculating the data crc (the default), we need
-		 * to map the page.  if our pages[] has been revoked, use the
-		 * zero page.
-		 */
-
-		/* have we reached the trail part of the data? */
-		if (con->out_msg_pos.data_pos >= data_len - trail_len) {
-			in_trail = true;
+		in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off;
+		if (!in_trail)
+			total_max_write = trail_off - con->out_msg_pos.data_pos;
 
+		if (in_trail) {
 			total_max_write = data_len - con->out_msg_pos.data_pos;
 
 			page = list_first_entry(&msg->trail->head,
@@ -990,14 +993,13 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 
 		if (do_datacrc && !con->out_msg_pos.did_page_crc) {
 			void *base;
-			u32 crc;
-			u32 tmpcrc = le32_to_cpu(msg->footer.data_crc);
+			u32 crc = le32_to_cpu(msg->footer.data_crc);
 			char *kaddr;
 
 			kaddr = kmap(page);
 			BUG_ON(kaddr == NULL);
 			base = kaddr + con->out_msg_pos.page_pos + bio_offset;
-			crc = crc32c(tmpcrc, base, len);
+			crc = crc32c(crc, base, len);
 			msg->footer.data_crc = cpu_to_le32(crc);
 			con->out_msg_pos.did_page_crc = true;
 		}
@@ -1702,9 +1704,6 @@ static int read_partial_message_bio(struct ceph_connection *con,
 	void *p;
 	int ret, left;
 
-	if (IS_ERR(bv))
-		return PTR_ERR(bv);
-
 	left = min((int)(data_len - con->in_msg_pos.data_pos),
 		   (int)(bv->bv_len - con->in_msg_pos.page_pos));
 

From bc18f4b1c850ab355e38373fbb60fd28568d84b5 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 20 Jun 2012 21:53:53 -0500
Subject: [PATCH 049/132] libceph: add some fine ASCII art

Sage liked the state diagram I put in my commit description so
I'm putting it in with the code.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 42 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 4578e99bbef1..dcc50e4cd5cd 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -29,7 +29,47 @@
  * the sender.
  */
 
-/* State values for ceph_connection->sock_state; NEW is assumed to be 0 */
+/*
+ * We track the state of the socket on a given connection using
+ * values defined below.  The transition to a new socket state is
+ * handled by a function which verifies we aren't coming from an
+ * unexpected state.
+ *
+ *      --------
+ *      | NEW* |  transient initial state
+ *      --------
+ *          | con_sock_state_init()
+ *          v
+ *      ----------
+ *      | CLOSED |  initialized, but no socket (and no
+ *      ----------  TCP connection)
+ *       ^      \
+ *       |       \ con_sock_state_connecting()
+ *       |        ----------------------
+ *       |                              \
+ *       + con_sock_state_closed()       \
+ *       |\                               \
+ *       | \                               \
+ *       |  -----------                     \
+ *       |  | CLOSING |  socket event;       \
+ *       |  -----------  await close          \
+ *       |       ^                            |
+ *       |       |                            |
+ *       |       + con_sock_state_closing()   |
+ *       |      / \                           |
+ *       |     /   ---------------            |
+ *       |    /                   \           v
+ *       |   /                    --------------
+ *       |  /    -----------------| CONNECTING |  socket created, TCP
+ *       |  |   /                 --------------  connect initiated
+ *       |  |   | con_sock_state_connected()
+ *       |  |   v
+ *      -------------
+ *      | CONNECTED |  TCP connection established
+ *      -------------
+ *
+ * State values for ceph_connection->sock_state; NEW is assumed to be 0.
+ */
 
 #define CON_SOCK_STATE_NEW		0	/* -> CLOSED */
 #define CON_SOCK_STATE_CLOSED		1	/* -> CONNECTING */

From 261030215d970c62f799e6e508e3c68fc7ec2aa9 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 21 Jun 2012 12:49:23 -0700
Subject: [PATCH 050/132] libceph: drop declaration of ceph_con_get()

For some reason the declaration of ceph_con_get() and
ceph_con_put() did not get deleted in this commit:
    d59315ca libceph: drop ceph_con_get/put helpers and nref member

Clean that up.

Signed-off-by: Alex Elder <elder@inktank.com>
---
 include/linux/ceph/messenger.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 002d504df3b7..dd4ef1f8ec93 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -244,8 +244,6 @@ extern void ceph_msg_revoke(struct ceph_msg *msg);
 extern void ceph_msg_revoke_incoming(struct ceph_msg *msg);
 
 extern void ceph_con_keepalive(struct ceph_connection *con);
-extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
-extern void ceph_con_put(struct ceph_connection *con);
 
 extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
 				     bool can_fail);

From b7a9e5dd40f17a48a72f249b8bbc989b63bae5fd Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Wed, 27 Jun 2012 12:24:08 -0700
Subject: [PATCH 051/132] libceph: set peer name on con_open, not init

The peer name may change on each open attempt, even when the connection is
reused.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/mds_client.c           |  7 ++++---
 include/linux/ceph/messenger.h |  4 ++--
 net/ceph/messenger.c           | 12 +++++++-----
 net/ceph/mon_client.c          |  4 ++--
 net/ceph/osd_client.c          | 10 ++++++----
 5 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index ecd7f15741c1..5ac6434185ae 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -394,8 +394,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	s->s_seq = 0;
 	mutex_init(&s->s_mutex);
 
-	ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr,
-		CEPH_ENTITY_TYPE_MDS, mds);
+	ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
 
 	spin_lock_init(&s->s_gen_ttl_lock);
 	s->s_cap_gen = 0;
@@ -437,7 +436,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	mdsc->sessions[mds] = s;
 	atomic_inc(&s->s_ref);  /* one ref to sessions[], one to caller */
 
-	ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+	ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
+		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
 
 	return s;
 
@@ -2529,6 +2529,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 	session->s_seq = 0;
 
 	ceph_con_open(&session->s_con,
+		      CEPH_ENTITY_TYPE_MDS, mds,
 		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
 
 	/* replay unsafe requests */
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index dd4ef1f8ec93..478f814f2100 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -232,9 +232,9 @@ extern void ceph_messenger_init(struct ceph_messenger *msgr,
 
 extern void ceph_con_init(struct ceph_connection *con, void *private,
 			const struct ceph_connection_operations *ops,
-			struct ceph_messenger *msgr, __u8 entity_type,
-			__u64 entity_num);
+			struct ceph_messenger *msgr);
 extern void ceph_con_open(struct ceph_connection *con,
+			  __u8 entity_type, __u64 entity_num,
 			  struct ceph_entity_addr *addr);
 extern bool ceph_con_opened(struct ceph_connection *con);
 extern void ceph_con_close(struct ceph_connection *con);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index dcc50e4cd5cd..ae082d95fc72 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -523,12 +523,17 @@ EXPORT_SYMBOL(ceph_con_close);
 /*
  * Reopen a closed connection, with a new peer address.
  */
-void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
+void ceph_con_open(struct ceph_connection *con,
+		   __u8 entity_type, __u64 entity_num,
+		   struct ceph_entity_addr *addr)
 {
 	dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
 	set_bit(OPENING, &con->state);
 	WARN_ON(!test_and_clear_bit(CLOSED, &con->state));
 
+	con->peer_name.type = (__u8) entity_type;
+	con->peer_name.num = cpu_to_le64(entity_num);
+
 	memcpy(&con->peer_addr, addr, sizeof(*addr));
 	con->delay = 0;      /* reset backoff memory */
 	queue_con(con);
@@ -548,7 +553,7 @@ bool ceph_con_opened(struct ceph_connection *con)
  */
 void ceph_con_init(struct ceph_connection *con, void *private,
 	const struct ceph_connection_operations *ops,
-	struct ceph_messenger *msgr, __u8 entity_type, __u64 entity_num)
+	struct ceph_messenger *msgr)
 {
 	dout("con_init %p\n", con);
 	memset(con, 0, sizeof(*con));
@@ -558,9 +563,6 @@ void ceph_con_init(struct ceph_connection *con, void *private,
 
 	con_sock_state_init(con);
 
-	con->peer_name.type = (__u8) entity_type;
-	con->peer_name.num = cpu_to_le64(entity_num);
-
 	mutex_init(&con->mutex);
 	INIT_LIST_HEAD(&con->out_queue);
 	INIT_LIST_HEAD(&con->out_sent);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index e9db3de20b2e..bcc80a0e2a98 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -143,11 +143,11 @@ static int __open_session(struct ceph_mon_client *monc)
 		monc->want_next_osdmap = !!monc->want_next_osdmap;
 
 		ceph_con_init(&monc->con, monc, &mon_con_ops,
-			&monc->client->msgr,
-			CEPH_ENTITY_TYPE_MON, monc->cur_mon);
+			&monc->client->msgr);
 
 		dout("open_session mon%d opening\n", monc->cur_mon);
 		ceph_con_open(&monc->con,
+			      CEPH_ENTITY_TYPE_MON, monc->cur_mon,
 			      &monc->monmap->mon_inst[monc->cur_mon].addr);
 
 		/* initiatiate authentication handshake */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index db2da54f7336..c2527113d2ae 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -639,8 +639,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
 	INIT_LIST_HEAD(&osd->o_osd_lru);
 	osd->o_incarnation = 1;
 
-	ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr,
-		CEPH_ENTITY_TYPE_OSD, onum);
+	ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
 
 	INIT_LIST_HEAD(&osd->o_keepalive_item);
 	return osd;
@@ -750,7 +749,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 		ret = -EAGAIN;
 	} else {
 		ceph_con_close(&osd->o_con);
-		ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
+		ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
+			      &osdc->osdmap->osd_addr[osd->o_osd]);
 		osd->o_incarnation++;
 	}
 	return ret;
@@ -1005,7 +1005,9 @@ static int __map_request(struct ceph_osd_client *osdc,
 		dout("map_request osd %p is osd%d\n", req->r_osd, o);
 		__insert_osd(osdc, req->r_osd);
 
-		ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
+		ceph_con_open(&req->r_osd->o_con,
+			      CEPH_ENTITY_TYPE_OSD, o,
+			      &osdc->osdmap->osd_addr[o]);
 	}
 
 	if (req->r_osd) {

From 735a72ef952d42a256f79ae3e6dc1c17a45c041b Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Wed, 27 Jun 2012 12:24:34 -0700
Subject: [PATCH 052/132] libceph: initialize mon_client con only once

Do not re-initialize the con on every connection attempt.  When we
ceph_con_close, there may still be work queued on the socket (e.g., to
close it), and re-initializing will clobber the work_struct state.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/mon_client.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index bcc80a0e2a98..bfd21a891d69 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -119,7 +119,6 @@ static void __close_session(struct ceph_mon_client *monc)
 	dout("__close_session closing mon%d\n", monc->cur_mon);
 	ceph_msg_revoke(monc->m_auth);
 	ceph_con_close(&monc->con);
-	monc->con.private = NULL;
 	monc->cur_mon = -1;
 	monc->pending_auth = 0;
 	ceph_auth_reset(monc->auth);
@@ -142,9 +141,6 @@ static int __open_session(struct ceph_mon_client *monc)
 		monc->sub_renew_after = jiffies;  /* i.e., expired */
 		monc->want_next_osdmap = !!monc->want_next_osdmap;
 
-		ceph_con_init(&monc->con, monc, &mon_con_ops,
-			&monc->client->msgr);
-
 		dout("open_session mon%d opening\n", monc->cur_mon);
 		ceph_con_open(&monc->con,
 			      CEPH_ENTITY_TYPE_MON, monc->cur_mon,
@@ -798,6 +794,9 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	if (!monc->m_auth)
 		goto out_auth_reply;
 
+	ceph_con_init(&monc->con, monc, &mon_con_ops,
+		      &monc->client->msgr);
+
 	monc->cur_mon = -1;
 	monc->hunting = true;
 	monc->sub_renew_after = jiffies;

From fbb85a478f6d4cce6942f1c25c6a68ec5b1e7e7f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Wed, 27 Jun 2012 12:31:02 -0700
Subject: [PATCH 053/132] libceph: allow sock transition from CONNECTING to
 CLOSED

It is possible to close a socket that is in the OPENING state.  For
example, it can happen if ceph_con_close() is called on the con before
the TCP connection is established.  con_work() will come around and shut
down the socket.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index ae082d95fc72..09ada7924874 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -48,17 +48,17 @@
  *       |        ----------------------
  *       |                              \
  *       + con_sock_state_closed()       \
- *       |\                               \
- *       | \                               \
- *       |  -----------                     \
- *       |  | CLOSING |  socket event;       \
- *       |  -----------  await close          \
- *       |       ^                            |
- *       |       |                            |
- *       |       + con_sock_state_closing()   |
- *       |      / \                           |
- *       |     /   ---------------            |
- *       |    /                   \           v
+ *       |+---------------------------    \
+ *       | \                          \    \
+ *       |  -----------                \    \
+ *       |  | CLOSING |  socket event;  \    \
+ *       |  -----------  await close     \    \
+ *       |       ^                        \   |
+ *       |       |                         \  |
+ *       |       + con_sock_state_closing() \ |
+ *       |      / \                         | |
+ *       |     /   ---------------          | |
+ *       |    /                   \         v v
  *       |   /                    --------------
  *       |  /    -----------------| CONNECTING |  socket created, TCP
  *       |  |   /                 --------------  connect initiated
@@ -241,7 +241,8 @@ static void con_sock_state_closed(struct ceph_connection *con)
 
 	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
 	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED &&
-			old_state != CON_SOCK_STATE_CLOSING))
+		    old_state != CON_SOCK_STATE_CLOSING &&
+		    old_state != CON_SOCK_STATE_CONNECTING))
 		printk("%s: unexpected old state %d\n", __func__, old_state);
 }
 

From d50b409fb8698571d8209e5adfe122e287e31290 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 9 Jul 2012 14:22:34 -0700
Subject: [PATCH 054/132] libceph: initialize msgpool message types

Initialize the type field for messages in a msgpool.  The caller was doing
this for osd ops, but not for the reply messages.

Reported-by: Alex Elder <elder@inktank.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/msgpool.h | 3 ++-
 net/ceph/msgpool.c           | 7 ++++---
 net/ceph/osd_client.c        | 7 ++++---
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/include/linux/ceph/msgpool.h b/include/linux/ceph/msgpool.h
index a362605f9368..09fa96b43436 100644
--- a/include/linux/ceph/msgpool.h
+++ b/include/linux/ceph/msgpool.h
@@ -11,10 +11,11 @@
 struct ceph_msgpool {
 	const char *name;
 	mempool_t *pool;
+	int type;               /* preallocated message type */
 	int front_len;          /* preallocated payload size */
 };
 
-extern int ceph_msgpool_init(struct ceph_msgpool *pool,
+extern int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
 			     int front_len, int size, bool blocking,
 			     const char *name);
 extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
index 11d5f4196a73..ddec1c10ac80 100644
--- a/net/ceph/msgpool.c
+++ b/net/ceph/msgpool.c
@@ -12,7 +12,7 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg)
 	struct ceph_msgpool *pool = arg;
 	struct ceph_msg *msg;
 
-	msg = ceph_msg_new(0, pool->front_len, gfp_mask, true);
+	msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true);
 	if (!msg) {
 		dout("msgpool_alloc %s failed\n", pool->name);
 	} else {
@@ -32,10 +32,11 @@ static void msgpool_free(void *element, void *arg)
 	ceph_msg_put(msg);
 }
 
-int ceph_msgpool_init(struct ceph_msgpool *pool,
+int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
 		      int front_len, int size, bool blocking, const char *name)
 {
 	dout("msgpool %s init\n", name);
+	pool->type = type;
 	pool->front_len = front_len;
 	pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool);
 	if (!pool->pool)
@@ -61,7 +62,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
 		WARN_ON(1);
 
 		/* try to alloc a fresh message */
-		return ceph_msg_new(0, front_len, GFP_NOFS, false);
+		return ceph_msg_new(pool->type, front_len, GFP_NOFS, false);
 	}
 
 	msg = mempool_alloc(pool->pool, GFP_NOFS);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index c2527113d2ae..4475d17863ee 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -242,6 +242,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 		}
 		ceph_pagelist_init(req->r_trail);
 	}
+
 	/* create request message; allow space for oid */
 	msg_size += MAX_OBJ_NAME_SIZE;
 	if (snapc)
@@ -255,7 +256,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 		return NULL;
 	}
 
-	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
 	memset(msg->front.iov_base, 0, msg->front.iov_len);
 
 	req->r_request = msg;
@@ -1837,11 +1837,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	if (!osdc->req_mempool)
 		goto out;
 
-	err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
+	err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
+				OSD_OP_FRONT_LEN, 10, true,
 				"osd_op");
 	if (err < 0)
 		goto out_mempool;
-	err = ceph_msgpool_init(&osdc->msgpool_op_reply,
+	err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
 				OSD_OPREPLY_FRONT_LEN, 10, true,
 				"osd_op_reply");
 	if (err < 0)

From cd43045c2de60f40a0aea49bfb252a2eafe58f8c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 9 Jul 2012 14:31:41 -0700
Subject: [PATCH 055/132] libceph: initialize rb, list nodes in
 ceph_osd_request

These don't strictly need to be initialized based on how they are used, but
it is good practice to do so.

Reported-by: Alex Elder <elder@inktank.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 4475d17863ee..07920cac31a6 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -213,10 +213,13 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	kref_init(&req->r_kref);
 	init_completion(&req->r_completion);
 	init_completion(&req->r_safe_completion);
+	rb_init_node(&req->r_node);
 	INIT_LIST_HEAD(&req->r_unsafe_item);
 	INIT_LIST_HEAD(&req->r_linger_item);
 	INIT_LIST_HEAD(&req->r_linger_osd);
 	INIT_LIST_HEAD(&req->r_req_lru_item);
+	INIT_LIST_HEAD(&req->r_osd_item);
+
 	req->r_flags = flags;
 
 	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);

From a16cb1f70799c851410d9dca0a24122e258df06c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Tue, 10 Jul 2012 11:53:34 -0700
Subject: [PATCH 056/132] libceph: fix messenger retry

In ancient times, the messenger could both initiate and accept connections.
An artifact if that was data structures to store/process an incoming
ceph_msg_connect request and send an outgoing ceph_msg_connect_reply.
Sadly, the negotiation code was referencing those structures and ignoring
important information (like the peer's connect_seq) from the correct ones.

Among other things, this fixes tight reconnect loops where the server sends
RETRY_SESSION and we (the client) retries with the same connect_seq as last
time.  This bug pretty easily triggered by injecting socket failures on the
MDS and running some fs workload like workunits/direct_io/test_sync_io.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/messenger.h | 12 ++----------
 net/ceph/messenger.c           | 12 ++++++------
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 478f814f2100..cfb1bbdac624 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -172,16 +172,8 @@ struct ceph_connection {
 
 	/* connection negotiation temps */
 	char in_banner[CEPH_BANNER_MAX_LEN];
-	union {
-		struct {  /* outgoing connection */
-			struct ceph_msg_connect out_connect;
-			struct ceph_msg_connect_reply in_reply;
-		};
-		struct {  /* incoming */
-			struct ceph_msg_connect in_connect;
-			struct ceph_msg_connect_reply out_reply;
-		};
-	};
+	struct ceph_msg_connect out_connect;
+	struct ceph_msg_connect_reply in_reply;
 	struct ceph_entity_addr actual_peer_addr;
 
 	/* message out temps */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 09ada7924874..16814d1f4774 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1540,7 +1540,7 @@ static int process_connect(struct ceph_connection *con)
 		 * dropped messages.
 		 */
 		dout("process_connect got RESET peer seq %u\n",
-		     le32_to_cpu(con->in_connect.connect_seq));
+		     le32_to_cpu(con->in_reply.connect_seq));
 		pr_err("%s%lld %s connection reset\n",
 		       ENTITY_NAME(con->peer_name),
 		       ceph_pr_addr(&con->peer_addr.in_addr));
@@ -1566,10 +1566,10 @@ static int process_connect(struct ceph_connection *con)
 		 * If we sent a smaller connect_seq than the peer has, try
 		 * again with a larger value.
 		 */
-		dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
+		dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
 		     le32_to_cpu(con->out_connect.connect_seq),
-		     le32_to_cpu(con->in_connect.connect_seq));
-		con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
+		     le32_to_cpu(con->in_reply.connect_seq));
+		con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
 			return ret;
@@ -1583,9 +1583,9 @@ static int process_connect(struct ceph_connection *con)
 		 */
 		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
 		     con->peer_global_seq,
-		     le32_to_cpu(con->in_connect.global_seq));
+		     le32_to_cpu(con->in_reply.global_seq));
 		get_global_seq(con->msgr,
-			       le32_to_cpu(con->in_connect.global_seq));
+			       le32_to_cpu(con->in_reply.global_seq));
 		ret = prepare_write_connect(con);
 		if (ret < 0)
 			return ret;

From a2a3258417eb6a1799cf893350771428875a8287 Mon Sep 17 00:00:00 2001
From: Guanjun He <gjhe@suse.com>
Date: Sun, 8 Jul 2012 19:50:33 -0700
Subject: [PATCH 057/132] libceph: prevent the race of incoming work during
 teardown

Add an atomic variable 'stopping' as flag in struct ceph_messenger,
set this flag to 1 in function ceph_destroy_client(), and add the condition code
in function ceph_data_ready() to test the flag value, if true(1), just return.

Signed-off-by: Guanjun He <gjhe@suse.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/messenger.h | 1 +
 net/ceph/ceph_common.c         | 2 ++
 net/ceph/messenger.c           | 5 +++++
 3 files changed, 8 insertions(+)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index cfb1bbdac624..a310d7fe6e29 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -50,6 +50,7 @@ struct ceph_messenger {
 	struct ceph_entity_inst inst;    /* my name+address */
 	struct ceph_entity_addr my_enc_addr;
 
+	atomic_t stopping;
 	bool nocrc;
 
 	/*
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 58b09efb528d..3b45e01fa8d1 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -495,6 +495,8 @@ void ceph_destroy_client(struct ceph_client *client)
 {
 	dout("destroy_client %p\n", client);
 
+	atomic_set(&client->msgr.stopping, 1);
+
 	/* unmount */
 	ceph_osdc_stop(&client->osdc);
 
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 16814d1f4774..63e1252d3af5 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -254,6 +254,9 @@ static void con_sock_state_closed(struct ceph_connection *con)
 static void ceph_sock_data_ready(struct sock *sk, int count_unused)
 {
 	struct ceph_connection *con = sk->sk_user_data;
+	if (atomic_read(&con->msgr->stopping)) {
+		return;
+	}
 
 	if (sk->sk_state != TCP_CLOSE_WAIT) {
 		dout("%s on %p state = %lu, queueing work\n", __func__,
@@ -2413,6 +2416,8 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
 	encode_my_addr(msgr);
 	msgr->nocrc = nocrc;
 
+	atomic_set(&msgr->stopping, 0);
+
 	dout("%s %p\n", __func__, msgr);
 }
 EXPORT_SYMBOL(ceph_messenger_init);

From 8842b3be96c376f174ae0d4f282d14728ad5febf Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 7 Jun 2012 13:43:35 -0700
Subject: [PATCH 058/132] ceph: clean up useless d_parent checks

d_parent is never NULL, and IS_ROOT() is the proper way to check for a
(non-self-referential) parent.

Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/dir.c        |  7 +++----
 fs/ceph/mds_client.c | 11 -----------
 2 files changed, 3 insertions(+), 15 deletions(-)

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 3e8094be4604..6a66bd2d4da0 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -51,8 +51,7 @@ int ceph_init_dentry(struct dentry *dentry)
 		goto out_unlock;
 	}
 
-	if (dentry->d_parent == NULL ||   /* nfs fh_to_dentry */
-	    ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+	if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
 		d_set_d_op(dentry, &ceph_dentry_ops);
 	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
 		d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
@@ -79,7 +78,7 @@ struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
 		return NULL;
 
 	spin_lock(&dentry->d_lock);
-	if (dentry->d_parent) {
+	if (!IS_ROOT(dentry)) {
 		inode = dentry->d_parent->d_inode;
 		ihold(inode);
 	}
@@ -1140,7 +1139,7 @@ static void ceph_d_prune(struct dentry *dentry)
 	dout("ceph_d_prune %p\n", dentry);
 
 	/* do we have a valid parent? */
-	if (!dentry->d_parent || IS_ROOT(dentry))
+	if (IS_ROOT(dentry))
 		return;
 
 	/* if we are not hashed, we don't affect D_COMPLETE */
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 5ac6434185ae..418f6a82c90d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1469,11 +1469,6 @@ retry:
 		else
 			len += 1 + temp->d_name.len;
 		temp = temp->d_parent;
-		if (temp == NULL) {
-			rcu_read_unlock();
-			pr_err("build_path corrupt dentry %p\n", dentry);
-			return ERR_PTR(-EINVAL);
-		}
 	}
 	rcu_read_unlock();
 	if (len)
@@ -1510,12 +1505,6 @@ retry:
 		if (pos)
 			path[--pos] = '/';
 		temp = temp->d_parent;
-		if (temp == NULL) {
-			rcu_read_unlock();
-			pr_err("build_path corrupt dentry\n");
-			kfree(path);
-			return ERR_PTR(-EINVAL);
-		}
 	}
 	rcu_read_unlock();
 	if (pos != 0 || read_seqretry(&rename_lock, seq)) {

From c61a1abd215c1ccd6fa73104c79e79987ed3aa98 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 3 Jul 2012 16:01:18 -0500
Subject: [PATCH 059/132] libceph: fix off-by-one bug in ceph_encode_filepath()

There is a BUG_ON() call that doesn't account for the single byte
structure version at the start of an encoded filepath in
ceph_encode_filepath().  Fix that.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 include/linux/ceph/decode.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index d8615dee5808..bcbd66c84890 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -151,7 +151,7 @@ static inline void ceph_encode_filepath(void **p, void *end,
 					u64 ino, const char *path)
 {
 	u32 len = path ? strlen(path) : 0;
-	BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
+	BUG_ON(*p + 1 + sizeof(ino) + sizeof(len) + len > end);
 	ceph_encode_8(p, 1);
 	ceph_encode_64(p, ino);
 	ceph_encode_32(p, len);

From ad4f232f28e8059fa1de51f3127d8a6a2759ef16 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 3 Jul 2012 16:01:19 -0500
Subject: [PATCH 060/132] rbd: drop a useless local variable

In rbd_req_sync_notify_ack(), a local variable was needlessly being
used to hold a null pointer.  Just pass NULL instead.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8f428a8ab003..2ae3bb0c0a34 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1187,7 +1187,6 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev,
 				   const char *obj)
 {
 	struct ceph_osd_req_op *ops;
-	struct page **pages = NULL;
 	int ret;
 
 	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
@@ -1200,7 +1199,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev,
 
 	ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
 			  obj, 0, 0, NULL,
-			  pages, 0,
+			  NULL, 0,
 			  CEPH_OSD_FLAG_READ,
 			  ops,
 			  1,

From f8c36c58accd5c53a472b5c289910565b3df9f9d Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 11 Jul 2012 08:24:45 -0500
Subject: [PATCH 061/132] libceph: define ceph_extract_encoded_string()

This adds a new utility routine which will return a dynamically-
allocated buffer containing a string that has been decoded from ceph
over-the-wire format.  It also returns the length of the string
if the address of a size variable is supplied to receive it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/decode.h | 47 +++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index bcbd66c84890..4bbf2db45f46 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -1,6 +1,7 @@
 #ifndef __CEPH_DECODE_H
 #define __CEPH_DECODE_H
 
+#include <linux/err.h>
 #include <linux/bug.h>
 #include <linux/time.h>
 #include <asm/unaligned.h>
@@ -84,6 +85,52 @@ static inline int ceph_has_room(void **p, void *end, size_t n)
 		ceph_decode_copy(p, pv, n);			\
 	} while (0)
 
+/*
+ * Allocate a buffer big enough to hold the wire-encoded string, and
+ * decode the string into it.  The resulting string will always be
+ * terminated with '\0'.  If successful, *p will be advanced
+ * past the decoded data.  Also, if lenp is not a null pointer, the
+ * length (not including the terminating '\0') will be recorded in
+ * *lenp.  Note that a zero-length string is a valid return value.
+ *
+ * Returns a pointer to the newly-allocated string buffer, or a
+ * pointer-coded errno if an error occurs.  Neither *p nor *lenp
+ * will have been updated if an error is returned.
+ *
+ * There are two possible failures:
+ *   - converting the string would require accessing memory at or
+ *     beyond the "end" pointer provided (-E
+ *   - memory could not be allocated for the result
+ */
+static inline char *ceph_extract_encoded_string(void **p, void *end,
+						size_t *lenp, gfp_t gfp)
+{
+	u32 len;
+	void *sp = *p;
+	char *buf;
+
+	ceph_decode_32_safe(&sp, end, len, bad);
+	if (!ceph_has_room(&sp, end, len))
+		goto bad;
+
+	buf = kmalloc(len + 1, gfp);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+
+	if (len)
+		memcpy(buf, sp, len);
+	buf[len] = '\0';
+
+	*p = (char *) *p + sizeof (u32) + len;
+	if (lenp)
+		*lenp = (size_t) len;
+
+	return buf;
+
+bad:
+	return ERR_PTR(-ERANGE);
+}
+
 /*
  * struct ceph_timespec <-> struct timespec
  */

From ea3352f4aa4fc32397d9a535780315e0f2bfee15 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 9 Jul 2012 21:04:23 -0500
Subject: [PATCH 062/132] rbd: define dup_token()

Define a new function dup_token(), to be used during argument
parsing for making dynamically-allocated copies of tokens being
parsed.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 2ae3bb0c0a34..384694f40b44 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2280,6 +2280,42 @@ static inline size_t copy_token(const char **buf,
         return len;
 }
 
+/*
+ * Finds the next token in *buf, dynamically allocates a buffer big
+ * enough to hold a copy of it, and copies the token into the new
+ * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
+ * that a duplicate buffer is created even for a zero-length token.
+ *
+ * Returns a pointer to the newly-allocated duplicate, or a null
+ * pointer if memory for the duplicate was not available.  If
+ * the lenp argument is a non-null pointer, the length of the token
+ * (not including the '\0') is returned in *lenp.
+ *
+ * If successful, the *buf pointer will be updated to point beyond
+ * the end of the found token.
+ *
+ * Note: uses GFP_KERNEL for allocation.
+ */
+static inline char *dup_token(const char **buf, size_t *lenp)
+{
+	char *dup;
+	size_t len;
+
+	len = next_token(buf);
+	dup = kmalloc(len + 1, GFP_KERNEL);
+	if (!dup)
+		return NULL;
+
+	memcpy(dup, *buf, len);
+	*(dup + len) = '\0';
+	*buf += len;
+
+	if (lenp)
+		*lenp = len;
+
+	return dup;
+}
+
 /*
  * This fills in the pool_name, obj, obj_len, snap_name, obj_len,
  * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based

From ca1e49a6afe87ea4e2d3e73e10d1d3c0fad2aa3f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 10 Jul 2012 20:30:09 -0500
Subject: [PATCH 063/132] rbd: rename rbd_dev->block_name

Each rbd image has a name that forms the basis of all data objects
backing the device.  Old (format 1) images refer to this name as the
"block name," while new (format 2) images use the term "object
prefix" for this.

Change the field name in the in-core rbd image header structure to
reflect the more modern usage.  We intentionally keep the the name
"block_name" in the on-disk definition for format 1 image headers.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 384694f40b44..523413bdca92 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -78,7 +78,7 @@
  */
 struct rbd_image_header {
 	u64 image_size;
-	char block_name[32];
+	char object_prefix[32];
 	__u8 obj_order;
 	__u8 crypt_type;
 	__u8 comp_type;
@@ -518,7 +518,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 		header->snap_names = NULL;
 		header->snap_sizes = NULL;
 	}
-	memcpy(header->block_name, ondisk->block_name,
+	memcpy(header->object_prefix, ondisk->block_name,
 	       sizeof(ondisk->block_name));
 
 	header->image_size = le64_to_cpu(ondisk->image_size);
@@ -620,7 +620,7 @@ static void rbd_header_free(struct rbd_image_header *header)
  * get the actual striped segment name, offset and length
  */
 static u64 rbd_get_segment(struct rbd_image_header *header,
-			   const char *block_name,
+			   const char *object_prefix,
 			   u64 ofs, u64 len,
 			   char *seg_name, u64 *segofs)
 {
@@ -628,7 +628,7 @@ static u64 rbd_get_segment(struct rbd_image_header *header,
 
 	if (seg_name)
 		snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
-			 "%s.%012llx", block_name, seg);
+			 "%s.%012llx", object_prefix, seg);
 
 	ofs = ofs & ((1 << header->obj_order) - 1);
 	len = min_t(u64, len, (1 << header->obj_order) - ofs);
@@ -1091,7 +1091,7 @@ static int rbd_do_op(struct request *rq,
 		return -ENOMEM;
 
 	seg_len = rbd_get_segment(&rbd_dev->header,
-				  rbd_dev->header.block_name,
+				  rbd_dev->header.object_prefix,
 				  ofs, len,
 				  seg_name, &seg_ofs);
 
@@ -1482,7 +1482,7 @@ static void rbd_rq_fn(struct request_queue *q)
 			/* a bio clone to be passed down to OSD req */
 			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
 			op_size = rbd_get_segment(&rbd_dev->header,
-						  rbd_dev->header.block_name,
+						  rbd_dev->header.object_prefix,
 						  ofs, size,
 						  NULL, NULL);
 			kref_get(&coll->kref);

From 9bb2f334b9b5f2eb6030b7988b7f2302c3115bbb Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 12 Jul 2012 10:46:35 -0500
Subject: [PATCH 064/132] rbd: create pool_id device attribute

Add an entry under /sys/bus/rbd/devices/<N>/ named "pool_id" that
provides the id for the pool the rbd image is assocatied with.  This
is in addition to the pool name already provided.

Rename the "poolid" field in struct rbd_device  to be "pool_id".

Update the documentation to reflect the addition of this new entry.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 Documentation/ABI/testing/sysfs-bus-rbd | 10 ++++++++--
 drivers/block/rbd.c                     | 18 ++++++++++++++----
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-bus-rbd b/Documentation/ABI/testing/sysfs-bus-rbd
index bcd88eb7ebcd..3c17b62899f6 100644
--- a/Documentation/ABI/testing/sysfs-bus-rbd
+++ b/Documentation/ABI/testing/sysfs-bus-rbd
@@ -35,8 +35,14 @@ name
 
 pool
 
-	The pool where this rbd image resides. The pool-name pair is unique
-	per rados system.
+	The name of the storage pool where this rbd image resides.
+	An rbd image name is unique within its pool.
+
+pool_id
+
+	The unique identifier for the rbd image's pool.  This is
+	a permanent attribute of the pool.  A pool's id will never
+	change.
 
 size
 
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 523413bdca92..80320fd1c621 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -167,7 +167,7 @@ struct rbd_device {
 	int			obj_len;
 	char			obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
 	char			pool_name[RBD_MAX_POOL_NAME_LEN];
-	int			poolid;
+	int			pool_id;
 
 	struct ceph_osd_event   *watch_event;
 	struct ceph_osd_request *watch_request;
@@ -920,7 +920,7 @@ static int rbd_do_request(struct request *rq,
 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
 	layout->fl_stripe_count = cpu_to_le32(1);
 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	layout->fl_pg_pool = cpu_to_le32(dev->poolid);
+	layout->fl_pg_pool = cpu_to_le32(dev->pool_id);
 	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
 				req, ops);
 
@@ -1643,7 +1643,7 @@ static int rbd_header_add_snap(struct rbd_device *dev,
 		return -EINVAL;
 
 	monc = &dev->rbd_client->client->monc;
-	ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid);
+	ret = ceph_monc_create_snapid(monc, dev->pool_id, &new_snapid);
 	dout("created snapid=%lld\n", new_snapid);
 	if (ret < 0)
 		return ret;
@@ -1847,6 +1847,14 @@ static ssize_t rbd_pool_show(struct device *dev,
 	return sprintf(buf, "%s\n", rbd_dev->pool_name);
 }
 
+static ssize_t rbd_pool_id_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	return sprintf(buf, "%d\n", rbd_dev->pool_id);
+}
+
 static ssize_t rbd_name_show(struct device *dev,
 			     struct device_attribute *attr, char *buf)
 {
@@ -1887,6 +1895,7 @@ static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
 static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
 static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
 static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
+static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
 static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
 static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
 static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
@@ -1897,6 +1906,7 @@ static struct attribute *rbd_attrs[] = {
 	&dev_attr_major.attr,
 	&dev_attr_client_id.attr,
 	&dev_attr_pool.attr,
+	&dev_attr_pool_id.attr,
 	&dev_attr_name.attr,
 	&dev_attr_current_snap.attr,
 	&dev_attr_refresh.attr,
@@ -2430,7 +2440,7 @@ static ssize_t rbd_add(struct bus_type *bus,
 	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
 	if (rc < 0)
 		goto err_out_client;
-	rbd_dev->poolid = rc;
+	rbd_dev->pool_id = rc;
 
 	/* register our block device */
 	rc = register_blkdev(0, rbd_dev->name);

From d22f76e703040c2cc4ad13dd7747845b9844d360 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 12 Jul 2012 10:46:35 -0500
Subject: [PATCH 065/132] rbd: dynamically allocate pool name

There is no need to impose a small limit the length of the pool name
recorded for an rbd image in a struct rbd_device.  Remove the
limitation by allocating space for the pool name ynamically.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 80320fd1c621..61ce29d268a6 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -56,7 +56,6 @@
 #define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */
 
 #define RBD_MAX_MD_NAME_LEN	(RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX))
-#define RBD_MAX_POOL_NAME_LEN	64
 #define RBD_MAX_SNAP_NAME_LEN	32
 #define RBD_MAX_OPT_LEN		1024
 
@@ -166,7 +165,7 @@ struct rbd_device {
 	char			obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
 	int			obj_len;
 	char			obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
-	char			pool_name[RBD_MAX_POOL_NAME_LEN];
+	char			*pool_name;
 	int			pool_id;
 
 	struct ceph_osd_event   *watch_event;
@@ -2331,6 +2330,8 @@ static inline char *dup_token(const char **buf, size_t *lenp)
  * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
  * on the list of monitor addresses and other options provided via
  * /sys/bus/rbd/add.
+ *
+ * Note: rbd_dev is assumed to have been initially zero-filled.
  */
 static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 			      const char *buf,
@@ -2339,7 +2340,8 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 			      char *options,
 			      size_t options_size)
 {
-	size_t	len;
+	size_t len;
+	int ret;
 
 	/* The first four tokens are required */
 
@@ -2355,13 +2357,14 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 	if (!len || len >= options_size)
 		return -EINVAL;
 
-	len = copy_token(&buf, rbd_dev->pool_name, sizeof (rbd_dev->pool_name));
-	if (!len || len >= sizeof (rbd_dev->pool_name))
-		return -EINVAL;
+	rbd_dev->pool_name = dup_token(&buf, NULL);
+	if (!rbd_dev->pool_name)
+		return -ENOMEM;
 
+	ret = -EINVAL;
 	len = copy_token(&buf, rbd_dev->obj, sizeof (rbd_dev->obj));
 	if (!len || len >= sizeof (rbd_dev->obj))
-		return -EINVAL;
+		goto out_err;
 
 	/* We have the object length in hand, save it. */
 
@@ -2380,9 +2383,15 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
 			sizeof (RBD_SNAP_HEAD_NAME));
 	else if (len >= sizeof (rbd_dev->snap_name))
-		return -EINVAL;
+		goto out_err;
 
 	return 0;
+
+out_err:
+	kfree(rbd_dev->pool_name);
+	rbd_dev->pool_name = NULL;
+
+	return ret;
 }
 
 static ssize_t rbd_add(struct bus_type *bus,
@@ -2480,6 +2489,7 @@ err_out_blkdev:
 err_out_client:
 	rbd_put_client(rbd_dev);
 err_put_id:
+	kfree(rbd_dev->pool_name);
 	rbd_id_put(rbd_dev);
 err_nomem:
 	kfree(options);
@@ -2528,6 +2538,7 @@ static void rbd_dev_release(struct device *dev)
 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
 
 	/* done with the id, and with the rbd_dev */
+	kfree(rbd_dev->pool_name);
 	rbd_id_put(rbd_dev);
 	kfree(rbd_dev);
 

From 849b4260d482f7d4be5565b2044901a25f80e2c6 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 9 Jul 2012 21:04:24 -0500
Subject: [PATCH 066/132] rbd: dynamically allocate object prefix

There is no need to impose a small limit the length of the object
prefix recorded for an rbd image in a struct rbd_image_header.
Remove the limitation by allocating space for the object prefix
dynamically.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 61ce29d268a6..92867f3e945c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -77,7 +77,7 @@
  */
 struct rbd_image_header {
 	u64 image_size;
-	char object_prefix[32];
+	char *object_prefix;
 	__u8 obj_order;
 	__u8 crypt_type;
 	__u8 comp_type;
@@ -517,8 +517,15 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 		header->snap_names = NULL;
 		header->snap_sizes = NULL;
 	}
+
+	header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
+					gfp_flags);
+	if (!header->object_prefix)
+		goto err_sizes;
+
 	memcpy(header->object_prefix, ondisk->block_name,
 	       sizeof(ondisk->block_name));
+	header->object_prefix[sizeof (ondisk->block_name)] = '\0';
 
 	header->image_size = le64_to_cpu(ondisk->image_size);
 	header->obj_order = ondisk->options.order;
@@ -545,6 +552,8 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 
 	return 0;
 
+err_sizes:
+	kfree(header->snap_sizes);
 err_names:
 	kfree(header->snap_names);
 err_snapc:
@@ -610,9 +619,10 @@ done:
 
 static void rbd_header_free(struct rbd_image_header *header)
 {
-	kfree(header->snapc);
-	kfree(header->snap_names);
+	kfree(header->object_prefix);
 	kfree(header->snap_sizes);
+	kfree(header->snap_names);
+	kfree(header->snapc);
 }
 
 /*
@@ -1710,15 +1720,20 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev)
 		   if head moves */
 		follow_seq = 1;
 
-	kfree(rbd_dev->header.snapc);
-	kfree(rbd_dev->header.snap_names);
+	/* rbd_dev->header.object_prefix shouldn't change */
 	kfree(rbd_dev->header.snap_sizes);
+	kfree(rbd_dev->header.snap_names);
+	kfree(rbd_dev->header.snapc);
 
 	rbd_dev->header.total_snaps = h.total_snaps;
 	rbd_dev->header.snapc = h.snapc;
 	rbd_dev->header.snap_names = h.snap_names;
 	rbd_dev->header.snap_names_len = h.snap_names_len;
 	rbd_dev->header.snap_sizes = h.snap_sizes;
+	/* Free the extra copy of the object prefix */
+	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
+	kfree(h.object_prefix);
+
 	if (follow_seq)
 		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
 	else
@@ -2361,10 +2376,11 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 	if (!rbd_dev->pool_name)
 		return -ENOMEM;
 
-	ret = -EINVAL;
 	len = copy_token(&buf, rbd_dev->obj, sizeof (rbd_dev->obj));
-	if (!len || len >= sizeof (rbd_dev->obj))
+	if (!len || len >= sizeof (rbd_dev->obj)) {
+		ret = -EINVAL;
 		goto out_err;
+	}
 
 	/* We have the object length in hand, save it. */
 
@@ -2382,8 +2398,10 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 	if (!len)
 		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
 			sizeof (RBD_SNAP_HEAD_NAME));
-	else if (len >= sizeof (rbd_dev->snap_name))
+	else if (len >= sizeof (rbd_dev->snap_name)) {
+		ret = -EINVAL;
 		goto out_err;
+	}
 
 	return 0;
 

From cb8627c76db699e3a085596aa80503fb0973c041 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 9 Jul 2012 21:04:23 -0500
Subject: [PATCH 067/132] rbd: dynamically allocate image header name

There is no need to impose a small limit the length of the header
name recorded for an rbd image in a struct rbd_dev.  Remove the
limitation by allocating space for the header name dynamically.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 92867f3e945c..9b676b3b9ba0 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -55,7 +55,6 @@
 
 #define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */
 
-#define RBD_MAX_MD_NAME_LEN	(RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX))
 #define RBD_MAX_SNAP_NAME_LEN	32
 #define RBD_MAX_OPT_LEN		1024
 
@@ -164,7 +163,7 @@ struct rbd_device {
 	struct rbd_image_header	header;
 	char			obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
 	int			obj_len;
-	char			obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
+	char			*obj_md_name; /* hdr nm. */
 	char			*pool_name;
 	int			pool_id;
 
@@ -2386,8 +2385,13 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 
 	rbd_dev->obj_len = len;
 
-	BUILD_BUG_ON(RBD_MAX_MD_NAME_LEN
-				< RBD_MAX_OBJ_NAME_LEN + sizeof (RBD_SUFFIX));
+	/* Create the name of the header object */
+
+	rbd_dev->obj_md_name = kmalloc(len + sizeof (RBD_SUFFIX), GFP_KERNEL);
+	if (!rbd_dev->obj_md_name) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
 	sprintf(rbd_dev->obj_md_name, "%s%s", rbd_dev->obj, RBD_SUFFIX);
 
 	/*
@@ -2406,6 +2410,7 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 	return 0;
 
 out_err:
+	kfree(rbd_dev->obj_md_name);
 	kfree(rbd_dev->pool_name);
 	rbd_dev->pool_name = NULL;
 
@@ -2416,22 +2421,22 @@ static ssize_t rbd_add(struct bus_type *bus,
 		       const char *buf,
 		       size_t count)
 {
-	struct rbd_device *rbd_dev;
+	char *options;
+	struct rbd_device *rbd_dev = NULL;
 	const char *mon_addrs = NULL;
 	size_t mon_addrs_size = 0;
-	char *options = NULL;
 	struct ceph_osd_client *osdc;
 	int rc = -ENOMEM;
 
 	if (!try_module_get(THIS_MODULE))
 		return -ENODEV;
 
-	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
-	if (!rbd_dev)
-		goto err_nomem;
 	options = kmalloc(count, GFP_KERNEL);
 	if (!options)
 		goto err_nomem;
+	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
+	if (!rbd_dev)
+		goto err_nomem;
 
 	/* static rbd_device initialization */
 	spin_lock_init(&rbd_dev->lock);
@@ -2507,11 +2512,14 @@ err_out_blkdev:
 err_out_client:
 	rbd_put_client(rbd_dev);
 err_put_id:
-	kfree(rbd_dev->pool_name);
+	if (rbd_dev->pool_name) {
+		kfree(rbd_dev->obj_md_name);
+		kfree(rbd_dev->pool_name);
+	}
 	rbd_id_put(rbd_dev);
 err_nomem:
-	kfree(options);
 	kfree(rbd_dev);
+	kfree(options);
 
 	dout("Error adding device %s\n", buf);
 	module_put(THIS_MODULE);
@@ -2556,6 +2564,7 @@ static void rbd_dev_release(struct device *dev)
 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
 
 	/* done with the id, and with the rbd_dev */
+	kfree(rbd_dev->obj_md_name);
 	kfree(rbd_dev->pool_name);
 	rbd_id_put(rbd_dev);
 	kfree(rbd_dev);

From bf3e5ae1129ef18a702c14fbaac27aeb2fe25e62 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 9 Jul 2012 21:04:23 -0500
Subject: [PATCH 068/132] rbd: dynamically allocate image name

There is no need to impose a small limit the length of the rbd image
name recorded in a struct rbd_dev.  Remove the limitation by
allocating space for the image name dynamically.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c       | 28 +++++++++++++---------------
 drivers/block/rbd_types.h |  1 -
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 9b676b3b9ba0..7d4735c9dba5 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -161,8 +161,8 @@ struct rbd_device {
 	spinlock_t		lock;		/* queue lock */
 
 	struct rbd_image_header	header;
-	char			obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
-	int			obj_len;
+	char			*obj; /* rbd image name */
+	size_t			obj_len;
 	char			*obj_md_name; /* hdr nm. */
 	char			*pool_name;
 	int			pool_id;
@@ -2371,27 +2371,22 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 	if (!len || len >= options_size)
 		return -EINVAL;
 
+	ret = -ENOMEM;
 	rbd_dev->pool_name = dup_token(&buf, NULL);
 	if (!rbd_dev->pool_name)
-		return -ENOMEM;
-
-	len = copy_token(&buf, rbd_dev->obj, sizeof (rbd_dev->obj));
-	if (!len || len >= sizeof (rbd_dev->obj)) {
-		ret = -EINVAL;
 		goto out_err;
-	}
 
-	/* We have the object length in hand, save it. */
-
-	rbd_dev->obj_len = len;
+	rbd_dev->obj = dup_token(&buf, &rbd_dev->obj_len);
+	if (!rbd_dev->obj)
+		goto out_err;
 
 	/* Create the name of the header object */
 
-	rbd_dev->obj_md_name = kmalloc(len + sizeof (RBD_SUFFIX), GFP_KERNEL);
-	if (!rbd_dev->obj_md_name) {
-		ret = -ENOMEM;
+	rbd_dev->obj_md_name = kmalloc(rbd_dev->obj_len
+						+ sizeof (RBD_SUFFIX),
+					GFP_KERNEL);
+	if (!rbd_dev->obj_md_name)
 		goto out_err;
-	}
 	sprintf(rbd_dev->obj_md_name, "%s%s", rbd_dev->obj, RBD_SUFFIX);
 
 	/*
@@ -2411,6 +2406,7 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 
 out_err:
 	kfree(rbd_dev->obj_md_name);
+	kfree(rbd_dev->obj);
 	kfree(rbd_dev->pool_name);
 	rbd_dev->pool_name = NULL;
 
@@ -2514,6 +2510,7 @@ err_out_client:
 err_put_id:
 	if (rbd_dev->pool_name) {
 		kfree(rbd_dev->obj_md_name);
+		kfree(rbd_dev->obj);
 		kfree(rbd_dev->pool_name);
 	}
 	rbd_id_put(rbd_dev);
@@ -2566,6 +2563,7 @@ static void rbd_dev_release(struct device *dev)
 	/* done with the id, and with the rbd_dev */
 	kfree(rbd_dev->obj_md_name);
 	kfree(rbd_dev->pool_name);
+	kfree(rbd_dev->obj);
 	rbd_id_put(rbd_dev);
 	kfree(rbd_dev);
 
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
index 950708688f17..0924e9e41a60 100644
--- a/drivers/block/rbd_types.h
+++ b/drivers/block/rbd_types.h
@@ -31,7 +31,6 @@
 #define RBD_MIN_OBJ_ORDER       16
 #define RBD_MAX_OBJ_ORDER       30
 
-#define RBD_MAX_OBJ_NAME_LEN	96
 #define RBD_MAX_SEG_NAME_LEN	128
 
 #define RBD_COMP_NONE		0

From 820a5f3e94b9f8ea8c0c6125ce34b237ed67b1dc Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 9 Jul 2012 21:04:24 -0500
Subject: [PATCH 069/132] rbd: dynamically allocate snapshot name

There is no need to impose a small limit the length of the snapshot
name recorded for an rbd image in a struct rbd_dev.  Remove the
limitation by allocating space for the snapshot name dynamically.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 7d4735c9dba5..5c0f0445982c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -172,7 +172,7 @@ struct rbd_device {
 
 	/* protects updating the header */
 	struct rw_semaphore     header_rwsem;
-	char                    snap_name[RBD_MAX_SNAP_NAME_LEN];
+	char                    *snap_name;
 	u64                     snap_id;	/* current snapshot id */
 	int read_only;
 
@@ -588,8 +588,6 @@ static int rbd_header_set_snap(struct rbd_device *dev, u64 *size)
 	struct ceph_snap_context *snapc = header->snapc;
 	int ret = -ENOENT;
 
-	BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME));
-
 	down_write(&dev->header_rwsem);
 
 	if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME,
@@ -2390,16 +2388,22 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 	sprintf(rbd_dev->obj_md_name, "%s%s", rbd_dev->obj, RBD_SUFFIX);
 
 	/*
-	 * The snapshot name is optional, but it's an error if it's
-	 * too long.  If no snapshot is supplied, fill in the default.
+	 * The snapshot name is optional.  If none is is supplied,
+	 * we use the default value.
 	 */
-	len = copy_token(&buf, rbd_dev->snap_name, sizeof (rbd_dev->snap_name));
-	if (!len)
+	rbd_dev->snap_name = dup_token(&buf, &len);
+	if (!rbd_dev->snap_name)
+		goto out_err;
+	if (!len) {
+		/* Replace the empty name with the default */
+		kfree(rbd_dev->snap_name);
+		rbd_dev->snap_name
+			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
+		if (!rbd_dev->snap_name)
+			goto out_err;
+
 		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
 			sizeof (RBD_SNAP_HEAD_NAME));
-	else if (len >= sizeof (rbd_dev->snap_name)) {
-		ret = -EINVAL;
-		goto out_err;
 	}
 
 	return 0;
@@ -2509,6 +2513,7 @@ err_out_client:
 	rbd_put_client(rbd_dev);
 err_put_id:
 	if (rbd_dev->pool_name) {
+		kfree(rbd_dev->snap_name);
 		kfree(rbd_dev->obj_md_name);
 		kfree(rbd_dev->obj);
 		kfree(rbd_dev->pool_name);
@@ -2561,6 +2566,7 @@ static void rbd_dev_release(struct device *dev)
 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
 
 	/* done with the id, and with the rbd_dev */
+	kfree(rbd_dev->snap_name);
 	kfree(rbd_dev->obj_md_name);
 	kfree(rbd_dev->pool_name);
 	kfree(rbd_dev->obj);

From 0ce1a7941341cc63c8352b7df50020e5485bc43a Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 3 Jul 2012 16:01:18 -0500
Subject: [PATCH 070/132] rbd: use rbd_dev consistently

Most variables that represent a struct rbd_device are named
"rbd_dev", but in some cases "dev" is used instead.  Change all the
"dev" references so they use "rbd_dev" consistently, to make it
clear from the name that we're working with an RBD device (as
opposed to, for example, a struct device).  Similarly, change the
name of the "dev" field in struct rbd_notify_info to be "rbd_dev".

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 125 +++++++++++++++++++++++---------------------
 1 file changed, 64 insertions(+), 61 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 5c0f0445982c..03439565c738 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -582,35 +582,36 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
 	return -ENOENT;
 }
 
-static int rbd_header_set_snap(struct rbd_device *dev, u64 *size)
+static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
 {
-	struct rbd_image_header *header = &dev->header;
+	struct rbd_image_header *header = &rbd_dev->header;
 	struct ceph_snap_context *snapc = header->snapc;
 	int ret = -ENOENT;
 
-	down_write(&dev->header_rwsem);
+	down_write(&rbd_dev->header_rwsem);
 
-	if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME,
+	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
 		    sizeof (RBD_SNAP_HEAD_NAME))) {
 		if (header->total_snaps)
 			snapc->seq = header->snap_seq;
 		else
 			snapc->seq = 0;
-		dev->snap_id = CEPH_NOSNAP;
-		dev->read_only = 0;
+		rbd_dev->snap_id = CEPH_NOSNAP;
+		rbd_dev->read_only = 0;
 		if (size)
 			*size = header->image_size;
 	} else {
-		ret = snap_by_name(header, dev->snap_name, &snapc->seq, size);
+		ret = snap_by_name(header, rbd_dev->snap_name,
+					&snapc->seq, size);
 		if (ret < 0)
 			goto done;
-		dev->snap_id = snapc->seq;
-		dev->read_only = 1;
+		rbd_dev->snap_id = snapc->seq;
+		rbd_dev->read_only = 1;
 	}
 
 	ret = 0;
 done:
-	up_write(&dev->header_rwsem);
+	up_write(&rbd_dev->header_rwsem);
 	return ret;
 }
 
@@ -854,7 +855,7 @@ static void rbd_coll_end_req(struct rbd_request *req,
  * Send ceph osd request
  */
 static int rbd_do_request(struct request *rq,
-			  struct rbd_device *dev,
+			  struct rbd_device *rbd_dev,
 			  struct ceph_snap_context *snapc,
 			  u64 snapid,
 			  const char *obj, u64 ofs, u64 len,
@@ -895,13 +896,13 @@ static int rbd_do_request(struct request *rq,
 
 	dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
 
-	down_read(&dev->header_rwsem);
+	down_read(&rbd_dev->header_rwsem);
 
-	osdc = &dev->rbd_client->client->osdc;
+	osdc = &rbd_dev->rbd_client->client->osdc;
 	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
 					false, GFP_NOIO, pages, bio);
 	if (!req) {
-		up_read(&dev->header_rwsem);
+		up_read(&rbd_dev->header_rwsem);
 		ret = -ENOMEM;
 		goto done_pages;
 	}
@@ -926,7 +927,7 @@ static int rbd_do_request(struct request *rq,
 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
 	layout->fl_stripe_count = cpu_to_le32(1);
 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	layout->fl_pg_pool = cpu_to_le32(dev->pool_id);
+	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
 	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
 				req, ops);
 
@@ -935,7 +936,7 @@ static int rbd_do_request(struct request *rq,
 				snapc,
 				&mtime,
 				req->r_oid, req->r_oid_len);
-	up_read(&dev->header_rwsem);
+	up_read(&rbd_dev->header_rwsem);
 
 	if (linger_req) {
 		ceph_osdc_set_request_linger(osdc, req);
@@ -1012,7 +1013,7 @@ static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg
 /*
  * Do a synchronous ceph osd operation
  */
-static int rbd_req_sync_op(struct rbd_device *dev,
+static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			   struct ceph_snap_context *snapc,
 			   u64 snapid,
 			   int opcode,
@@ -1049,7 +1050,7 @@ static int rbd_req_sync_op(struct rbd_device *dev,
 		}
 	}
 
-	ret = rbd_do_request(NULL, dev, snapc, snapid,
+	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
 			  obj, ofs, len, NULL,
 			  pages, num_pages,
 			  flags,
@@ -1076,7 +1077,7 @@ done:
  * Do an asynchronous ceph osd operation
  */
 static int rbd_do_op(struct request *rq,
-		     struct rbd_device *rbd_dev ,
+		     struct rbd_device *rbd_dev,
 		     struct ceph_snap_context *snapc,
 		     u64 snapid,
 		     int opcode, int flags, int num_reply,
@@ -1168,7 +1169,7 @@ static int rbd_req_read(struct request *rq,
 /*
  * Request sync osd read
  */
-static int rbd_req_sync_read(struct rbd_device *dev,
+static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 			  struct ceph_snap_context *snapc,
 			  u64 snapid,
 			  const char *obj,
@@ -1176,7 +1177,7 @@ static int rbd_req_sync_read(struct rbd_device *dev,
 			  char *buf,
 			  u64 *ver)
 {
-	return rbd_req_sync_op(dev, NULL,
+	return rbd_req_sync_op(rbd_dev, NULL,
 			       snapid,
 			       CEPH_OSD_OP_READ,
 			       CEPH_OSD_FLAG_READ,
@@ -1187,7 +1188,7 @@ static int rbd_req_sync_read(struct rbd_device *dev,
 /*
  * Request sync osd watch
  */
-static int rbd_req_sync_notify_ack(struct rbd_device *dev,
+static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 				   u64 ver,
 				   u64 notify_id,
 				   const char *obj)
@@ -1199,11 +1200,11 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev,
 	if (ret < 0)
 		return ret;
 
-	ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
+	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
 	ops[0].watch.cookie = notify_id;
 	ops[0].watch.flag = 0;
 
-	ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
+	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
 			  obj, 0, 0, NULL,
 			  NULL, 0,
 			  CEPH_OSD_FLAG_READ,
@@ -1218,54 +1219,54 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev,
 
 static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 {
-	struct rbd_device *dev = (struct rbd_device *)data;
+	struct rbd_device *rbd_dev = (struct rbd_device *)data;
 	int rc;
 
-	if (!dev)
+	if (!rbd_dev)
 		return;
 
-	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
+	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", rbd_dev->obj_md_name,
 		notify_id, (int)opcode);
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-	rc = __rbd_refresh_header(dev);
+	rc = __rbd_refresh_header(rbd_dev);
 	mutex_unlock(&ctl_mutex);
 	if (rc)
 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
-			   " update snaps: %d\n", dev->major, rc);
+			   " update snaps: %d\n", rbd_dev->major, rc);
 
-	rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
+	rbd_req_sync_notify_ack(rbd_dev, ver, notify_id, rbd_dev->obj_md_name);
 }
 
 /*
  * Request sync osd watch
  */
-static int rbd_req_sync_watch(struct rbd_device *dev,
+static int rbd_req_sync_watch(struct rbd_device *rbd_dev,
 			      const char *obj,
 			      u64 ver)
 {
 	struct ceph_osd_req_op *ops;
-	struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 
 	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
 	if (ret < 0)
 		return ret;
 
 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
-				     (void *)dev, &dev->watch_event);
+				     (void *)rbd_dev, &rbd_dev->watch_event);
 	if (ret < 0)
 		goto fail;
 
 	ops[0].watch.ver = cpu_to_le64(ver);
-	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
+	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
 	ops[0].watch.flag = 1;
 
-	ret = rbd_req_sync_op(dev, NULL,
+	ret = rbd_req_sync_op(rbd_dev, NULL,
 			      CEPH_NOSNAP,
 			      0,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      ops,
 			      1, obj, 0, 0, NULL,
-			      &dev->watch_request, NULL);
+			      &rbd_dev->watch_request, NULL);
 
 	if (ret < 0)
 		goto fail_event;
@@ -1274,8 +1275,8 @@ static int rbd_req_sync_watch(struct rbd_device *dev,
 	return 0;
 
 fail_event:
-	ceph_osdc_cancel_event(dev->watch_event);
-	dev->watch_event = NULL;
+	ceph_osdc_cancel_event(rbd_dev->watch_event);
+	rbd_dev->watch_event = NULL;
 fail:
 	rbd_destroy_ops(ops);
 	return ret;
@@ -1284,7 +1285,7 @@ fail:
 /*
  * Request sync osd unwatch
  */
-static int rbd_req_sync_unwatch(struct rbd_device *dev,
+static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev,
 				const char *obj)
 {
 	struct ceph_osd_req_op *ops;
@@ -1294,10 +1295,10 @@ static int rbd_req_sync_unwatch(struct rbd_device *dev,
 		return ret;
 
 	ops[0].watch.ver = 0;
-	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
+	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
 	ops[0].watch.flag = 0;
 
-	ret = rbd_req_sync_op(dev, NULL,
+	ret = rbd_req_sync_op(rbd_dev, NULL,
 			      CEPH_NOSNAP,
 			      0,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
@@ -1305,33 +1306,34 @@ static int rbd_req_sync_unwatch(struct rbd_device *dev,
 			      1, obj, 0, 0, NULL, NULL, NULL);
 
 	rbd_destroy_ops(ops);
-	ceph_osdc_cancel_event(dev->watch_event);
-	dev->watch_event = NULL;
+	ceph_osdc_cancel_event(rbd_dev->watch_event);
+	rbd_dev->watch_event = NULL;
 	return ret;
 }
 
 struct rbd_notify_info {
-	struct rbd_device *dev;
+	struct rbd_device *rbd_dev;
 };
 
 static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 {
-	struct rbd_device *dev = (struct rbd_device *)data;
-	if (!dev)
+	struct rbd_device *rbd_dev = (struct rbd_device *)data;
+	if (!rbd_dev)
 		return;
 
-	dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
+	dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n",
+				rbd_dev->obj_md_name,
 		notify_id, (int)opcode);
 }
 
 /*
  * Request sync osd notify
  */
-static int rbd_req_sync_notify(struct rbd_device *dev,
+static int rbd_req_sync_notify(struct rbd_device *rbd_dev,
 		          const char *obj)
 {
 	struct ceph_osd_req_op *ops;
-	struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 	struct ceph_osd_event *event;
 	struct rbd_notify_info info;
 	int payload_len = sizeof(u32) + sizeof(u32);
@@ -1341,7 +1343,7 @@ static int rbd_req_sync_notify(struct rbd_device *dev,
 	if (ret < 0)
 		return ret;
 
-	info.dev = dev;
+	info.rbd_dev = rbd_dev;
 
 	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
 				     (void *)&info, &event);
@@ -1354,7 +1356,7 @@ static int rbd_req_sync_notify(struct rbd_device *dev,
 	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
 	ops[0].watch.timeout = 12;
 
-	ret = rbd_req_sync_op(dev, NULL,
+	ret = rbd_req_sync_op(rbd_dev, NULL,
 			       CEPH_NOSNAP,
 			       0,
 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
@@ -1378,7 +1380,7 @@ fail:
 /*
  * Request sync osd read
  */
-static int rbd_req_sync_exec(struct rbd_device *dev,
+static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 			     const char *obj,
 			     const char *cls,
 			     const char *method,
@@ -1402,7 +1404,7 @@ static int rbd_req_sync_exec(struct rbd_device *dev,
 	ops[0].cls.indata = data;
 	ops[0].cls.indata_len = len;
 
-	ret = rbd_req_sync_op(dev, NULL,
+	ret = rbd_req_sync_op(rbd_dev, NULL,
 			       CEPH_NOSNAP,
 			       0,
 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
@@ -1633,7 +1635,7 @@ out_dh:
 /*
  * create a snapshot
  */
-static int rbd_header_add_snap(struct rbd_device *dev,
+static int rbd_header_add_snap(struct rbd_device *rbd_dev,
 			       const char *snap_name,
 			       gfp_t gfp_flags)
 {
@@ -1645,11 +1647,11 @@ static int rbd_header_add_snap(struct rbd_device *dev,
 	struct ceph_mon_client *monc;
 
 	/* we should create a snapshot only if we're pointing at the head */
-	if (dev->snap_id != CEPH_NOSNAP)
+	if (rbd_dev->snap_id != CEPH_NOSNAP)
 		return -EINVAL;
 
-	monc = &dev->rbd_client->client->monc;
-	ret = ceph_monc_create_snapid(monc, dev->pool_id, &new_snapid);
+	monc = &rbd_dev->rbd_client->client->monc;
+	ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
 	dout("created snapid=%lld\n", new_snapid);
 	if (ret < 0)
 		return ret;
@@ -1664,7 +1666,8 @@ static int rbd_header_add_snap(struct rbd_device *dev,
 	ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
 	ceph_encode_64_safe(&p, e, new_snapid, bad);
 
-	ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
+	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->obj_md_name,
+				"rbd", "snap_add",
 				data, p - data, &ver);
 
 	kfree(data);
@@ -1672,9 +1675,9 @@ static int rbd_header_add_snap(struct rbd_device *dev,
 	if (ret < 0)
 		return ret;
 
-	down_write(&dev->header_rwsem);
-	dev->header.snapc->seq = new_snapid;
-	up_write(&dev->header_rwsem);
+	down_write(&rbd_dev->header_rwsem);
+	rbd_dev->header.snapc->seq = new_snapid;
+	up_write(&rbd_dev->header_rwsem);
 
 	return 0;
 bad:

From 0bed54dc9af4ab75547739a27b64f0fe0aa98756 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 3 Jul 2012 16:01:18 -0500
Subject: [PATCH 071/132] rbd: rename some fields in struct rbd_dev

An rbd image is not a single object, but a logical construct made up
of an aggregation of objects.

Rename some fields in struct rbd_dev, in hopes of reinforcing this.
    obj         --> image_name
    obj_len     --> image_name_len
    obj_md_name --> header_name

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 55 +++++++++++++++++++++++----------------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 03439565c738..7a87a8c3fa34 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -161,9 +161,9 @@ struct rbd_device {
 	spinlock_t		lock;		/* queue lock */
 
 	struct rbd_image_header	header;
-	char			*obj; /* rbd image name */
-	size_t			obj_len;
-	char			*obj_md_name; /* hdr nm. */
+	char			*image_name;
+	size_t			image_name_len;
+	char			*header_name;
 	char			*pool_name;
 	int			pool_id;
 
@@ -1225,8 +1225,8 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 	if (!rbd_dev)
 		return;
 
-	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", rbd_dev->obj_md_name,
-		notify_id, (int)opcode);
+	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n",
+		rbd_dev->header_name, notify_id, (int) opcode);
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 	rc = __rbd_refresh_header(rbd_dev);
 	mutex_unlock(&ctl_mutex);
@@ -1234,7 +1234,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
 			   " update snaps: %d\n", rbd_dev->major, rc);
 
-	rbd_req_sync_notify_ack(rbd_dev, ver, notify_id, rbd_dev->obj_md_name);
+	rbd_req_sync_notify_ack(rbd_dev, ver, notify_id, rbd_dev->header_name);
 }
 
 /*
@@ -1322,7 +1322,7 @@ static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 		return;
 
 	dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n",
-				rbd_dev->obj_md_name,
+				rbd_dev->header_name,
 		notify_id, (int)opcode);
 }
 
@@ -1600,7 +1600,7 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
 
 		rc = rbd_req_sync_read(rbd_dev,
 				       NULL, CEPH_NOSNAP,
-				       rbd_dev->obj_md_name,
+				       rbd_dev->header_name,
 				       0, len,
 				       (char *)dh, &ver);
 		if (rc < 0)
@@ -1610,7 +1610,8 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
 		if (rc < 0) {
 			if (rc == -ENXIO)
 				pr_warning("unrecognized header format"
-					   " for image %s", rbd_dev->obj);
+					   " for image %s\n",
+					   rbd_dev->image_name);
 			goto out_dh;
 		}
 
@@ -1666,7 +1667,7 @@ static int rbd_header_add_snap(struct rbd_device *rbd_dev,
 	ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
 	ceph_encode_64_safe(&p, e, new_snapid, bad);
 
-	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->obj_md_name,
+	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
 				"rbd", "snap_add",
 				data, p - data, &ver);
 
@@ -1874,7 +1875,7 @@ static ssize_t rbd_name_show(struct device *dev,
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	return sprintf(buf, "%s\n", rbd_dev->obj);
+	return sprintf(buf, "%s\n", rbd_dev->image_name);
 }
 
 static ssize_t rbd_snap_show(struct device *dev,
@@ -2178,7 +2179,7 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
 	int ret, rc;
 
 	do {
-		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
+		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->header_name,
 					 rbd_dev->header.obj_version);
 		if (ret == -ERANGE) {
 			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
@@ -2341,7 +2342,7 @@ static inline char *dup_token(const char **buf, size_t *lenp)
 }
 
 /*
- * This fills in the pool_name, obj, obj_len, snap_name, obj_len,
+ * This fills in the pool_name, image_name, image_name_len, snap_name,
  * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
  * on the list of monitor addresses and other options provided via
  * /sys/bus/rbd/add.
@@ -2353,7 +2354,7 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 			      const char **mon_addrs,
 			      size_t *mon_addrs_size,
 			      char *options,
-			      size_t options_size)
+			     size_t options_size)
 {
 	size_t len;
 	int ret;
@@ -2377,18 +2378,18 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 	if (!rbd_dev->pool_name)
 		goto out_err;
 
-	rbd_dev->obj = dup_token(&buf, &rbd_dev->obj_len);
-	if (!rbd_dev->obj)
+	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
+	if (!rbd_dev->image_name)
 		goto out_err;
 
 	/* Create the name of the header object */
 
-	rbd_dev->obj_md_name = kmalloc(rbd_dev->obj_len
+	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
 						+ sizeof (RBD_SUFFIX),
 					GFP_KERNEL);
-	if (!rbd_dev->obj_md_name)
+	if (!rbd_dev->header_name)
 		goto out_err;
-	sprintf(rbd_dev->obj_md_name, "%s%s", rbd_dev->obj, RBD_SUFFIX);
+	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
 
 	/*
 	 * The snapshot name is optional.  If none is is supplied,
@@ -2412,8 +2413,8 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 	return 0;
 
 out_err:
-	kfree(rbd_dev->obj_md_name);
-	kfree(rbd_dev->obj);
+	kfree(rbd_dev->header_name);
+	kfree(rbd_dev->image_name);
 	kfree(rbd_dev->pool_name);
 	rbd_dev->pool_name = NULL;
 
@@ -2517,8 +2518,8 @@ err_out_client:
 err_put_id:
 	if (rbd_dev->pool_name) {
 		kfree(rbd_dev->snap_name);
-		kfree(rbd_dev->obj_md_name);
-		kfree(rbd_dev->obj);
+		kfree(rbd_dev->header_name);
+		kfree(rbd_dev->image_name);
 		kfree(rbd_dev->pool_name);
 	}
 	rbd_id_put(rbd_dev);
@@ -2560,7 +2561,7 @@ static void rbd_dev_release(struct device *dev)
 						    rbd_dev->watch_request);
 	}
 	if (rbd_dev->watch_event)
-		rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);
+		rbd_req_sync_unwatch(rbd_dev, rbd_dev->header_name);
 
 	rbd_put_client(rbd_dev);
 
@@ -2570,9 +2571,9 @@ static void rbd_dev_release(struct device *dev)
 
 	/* done with the id, and with the rbd_dev */
 	kfree(rbd_dev->snap_name);
-	kfree(rbd_dev->obj_md_name);
+	kfree(rbd_dev->header_name);
 	kfree(rbd_dev->pool_name);
-	kfree(rbd_dev->obj);
+	kfree(rbd_dev->image_name);
 	rbd_id_put(rbd_dev);
 	kfree(rbd_dev);
 
@@ -2643,7 +2644,7 @@ static ssize_t rbd_snap_add(struct device *dev,
 	mutex_unlock(&ctl_mutex);
 
 	/* make a best effort, don't error if failed */
-	rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
+	rbd_req_sync_notify(rbd_dev, rbd_dev->header_name);
 
 	ret = count;
 	kfree(name);

From aded07ea9f7de26e352f679910ac057212ea35e3 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 3 Jul 2012 16:01:18 -0500
Subject: [PATCH 072/132] rbd: more symbol renames

Rename variables named "obj" which represent object names so they're
consistently named "object_name".

Rename the "cls" and "method" parameters in rbd_req_sync_exec()
to be "class_name" and "method_name", and make similar changes
to the names of local variables in that function representing
the lengths of those names.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 53 +++++++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 26 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 7a87a8c3fa34..2fe160014f58 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -858,7 +858,7 @@ static int rbd_do_request(struct request *rq,
 			  struct rbd_device *rbd_dev,
 			  struct ceph_snap_context *snapc,
 			  u64 snapid,
-			  const char *obj, u64 ofs, u64 len,
+			  const char *object_name, u64 ofs, u64 len,
 			  struct bio *bio,
 			  struct page **pages,
 			  int num_pages,
@@ -894,7 +894,8 @@ static int rbd_do_request(struct request *rq,
 		req_data->coll_index = coll_index;
 	}
 
-	dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
+	dout("rbd_do_request object_name=%s ofs=%lld len=%lld\n",
+		object_name, len, ofs);
 
 	down_read(&rbd_dev->header_rwsem);
 
@@ -919,7 +920,7 @@ static int rbd_do_request(struct request *rq,
 	reqhead = req->r_request->front.iov_base;
 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
 
-	strncpy(req->r_oid, obj, sizeof(req->r_oid));
+	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
 	req->r_oid_len = strlen(req->r_oid);
 
 	layout = &req->r_file_layout;
@@ -1020,7 +1021,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			   int flags,
 			   struct ceph_osd_req_op *orig_ops,
 			   int num_reply,
-			   const char *obj,
+			   const char *object_name,
 			   u64 ofs, u64 len,
 			   char *buf,
 			   struct ceph_osd_request **linger_req,
@@ -1051,7 +1052,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 	}
 
 	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
-			  obj, ofs, len, NULL,
+			  object_name, ofs, len, NULL,
 			  pages, num_pages,
 			  flags,
 			  ops,
@@ -1172,7 +1173,7 @@ static int rbd_req_read(struct request *rq,
 static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 			  struct ceph_snap_context *snapc,
 			  u64 snapid,
-			  const char *obj,
+			  const char *object_name,
 			  u64 ofs, u64 len,
 			  char *buf,
 			  u64 *ver)
@@ -1182,7 +1183,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 			       CEPH_OSD_OP_READ,
 			       CEPH_OSD_FLAG_READ,
 			       NULL,
-			       1, obj, ofs, len, buf, NULL, ver);
+			       1, object_name, ofs, len, buf, NULL, ver);
 }
 
 /*
@@ -1191,7 +1192,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 				   u64 ver,
 				   u64 notify_id,
-				   const char *obj)
+				   const char *object_name)
 {
 	struct ceph_osd_req_op *ops;
 	int ret;
@@ -1205,7 +1206,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 	ops[0].watch.flag = 0;
 
 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
-			  obj, 0, 0, NULL,
+			  object_name, 0, 0, NULL,
 			  NULL, 0,
 			  CEPH_OSD_FLAG_READ,
 			  ops,
@@ -1241,7 +1242,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
  * Request sync osd watch
  */
 static int rbd_req_sync_watch(struct rbd_device *rbd_dev,
-			      const char *obj,
+			      const char *object_name,
 			      u64 ver)
 {
 	struct ceph_osd_req_op *ops;
@@ -1265,7 +1266,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev,
 			      0,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      ops,
-			      1, obj, 0, 0, NULL,
+			      1, object_name, 0, 0, NULL,
 			      &rbd_dev->watch_request, NULL);
 
 	if (ret < 0)
@@ -1286,7 +1287,7 @@ fail:
  * Request sync osd unwatch
  */
 static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev,
-				const char *obj)
+				const char *object_name)
 {
 	struct ceph_osd_req_op *ops;
 
@@ -1303,7 +1304,7 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev,
 			      0,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      ops,
-			      1, obj, 0, 0, NULL, NULL, NULL);
+			      1, object_name, 0, 0, NULL, NULL, NULL);
 
 	rbd_destroy_ops(ops);
 	ceph_osdc_cancel_event(rbd_dev->watch_event);
@@ -1330,7 +1331,7 @@ static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
  * Request sync osd notify
  */
 static int rbd_req_sync_notify(struct rbd_device *rbd_dev,
-		          const char *obj)
+		          const char *object_name)
 {
 	struct ceph_osd_req_op *ops;
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
@@ -1361,7 +1362,7 @@ static int rbd_req_sync_notify(struct rbd_device *rbd_dev,
 			       0,
 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			       ops,
-			       1, obj, 0, 0, NULL, NULL, NULL);
+			       1, object_name, 0, 0, NULL, NULL, NULL);
 	if (ret < 0)
 		goto fail_event;
 
@@ -1381,25 +1382,25 @@ fail:
  * Request sync osd read
  */
 static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
-			     const char *obj,
-			     const char *cls,
-			     const char *method,
+			     const char *object_name,
+			     const char *class_name,
+			     const char *method_name,
 			     const char *data,
 			     int len,
 			     u64 *ver)
 {
 	struct ceph_osd_req_op *ops;
-	int cls_len = strlen(cls);
-	int method_len = strlen(method);
+	int class_name_len = strlen(class_name);
+	int method_name_len = strlen(method_name);
 	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
-				    cls_len + method_len + len);
+				    class_name_len + method_name_len + len);
 	if (ret < 0)
 		return ret;
 
-	ops[0].cls.class_name = cls;
-	ops[0].cls.class_len = (__u8)cls_len;
-	ops[0].cls.method_name = method;
-	ops[0].cls.method_len = (__u8)method_len;
+	ops[0].cls.class_name = class_name;
+	ops[0].cls.class_len = (__u8) class_name_len;
+	ops[0].cls.method_name = method_name;
+	ops[0].cls.method_len = (__u8) method_name_len;
 	ops[0].cls.argc = 0;
 	ops[0].cls.indata = data;
 	ops[0].cls.indata_len = len;
@@ -1409,7 +1410,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 			       0,
 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			       ops,
-			       1, obj, 0, 0, NULL, NULL, ver);
+			       1, object_name, 0, 0, NULL, NULL, ver);
 
 	rbd_destroy_ops(ops);
 

From 43ae47011232c1e792d77e78db4a7d0ab05032be Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 3 Jul 2012 16:01:18 -0500
Subject: [PATCH 073/132] rbd: option symbol renames

Use the name "ceph_opts" consistently (rather than just "opt") for
pointers to a ceph_options structure.

Change the few spots that don't use "rbd_opts" for a rbd_options
pointer to match the rest.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 2fe160014f58..cf6f49101794 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -271,9 +271,9 @@ static const struct block_device_operations rbd_bd_ops = {
 
 /*
  * Initialize an rbd client instance.
- * We own *opt.
+ * We own *ceph_opts.
  */
-static struct rbd_client *rbd_client_create(struct ceph_options *opt,
+static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
 					    struct rbd_options *rbd_opts)
 {
 	struct rbd_client *rbdc;
@@ -289,10 +289,10 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt,
 
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 
-	rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
+	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
 	if (IS_ERR(rbdc->client))
 		goto out_mutex;
-	opt = NULL; /* Now rbdc->client is responsible for opt */
+	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
 
 	ret = ceph_open_session(rbdc->client);
 	if (ret < 0)
@@ -315,23 +315,23 @@ out_mutex:
 	mutex_unlock(&ctl_mutex);
 	kfree(rbdc);
 out_opt:
-	if (opt)
-		ceph_destroy_options(opt);
+	if (ceph_opts)
+		ceph_destroy_options(ceph_opts);
 	return ERR_PTR(ret);
 }
 
 /*
  * Find a ceph client with specific addr and configuration.
  */
-static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
+static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts)
 {
 	struct rbd_client *client_node;
 
-	if (opt->flags & CEPH_OPT_NOSHARE)
+	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
 		return NULL;
 
 	list_for_each_entry(client_node, &rbd_client_list, node)
-		if (ceph_compare_options(opt, client_node->client) == 0)
+		if (!ceph_compare_options(ceph_opts, client_node->client))
 			return client_node;
 	return NULL;
 }
@@ -347,7 +347,7 @@ enum {
 	/* string args above */
 };
 
-static match_table_t rbdopt_tokens = {
+static match_table_t rbd_opts_tokens = {
 	{Opt_notify_timeout, "notify_timeout=%d"},
 	/* int args above */
 	/* string args above */
@@ -356,11 +356,11 @@ static match_table_t rbdopt_tokens = {
 
 static int parse_rbd_opts_token(char *c, void *private)
 {
-	struct rbd_options *rbdopt = private;
+	struct rbd_options *rbd_opts = private;
 	substring_t argstr[MAX_OPT_ARGS];
 	int token, intval, ret;
 
-	token = match_token(c, rbdopt_tokens, argstr);
+	token = match_token(c, rbd_opts_tokens, argstr);
 	if (token < 0)
 		return -EINVAL;
 
@@ -381,7 +381,7 @@ static int parse_rbd_opts_token(char *c, void *private)
 
 	switch (token) {
 	case Opt_notify_timeout:
-		rbdopt->notify_timeout = intval;
+		rbd_opts->notify_timeout = intval;
 		break;
 	default:
 		BUG_ON(token);
@@ -398,7 +398,7 @@ static struct rbd_client *rbd_get_client(const char *mon_addr,
 					 char *options)
 {
 	struct rbd_client *rbdc;
-	struct ceph_options *opt;
+	struct ceph_options *ceph_opts;
 	struct rbd_options *rbd_opts;
 
 	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
@@ -407,29 +407,29 @@ static struct rbd_client *rbd_get_client(const char *mon_addr,
 
 	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
 
-	opt = ceph_parse_options(options, mon_addr,
-				mon_addr + mon_addr_len,
-				parse_rbd_opts_token, rbd_opts);
-	if (IS_ERR(opt)) {
+	ceph_opts = ceph_parse_options(options, mon_addr,
+					mon_addr + mon_addr_len,
+					parse_rbd_opts_token, rbd_opts);
+	if (IS_ERR(ceph_opts)) {
 		kfree(rbd_opts);
-		return ERR_CAST(opt);
+		return ERR_CAST(ceph_opts);
 	}
 
 	spin_lock(&rbd_client_list_lock);
-	rbdc = __rbd_client_find(opt);
+	rbdc = __rbd_client_find(ceph_opts);
 	if (rbdc) {
 		/* using an existing client */
 		kref_get(&rbdc->kref);
 		spin_unlock(&rbd_client_list_lock);
 
-		ceph_destroy_options(opt);
+		ceph_destroy_options(ceph_opts);
 		kfree(rbd_opts);
 
 		return rbdc;
 	}
 	spin_unlock(&rbd_client_list_lock);
 
-	rbdc = rbd_client_create(opt, rbd_opts);
+	rbdc = rbd_client_create(ceph_opts, rbd_opts);
 
 	if (IS_ERR(rbdc))
 		kfree(rbd_opts);

From d1f57ea66369b5c34bd42f104b8070db409447f9 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 26 Jun 2012 12:57:03 -0700
Subject: [PATCH 074/132] rbd: kill num_reply parameters

Several functions include a num_reply parameter, but it is never
used.  Just get rid of it everywhere--it seems to be something
that never got fully implemented.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index cf6f49101794..b124442dab3a 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -864,7 +864,6 @@ static int rbd_do_request(struct request *rq,
 			  int num_pages,
 			  int flags,
 			  struct ceph_osd_req_op *ops,
-			  int num_reply,
 			  struct rbd_req_coll *coll,
 			  int coll_index,
 			  void (*rbd_cb)(struct ceph_osd_request *req,
@@ -1020,7 +1019,6 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			   int opcode,
 			   int flags,
 			   struct ceph_osd_req_op *orig_ops,
-			   int num_reply,
 			   const char *object_name,
 			   u64 ofs, u64 len,
 			   char *buf,
@@ -1056,7 +1054,6 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			  pages, num_pages,
 			  flags,
 			  ops,
-			  2,
 			  NULL, 0,
 			  NULL,
 			  linger_req, ver);
@@ -1081,7 +1078,7 @@ static int rbd_do_op(struct request *rq,
 		     struct rbd_device *rbd_dev,
 		     struct ceph_snap_context *snapc,
 		     u64 snapid,
-		     int opcode, int flags, int num_reply,
+		     int opcode, int flags,
 		     u64 ofs, u64 len,
 		     struct bio *bio,
 		     struct rbd_req_coll *coll,
@@ -1120,7 +1117,6 @@ static int rbd_do_op(struct request *rq,
 			     NULL, 0,
 			     flags,
 			     ops,
-			     num_reply,
 			     coll, coll_index,
 			     rbd_req_cb, 0, NULL);
 
@@ -1144,7 +1140,6 @@ static int rbd_req_write(struct request *rq,
 	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
 			 CEPH_OSD_OP_WRITE,
 			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			 2,
 			 ofs, len, bio, coll, coll_index);
 }
 
@@ -1163,7 +1158,6 @@ static int rbd_req_read(struct request *rq,
 			 snapid,
 			 CEPH_OSD_OP_READ,
 			 CEPH_OSD_FLAG_READ,
-			 2,
 			 ofs, len, bio, coll, coll_index);
 }
 
@@ -1183,7 +1177,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 			       CEPH_OSD_OP_READ,
 			       CEPH_OSD_FLAG_READ,
 			       NULL,
-			       1, object_name, ofs, len, buf, NULL, ver);
+			       object_name, ofs, len, buf, NULL, ver);
 }
 
 /*
@@ -1210,7 +1204,6 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 			  NULL, 0,
 			  CEPH_OSD_FLAG_READ,
 			  ops,
-			  1,
 			  NULL, 0,
 			  rbd_simple_req_cb, 0, NULL);
 
@@ -1266,7 +1259,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev,
 			      0,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      ops,
-			      1, object_name, 0, 0, NULL,
+			      object_name, 0, 0, NULL,
 			      &rbd_dev->watch_request, NULL);
 
 	if (ret < 0)
@@ -1304,7 +1297,7 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev,
 			      0,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      ops,
-			      1, object_name, 0, 0, NULL, NULL, NULL);
+			      object_name, 0, 0, NULL, NULL, NULL);
 
 	rbd_destroy_ops(ops);
 	ceph_osdc_cancel_event(rbd_dev->watch_event);
@@ -1362,7 +1355,7 @@ static int rbd_req_sync_notify(struct rbd_device *rbd_dev,
 			       0,
 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			       ops,
-			       1, object_name, 0, 0, NULL, NULL, NULL);
+			       object_name, 0, 0, NULL, NULL, NULL);
 	if (ret < 0)
 		goto fail_event;
 
@@ -1410,7 +1403,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 			       0,
 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			       ops,
-			       1, object_name, 0, 0, NULL, NULL, ver);
+			       object_name, 0, 0, NULL, NULL, ver);
 
 	rbd_destroy_ops(ops);
 

From 1fe60e51a3744528f3939b1b1167ca909133d9ae Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 16:23:22 -0700
Subject: [PATCH 075/132] libceph: move feature bits to separate header

This is simply cleanup that will keep things more closely synced with the
userland code.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
---
 fs/ceph/mds_client.c               |  1 +
 fs/ceph/super.c                    |  1 +
 include/linux/ceph/ceph_features.h | 24 ++++++++++++++++++++++++
 include/linux/ceph/ceph_fs.h       | 14 --------------
 include/linux/ceph/libceph.h       |  6 ------
 net/ceph/ceph_common.c             |  5 +++--
 6 files changed, 29 insertions(+), 22 deletions(-)
 create mode 100644 include/linux/ceph/ceph_features.h

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 418f6a82c90d..39b76d66bc5d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -10,6 +10,7 @@
 #include "super.h"
 #include "mds_client.h"
 
+#include <linux/ceph/ceph_features.h>
 #include <linux/ceph/messenger.h>
 #include <linux/ceph/decode.h>
 #include <linux/ceph/pagelist.h>
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 1e67dd7305a4..2c47ecfe4373 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -18,6 +18,7 @@
 #include "super.h"
 #include "mds_client.h"
 
+#include <linux/ceph/ceph_features.h>
 #include <linux/ceph/decode.h>
 #include <linux/ceph/mon_client.h>
 #include <linux/ceph/auth.h>
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
new file mode 100644
index 000000000000..342f93dbe162
--- /dev/null
+++ b/include/linux/ceph/ceph_features.h
@@ -0,0 +1,24 @@
+#ifndef __CEPH_FEATURES
+#define __CEPH_FEATURES
+
+/*
+ * feature bits
+ */
+#define CEPH_FEATURE_UID            (1<<0)
+#define CEPH_FEATURE_NOSRCADDR      (1<<1)
+#define CEPH_FEATURE_MONCLOCKCHECK  (1<<2)
+#define CEPH_FEATURE_FLOCK          (1<<3)
+#define CEPH_FEATURE_SUBSCRIBE2     (1<<4)
+#define CEPH_FEATURE_MONNAMES       (1<<5)
+#define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
+#define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
+
+/*
+ * Features supported.
+ */
+#define CEPH_FEATURES_SUPPORTED_DEFAULT  \
+	(CEPH_FEATURE_NOSRCADDR)
+
+#define CEPH_FEATURES_REQUIRED_DEFAULT   \
+	(CEPH_FEATURE_NOSRCADDR)
+#endif
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index e81ab30d4896..d021610efd65 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -35,20 +35,6 @@
 /* arbitrary limit on max # of monitors (cluster of 3 is typical) */
 #define CEPH_MAX_MON   31
 
-
-/*
- * feature bits
- */
-#define CEPH_FEATURE_UID            (1<<0)
-#define CEPH_FEATURE_NOSRCADDR      (1<<1)
-#define CEPH_FEATURE_MONCLOCKCHECK  (1<<2)
-#define CEPH_FEATURE_FLOCK          (1<<3)
-#define CEPH_FEATURE_SUBSCRIBE2     (1<<4)
-#define CEPH_FEATURE_MONNAMES       (1<<5)
-#define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
-#define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
-
-
 /*
  * ceph_file_layout - describe data layout for a file/inode
  */
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 98ec36ae8a3b..ea072e1f9db9 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -22,12 +22,6 @@
 #include "osd_client.h"
 #include "ceph_fs.h"
 
-/*
- * Supported features
- */
-#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR
-#define CEPH_FEATURE_REQUIRED_DEFAULT  CEPH_FEATURE_NOSRCADDR
-
 /*
  * mount options
  */
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 3b45e01fa8d1..69e38db28e5f 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -17,6 +17,7 @@
 #include <linux/string.h>
 
 
+#include <linux/ceph/ceph_features.h>
 #include <linux/ceph/libceph.h>
 #include <linux/ceph/debugfs.h>
 #include <linux/ceph/decode.h>
@@ -460,9 +461,9 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
 	client->auth_err = 0;
 
 	client->extra_mon_dispatch = NULL;
-	client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT |
+	client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT |
 		supported_features;
-	client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT |
+	client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT |
 		required_features;
 
 	/* msgr */

From 546f04ef716dd49521774653d8b032a7d64c05d9 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 18:15:23 -0700
Subject: [PATCH 076/132] libceph: support crush tunables

The server side recently added support for tuning some magic
crush variables. Decode these variables if they are present, or use the
default values if they are not present.

Corresponds to ceph.git commit 89af369c25f274fe62ef730e5e8aad0c54f1e5a5.

Signed-off-by: caleb miles <caleb.miles@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
---
 include/linux/ceph/ceph_features.h |  5 +++-
 include/linux/crush/crush.h        |  8 ++++++
 net/ceph/crush/mapper.c            | 13 +++++-----
 net/ceph/osdmap.c                  | 39 ++++++++++++++++++++++++++++++
 4 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 342f93dbe162..dad579b0c0e6 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -12,12 +12,15 @@
 #define CEPH_FEATURE_MONNAMES       (1<<5)
 #define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
 #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
+/* bits 8-17 defined by user-space; not supported yet here */
+#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
 
 /*
  * Features supported.
  */
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
-	(CEPH_FEATURE_NOSRCADDR)
+	(CEPH_FEATURE_NOSRCADDR |	 \
+	 CEPH_FEATURE_CRUSH_TUNABLES)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT   \
 	(CEPH_FEATURE_NOSRCADDR)
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 7c4750811b96..25baa287cff7 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -154,6 +154,14 @@ struct crush_map {
 	__s32 max_buckets;
 	__u32 max_rules;
 	__s32 max_devices;
+
+	/* choose local retries before re-descent */
+	__u32 choose_local_tries;
+	/* choose local attempts using a fallback permutation before
+	 * re-descent */
+	__u32 choose_local_fallback_tries;
+	/* choose attempts before giving up */ 
+	__u32 choose_total_tries;
 };
 
 
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index d7edc24333b8..35fce755ce10 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map,
 	int item = 0;
 	int itemtype;
 	int collide, reject;
-	const unsigned int orig_tries = 5; /* attempts before we fall back to search */
 
 	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
 		bucket->id, x, outpos, numrep);
@@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map,
 					reject = 1;
 					goto reject;
 				}
-				if (flocal >= (in->size>>1) &&
-				    flocal > orig_tries)
+				if (map->choose_local_fallback_tries > 0 &&
+				    flocal >= (in->size>>1) &&
+				    flocal > map->choose_local_fallback_tries)
 					item = bucket_perm_choose(in, x, r);
 				else
 					item = crush_bucket_choose(in, x, r);
@@ -422,13 +422,14 @@ reject:
 					ftotal++;
 					flocal++;
 
-					if (collide && flocal < 3)
+					if (collide && flocal <= map->choose_local_tries)
 						/* retry locally a few times */
 						retry_bucket = 1;
-					else if (flocal <= in->size + orig_tries)
+					else if (map->choose_local_fallback_tries > 0 &&
+						 flocal <= in->size + map->choose_local_fallback_tries)
 						/* exhaustive bucket search */
 						retry_bucket = 1;
-					else if (ftotal < 20)
+					else if (ftotal <= map->choose_total_tries)
 						/* then retry descent */
 						retry_descent = 1;
 					else
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 9600674c2c39..3124b71a8883 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -135,6 +135,21 @@ bad:
 	return -EINVAL;
 }
 
+static int skip_name_map(void **p, void *end)
+{
+        int len;
+        ceph_decode_32_safe(p, end, len ,bad);
+        while (len--) {
+                int strlen;
+                *p += sizeof(u32);
+                ceph_decode_32_safe(p, end, strlen, bad);
+                *p += strlen;
+}
+        return 0;
+bad:
+        return -EINVAL;
+}
+
 static struct crush_map *crush_decode(void *pbyval, void *end)
 {
 	struct crush_map *c;
@@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 	void **p = &pbyval;
 	void *start = pbyval;
 	u32 magic;
+	u32 num_name_maps;
 
 	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
 
@@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 	if (c == NULL)
 		return ERR_PTR(-ENOMEM);
 
+        /* set tunables to default values */
+        c->choose_local_tries = 2;
+        c->choose_local_fallback_tries = 5;
+        c->choose_total_tries = 19;
+
 	ceph_decode_need(p, end, 4*sizeof(u32), bad);
 	magic = ceph_decode_32(p);
 	if (magic != CRUSH_MAGIC) {
@@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 	}
 
 	/* ignore trailing name maps. */
+        for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
+                err = skip_name_map(p, end);
+                if (err < 0)
+                        goto done;
+        }
 
+        /* tunables */
+        ceph_decode_need(p, end, 3*sizeof(u32), done);
+        c->choose_local_tries = ceph_decode_32(p);
+        c->choose_local_fallback_tries =  ceph_decode_32(p);
+        c->choose_total_tries = ceph_decode_32(p);
+        dout("crush decode tunable choose_local_tries = %d",
+             c->choose_local_tries);
+        dout("crush decode tunable choose_local_fallback_tries = %d",
+             c->choose_local_fallback_tries);
+        dout("crush decode tunable choose_total_tries = %d",
+             c->choose_total_tries);
+
+done:
 	dout("crush_decode success\n");
 	return c;
 

From 3a140a0d5c4b9e35373b016e41dfc85f1e526bdb Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 16:24:21 -0700
Subject: [PATCH 077/132] libceph: report socket read/write error message

We need to set error_msg to something useful before calling ceph_fault();
do so here for try_{read,write}().  This is more informative than

libceph: osd0 192.168.106.220:6801 (null)

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
---
 net/ceph/messenger.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 63e1252d3af5..6e2f67816f61 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2287,14 +2287,18 @@ restart:
 	ret = try_read(con);
 	if (ret == -EAGAIN)
 		goto restart;
-	if (ret < 0)
+	if (ret < 0) {
+		con->error_msg = "socket error on read";
 		goto fault;
+	}
 
 	ret = try_write(con);
 	if (ret == -EAGAIN)
 		goto restart;
-	if (ret < 0)
+	if (ret < 0) {
+		con->error_msg = "socket error on write";
 		goto fault;
+	}
 
 done:
 	mutex_unlock(&con->mutex);

From 8c50c817566dfa4581f82373aac39f3e608a7dc8 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 16:24:37 -0700
Subject: [PATCH 078/132] libceph: fix mutex coverage for ceph_con_close

Hold the mutex while twiddling all of the state bits to avoid possible
races.  While we're here, make not of why we cannot close the socket
directly.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
---
 net/ceph/messenger.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 6e2f67816f61..e65b15d5d8b9 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -503,6 +503,7 @@ static void reset_connection(struct ceph_connection *con)
  */
 void ceph_con_close(struct ceph_connection *con)
 {
+	mutex_lock(&con->mutex);
 	dout("con_close %p peer %s\n", con,
 	     ceph_pr_addr(&con->peer_addr.in_addr));
 	clear_bit(NEGOTIATING, &con->state);
@@ -515,11 +516,16 @@ void ceph_con_close(struct ceph_connection *con)
 	clear_bit(KEEPALIVE_PENDING, &con->flags);
 	clear_bit(WRITE_PENDING, &con->flags);
 
-	mutex_lock(&con->mutex);
 	reset_connection(con);
 	con->peer_global_seq = 0;
 	cancel_delayed_work(&con->work);
 	mutex_unlock(&con->mutex);
+
+	/*
+	 * We cannot close the socket directly from here because the
+	 * work threads use it without holding the mutex.  Instead, let
+	 * con_work() do it.
+	 */
 	queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_close);

From 6194ea895e447fdf4adfd23f67873a32bf4f15ae Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 16:19:28 -0700
Subject: [PATCH 079/132] libceph: resubmit linger ops when pg mapping changes

The linger op registration (i.e., watch) modifies the object state.  As
such, the OSD will reply with success if it has already applied without
doing the associated side-effects (setting up the watch session state).
If we lose the ACK and resubmit, we will see success but the watch will not
be correctly registered and we won't get notifies.

To fix this, always resubmit the linger op with a new tid.  We accomplish
this by re-registering as a linger (i.e., 'registered') if we are not yet
registered.  Then the second loop will treat this just like a normal
case of re-registering.

This mirrors a similar fix on the userland ceph.git, commit 5dd68b95, and
ceph bug #2796.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
---
 net/ceph/osd_client.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 07920cac31a6..c605705c2f57 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -891,7 +891,9 @@ static void __register_linger_request(struct ceph_osd_client *osdc,
 {
 	dout("__register_linger_request %p\n", req);
 	list_add_tail(&req->r_linger_item, &osdc->req_linger);
-	list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests);
+	if (req->r_osd)
+		list_add_tail(&req->r_linger_osd,
+			      &req->r_osd->o_linger_requests);
 }
 
 static void __unregister_linger_request(struct ceph_osd_client *osdc,
@@ -1305,8 +1307,9 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
 
 	dout("kick_requests %s\n", force_resend ? " (force resend)" : "");
 	mutex_lock(&osdc->request_mutex);
-	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+	for (p = rb_first(&osdc->requests); p; ) {
 		req = rb_entry(p, struct ceph_osd_request, r_node);
+		p = rb_next(p);
 		err = __map_request(osdc, req, force_resend);
 		if (err < 0)
 			continue;  /* error */
@@ -1314,10 +1317,23 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
 			dout("%p tid %llu maps to no osd\n", req, req->r_tid);
 			needmap++;  /* request a newer map */
 		} else if (err > 0) {
-			dout("%p tid %llu requeued on osd%d\n", req, req->r_tid,
-			     req->r_osd ? req->r_osd->o_osd : -1);
-			if (!req->r_linger)
+			if (!req->r_linger) {
+				dout("%p tid %llu requeued on osd%d\n", req,
+				     req->r_tid,
+				     req->r_osd ? req->r_osd->o_osd : -1);
 				req->r_flags |= CEPH_OSD_FLAG_RETRY;
+			}
+		}
+		if (req->r_linger && list_empty(&req->r_linger_item)) {
+			/*
+			 * register as a linger so that we will
+			 * re-submit below and get a new tid
+			 */
+			dout("%p tid %llu restart on osd%d\n",
+			     req, req->r_tid,
+			     req->r_osd ? req->r_osd->o_osd : -1);
+			__register_linger_request(osdc, req);
+			__unregister_request(osdc, req);
 		}
 	}
 

From a4107026976f06c9a6ce8cc84a763564ee39d901 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 16:20:25 -0700
Subject: [PATCH 080/132] libceph: (re)initialize bio_iter on start of message
 receive

Previously, we were opportunistically initializing the bio_iter if it
appeared to be uninitialized in the middle of the read path.  The problem
is that a sequence like:

 - start reading message
 - initialize bio_iter
 - read half a message
 - messenger fault, reconnect
 - restart reading message
 - ** bio_iter now non-NULL, not reinitialized **
 - read past end of bio, crash

Instead, initialize the bio_iter unconditionally when we allocate/claim
the message for read.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
---
 net/ceph/messenger.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index e65b15d5d8b9..f1bd3bbb0c46 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1872,6 +1872,11 @@ static int read_partial_message(struct ceph_connection *con)
 		else
 			con->in_msg_pos.page_pos = 0;
 		con->in_msg_pos.data_pos = 0;
+
+#ifdef CONFIG_BLOCK
+		if (m->bio)
+			init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
+#endif
 	}
 
 	/* front */
@@ -1888,10 +1893,6 @@ static int read_partial_message(struct ceph_connection *con)
 		if (ret <= 0)
 			return ret;
 	}
-#ifdef CONFIG_BLOCK
-	if (m->bio && !m->bio_iter)
-		init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
-#endif
 
 	/* (page) data */
 	while (con->in_msg_pos.data_pos < data_len) {
@@ -1902,7 +1903,7 @@ static int read_partial_message(struct ceph_connection *con)
 				return ret;
 #ifdef CONFIG_BLOCK
 		} else if (m->bio) {
-
+			BUG_ON(!m->bio_iter);
 			ret = read_partial_message_bio(con,
 						 &m->bio_iter, &m->bio_seg,
 						 data_len, do_datacrc);

From a53aab645c82f0146e35684b34692c69b5118121 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 16:21:17 -0700
Subject: [PATCH 081/132] ceph: close old con before reopening on mds reconnect

When we detect a mds session reset, close the old ceph_connection before
reopening it.  This ensures we clean up the old socket properly and keep
the ceph_connection state correct.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
---
 fs/ceph/mds_client.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 39b76d66bc5d..a5a735422aa7 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2518,6 +2518,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
 	session->s_seq = 0;
 
+	ceph_con_close(&session->s_con);
 	ceph_con_open(&session->s_con,
 		      CEPH_ENTITY_TYPE_MDS, mds,
 		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));

From 5469155f2bc83bb2c88b0a0370c3d54d87eed06e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 16:21:40 -0700
Subject: [PATCH 082/132] libceph: protect ceph_con_open() with mutex

Take the con mutex while we are initiating a ceph open.  This is necessary
because the may have previously been in use and then closed, which could
result in a racing workqueue running con_work().

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/messenger.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index f1bd3bbb0c46..a4779988c847 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -537,6 +537,7 @@ void ceph_con_open(struct ceph_connection *con,
 		   __u8 entity_type, __u64 entity_num,
 		   struct ceph_entity_addr *addr)
 {
+	mutex_lock(&con->mutex);
 	dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
 	set_bit(OPENING, &con->state);
 	WARN_ON(!test_and_clear_bit(CLOSED, &con->state));
@@ -546,6 +547,7 @@ void ceph_con_open(struct ceph_connection *con,
 
 	memcpy(&con->peer_addr, addr, sizeof(*addr));
 	con->delay = 0;      /* reset backoff memory */
+	mutex_unlock(&con->mutex);
 	queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_open);

From 85effe183dd45854d1ad1a370b88cddb403c4c91 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 16:22:05 -0700
Subject: [PATCH 083/132] libceph: reset connection retry on successfully
 negotiation

We exponentially back off when we encounter connection errors.  If several
errors accumulate, we will eventually wait ages before even trying to
reconnect.

Fix this by resetting the backoff counter after a successful negotiation/
connection with the remote node.  Fixes ceph issue #2802.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/messenger.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index a4779988c847..07204f19e856 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1629,6 +1629,8 @@ static int process_connect(struct ceph_connection *con)
 		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
 			set_bit(LOSSYTX, &con->flags);
 
+		con->delay = 0;      /* reset backoff memory */
+
 		prepare_read_tag(con);
 		break;
 

From 21ec6ffa46719a4ed45531b5b01014c26f0416c4 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Fri, 20 Jul 2012 08:18:36 -0500
Subject: [PATCH 084/132] ceph: fix potential double free

We re-run the loop but we don't re-set the attrs pointer back to NULL.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/xattr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 785cb3057c95..2c2ae5be9902 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -457,6 +457,7 @@ start:
 			for (i = 0; i < numattr; i++)
 				kfree(xattrs[i]);
 			kfree(xattrs);
+			xattrs = NULL;
 			goto start;
 		}
 		err = -EIO;

From 048a9d2d069e3d63c9169de82649be00de65a8f6 Mon Sep 17 00:00:00 2001
From: Jiaju Zhang <jjzhang@suse.de>
Date: Fri, 20 Jul 2012 08:18:36 -0500
Subject: [PATCH 085/132] libceph: trivial fix for the incorrect debug output

This is a trivial fix for the debug output, as it is inconsistent
with the function name so may confuse people when debugging.

[elder@inktank.com: switched to use __func__]

Signed-off-by: Jiaju Zhang <jjzhang@suse.de>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/osd_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index c605705c2f57..ad427e698890 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -688,7 +688,7 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 
 static void remove_all_osds(struct ceph_osd_client *osdc)
 {
-	dout("__remove_old_osds %p\n", osdc);
+	dout("%s %p\n", __func__, osdc);
 	mutex_lock(&osdc->request_mutex);
 	while (!RB_EMPTY_ROOT(&osdc->osds)) {
 		struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),

From e88a36ec961b8c1899c59c5e4ae35a318c0209d3 Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@inktank.com>
Date: Mon, 21 Nov 2011 18:14:25 -0800
Subject: [PATCH 086/132] rbd: return errors for mapped but deleted snapshot

When a snapshot is deleted, the OSD will return ENOENT when reading
from it. This is normally interpreted as a hole by rbd, which will
return zeroes. To minimize the time in which this can happen, stop
requests early when we are notified that our snapshot no longer
exists.

[elder@inktank.com: updated __rbd_init_snaps_header() logic]

Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 drivers/block/rbd.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index b124442dab3a..730d0ce505e1 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -172,9 +172,13 @@ struct rbd_device {
 
 	/* protects updating the header */
 	struct rw_semaphore     header_rwsem;
+	/* name of the snapshot this device reads from */
 	char                    *snap_name;
+	/* id of the snapshot this device reads from */
 	u64                     snap_id;	/* current snapshot id */
-	int read_only;
+	/* whether the snap_id this device reads from still exists */
+	bool                    snap_exists;
+	int                     read_only;
 
 	struct list_head	node;
 
@@ -597,6 +601,7 @@ static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
 		else
 			snapc->seq = 0;
 		rbd_dev->snap_id = CEPH_NOSNAP;
+		rbd_dev->snap_exists = false;
 		rbd_dev->read_only = 0;
 		if (size)
 			*size = header->image_size;
@@ -606,6 +611,7 @@ static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
 		if (ret < 0)
 			goto done;
 		rbd_dev->snap_id = snapc->seq;
+		rbd_dev->snap_exists = true;
 		rbd_dev->read_only = 1;
 	}
 
@@ -1468,6 +1474,21 @@ static void rbd_rq_fn(struct request_queue *q)
 
 		spin_unlock_irq(q->queue_lock);
 
+		if (rbd_dev->snap_id != CEPH_NOSNAP) {
+			bool snap_exists;
+
+			down_read(&rbd_dev->header_rwsem);
+			snap_exists = rbd_dev->snap_exists;
+			up_read(&rbd_dev->header_rwsem);
+
+			if (!snap_exists) {
+				dout("request for non-existent snapshot");
+				spin_lock_irq(q->queue_lock);
+				__blk_end_request_all(rq, -ENXIO);
+				continue;
+			}
+		}
+
 		dout("%s 0x%x bytes at 0x%llx\n",
 		     do_write ? "write" : "read",
 		     size, blk_rq_pos(rq) * SECTOR_SIZE);
@@ -2088,7 +2109,14 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
 			cur_id = rbd_dev->header.snapc->snaps[i - 1];
 
 		if (!i || old_snap->id < cur_id) {
-			/* old_snap->id was skipped, thus was removed */
+			/*
+			 * old_snap->id was skipped, thus was
+			 * removed.  If this rbd_dev is mapped to
+			 * the removed snapshot, record that it no
+			 * longer exists, to prevent further I/O.
+			 */
+			if (rbd_dev->snap_id == old_snap->id)
+				rbd_dev->snap_exists = false;
 			__rbd_remove_snap_dev(rbd_dev, old_snap);
 			continue;
 		}

From 474ef7ce832d471148f63a9d07f67fc5564834f1 Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@inktank.com>
Date: Mon, 21 Nov 2011 17:13:54 -0800
Subject: [PATCH 087/132] rbd: only reset capacity when pointing to head

Snapshots cannot be resized, and the new capacity of head should not
be reflected by the snapshot.

Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 drivers/block/rbd.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 730d0ce505e1..f171cebabda9 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1725,7 +1725,12 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev)
 		return ret;
 
 	/* resized? */
-	set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE);
+	if (rbd_dev->snap_id == CEPH_NOSNAP) {
+		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
+
+		dout("setting size to %llu sectors", (unsigned long long) size);
+		set_capacity(rbd_dev->disk, size);
+	}
 
 	down_write(&rbd_dev->header_rwsem);
 

From a51aa0c042fa39946dd017d5f91a073300a71577 Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@dreamhost.com>
Date: Mon, 5 Dec 2011 10:35:04 -0800
Subject: [PATCH 088/132] rbd: expose the correct size of the device in sysfs

If an image was mapped to a snapshot, the size of the head version
would be shown. Protect capacity with header_rwsem, since it may
change.

Signed-off-by: Josh Durgin <josh.durgin@dreamhost.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 drivers/block/rbd.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index f171cebabda9..9c3a1db49ac4 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1724,6 +1724,8 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev)
 	if (ret < 0)
 		return ret;
 
+	down_write(&rbd_dev->header_rwsem);
+
 	/* resized? */
 	if (rbd_dev->snap_id == CEPH_NOSNAP) {
 		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
@@ -1732,8 +1734,6 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev)
 		set_capacity(rbd_dev->disk, size);
 	}
 
-	down_write(&rbd_dev->header_rwsem);
-
 	snap_seq = rbd_dev->header.snapc->seq;
 	if (rbd_dev->header.total_snaps &&
 	    rbd_dev->header.snapc->snaps[0] == snap_seq)
@@ -1853,8 +1853,13 @@ static ssize_t rbd_size_show(struct device *dev,
 			     struct device_attribute *attr, char *buf)
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+	sector_t size;
 
-	return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
+	down_read(&rbd_dev->header_rwsem);
+	size = get_capacity(rbd_dev->disk);
+	up_read(&rbd_dev->header_rwsem);
+
+	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
 }
 
 static ssize_t rbd_major_show(struct device *dev,

From 93a24e084d67ba2fcb9a4c289135825b623ec864 Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@dreamhost.com>
Date: Mon, 5 Dec 2011 10:41:28 -0800
Subject: [PATCH 089/132] rbd: set image size when header is updated

The image may have been resized.

Signed-off-by: Josh Durgin <josh.durgin@dreamhost.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 drivers/block/rbd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 9c3a1db49ac4..a6bbda2e5eb8 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1746,6 +1746,7 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev)
 	kfree(rbd_dev->header.snap_names);
 	kfree(rbd_dev->header.snapc);
 
+	rbd_dev->header.image_size = h.image_size;
 	rbd_dev->header.total_snaps = h.total_snaps;
 	rbd_dev->header.snapc = h.snapc;
 	rbd_dev->header.snap_names = h.snap_names;

From d1d25646543134d756a02ffe4e02073faa761f2c Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@dreamhost.com>
Date: Mon, 5 Dec 2011 14:03:05 -0800
Subject: [PATCH 090/132] rbd: use reference counting for the snap context

This prevents a race between requests with a given snap context and
header updates that free it. The osd client was already expecting the
snap context to be reference counted, since it get()s it in
ceph_osdc_build_request and put()s it when the request completes.

Also remove the second down_read()/up_read() on header_rwsem in
rbd_do_request, which wasn't actually preventing this race or
protecting any other data.

Signed-off-by: Josh Durgin <josh.durgin@dreamhost.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 drivers/block/rbd.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a6bbda2e5eb8..988f94458f95 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -626,7 +626,7 @@ static void rbd_header_free(struct rbd_image_header *header)
 	kfree(header->object_prefix);
 	kfree(header->snap_sizes);
 	kfree(header->snap_names);
-	kfree(header->snapc);
+	ceph_put_snap_context(header->snapc);
 }
 
 /*
@@ -902,13 +902,10 @@ static int rbd_do_request(struct request *rq,
 	dout("rbd_do_request object_name=%s ofs=%lld len=%lld\n",
 		object_name, len, ofs);
 
-	down_read(&rbd_dev->header_rwsem);
-
 	osdc = &rbd_dev->rbd_client->client->osdc;
 	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
 					false, GFP_NOIO, pages, bio);
 	if (!req) {
-		up_read(&rbd_dev->header_rwsem);
 		ret = -ENOMEM;
 		goto done_pages;
 	}
@@ -942,7 +939,6 @@ static int rbd_do_request(struct request *rq,
 				snapc,
 				&mtime,
 				req->r_oid, req->r_oid_len);
-	up_read(&rbd_dev->header_rwsem);
 
 	if (linger_req) {
 		ceph_osdc_set_request_linger(osdc, req);
@@ -1448,6 +1444,7 @@ static void rbd_rq_fn(struct request_queue *q)
 		u64 ofs;
 		int num_segs, cur_seg = 0;
 		struct rbd_req_coll *coll;
+		struct ceph_snap_context *snapc;
 
 		/* peek at request from block layer */
 		if (!rq)
@@ -1474,21 +1471,20 @@ static void rbd_rq_fn(struct request_queue *q)
 
 		spin_unlock_irq(q->queue_lock);
 
-		if (rbd_dev->snap_id != CEPH_NOSNAP) {
-			bool snap_exists;
+		down_read(&rbd_dev->header_rwsem);
 
-			down_read(&rbd_dev->header_rwsem);
-			snap_exists = rbd_dev->snap_exists;
+		if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
 			up_read(&rbd_dev->header_rwsem);
-
-			if (!snap_exists) {
-				dout("request for non-existent snapshot");
-				spin_lock_irq(q->queue_lock);
-				__blk_end_request_all(rq, -ENXIO);
-				continue;
-			}
+			dout("request for non-existent snapshot");
+			spin_lock_irq(q->queue_lock);
+			__blk_end_request_all(rq, -ENXIO);
+			continue;
 		}
 
+		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
+
+		up_read(&rbd_dev->header_rwsem);
+
 		dout("%s 0x%x bytes at 0x%llx\n",
 		     do_write ? "write" : "read",
 		     size, blk_rq_pos(rq) * SECTOR_SIZE);
@@ -1498,6 +1494,7 @@ static void rbd_rq_fn(struct request_queue *q)
 		if (!coll) {
 			spin_lock_irq(q->queue_lock);
 			__blk_end_request_all(rq, -ENOMEM);
+			ceph_put_snap_context(snapc);
 			continue;
 		}
 
@@ -1521,7 +1518,7 @@ static void rbd_rq_fn(struct request_queue *q)
 			/* init OSD command: write or read */
 			if (do_write)
 				rbd_req_write(rq, rbd_dev,
-					      rbd_dev->header.snapc,
+					      snapc,
 					      ofs,
 					      op_size, bio,
 					      coll, cur_seg);
@@ -1544,6 +1541,8 @@ next_seg:
 		if (bp)
 			bio_pair_release(bp);
 		spin_lock_irq(q->queue_lock);
+
+		ceph_put_snap_context(snapc);
 	}
 }
 
@@ -1744,7 +1743,8 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev)
 	/* rbd_dev->header.object_prefix shouldn't change */
 	kfree(rbd_dev->header.snap_sizes);
 	kfree(rbd_dev->header.snap_names);
-	kfree(rbd_dev->header.snapc);
+	/* osd requests may still refer to snapc */
+	ceph_put_snap_context(rbd_dev->header.snapc);
 
 	rbd_dev->header.image_size = h.image_size;
 	rbd_dev->header.total_snaps = h.total_snaps;

From a71b891bc7d77a070e723c8c53d1dd73cf931555 Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@dreamhost.com>
Date: Mon, 5 Dec 2011 18:10:44 -0800
Subject: [PATCH 091/132] rbd: send header version when notifying

Previously the original header version was sent. Now, we update it
when the header changes.

Signed-off-by: Josh Durgin <josh.durgin@dreamhost.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 drivers/block/rbd.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 988f94458f95..4d3a1e02130b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1197,7 +1197,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 	if (ret < 0)
 		return ret;
 
-	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
+	ops[0].watch.ver = cpu_to_le64(ver);
 	ops[0].watch.cookie = notify_id;
 	ops[0].watch.flag = 0;
 
@@ -1216,6 +1216,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 {
 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
+	u64 hver;
 	int rc;
 
 	if (!rbd_dev)
@@ -1225,12 +1226,13 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 		rbd_dev->header_name, notify_id, (int) opcode);
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 	rc = __rbd_refresh_header(rbd_dev);
+	hver = rbd_dev->header.obj_version;
 	mutex_unlock(&ctl_mutex);
 	if (rc)
 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
 			   " update snaps: %d\n", rbd_dev->major, rc);
 
-	rbd_req_sync_notify_ack(rbd_dev, ver, notify_id, rbd_dev->header_name);
+	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id, rbd_dev->header_name);
 }
 
 /*
@@ -1746,6 +1748,7 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev)
 	/* osd requests may still refer to snapc */
 	ceph_put_snap_context(rbd_dev->header.snapc);
 
+	rbd_dev->header.obj_version = h.obj_version;
 	rbd_dev->header.image_size = h.image_size;
 	rbd_dev->header.total_snaps = h.total_snaps;
 	rbd_dev->header.snapc = h.snapc;

From 75fe9e19816d6ed3e90f1bd3b741f99bf030e848 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 19 Jul 2012 08:49:18 -0500
Subject: [PATCH 092/132] rbd: don't use snapc->seq that way

In what appears to be an artifact of a different way of encoding
whether an rbd image maps a snapshot, __rbd_refresh_header() has
code that arranges to update the seq value in an rbd image's
snapshot context to point to the first entry in its snapshot
array if that's where it was pointing initially.

We now use rbd_dev->snap_id to record the snapshot id--using the
special value CEPH_NOSNAP to indicate the rbd_dev is not mapping a
snapshot at all.

There is therefore no need to check for this case, nor to update the
seq value, in __rbd_refresh_header().  Just preserve the seq value
that rbd_read_header() provides (which, at the moment, is nothing).

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4d3a1e02130b..8a4659997e05 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1718,8 +1718,6 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev)
 {
 	int ret;
 	struct rbd_image_header h;
-	u64 snap_seq;
-	int follow_seq = 0;
 
 	ret = rbd_read_header(rbd_dev, &h);
 	if (ret < 0)
@@ -1735,13 +1733,6 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev)
 		set_capacity(rbd_dev->disk, size);
 	}
 
-	snap_seq = rbd_dev->header.snapc->seq;
-	if (rbd_dev->header.total_snaps &&
-	    rbd_dev->header.snapc->snaps[0] == snap_seq)
-		/* pointing at the head, will need to follow that
-		   if head moves */
-		follow_seq = 1;
-
 	/* rbd_dev->header.object_prefix shouldn't change */
 	kfree(rbd_dev->header.snap_sizes);
 	kfree(rbd_dev->header.snap_names);
@@ -1759,11 +1750,6 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev)
 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
 	kfree(h.object_prefix);
 
-	if (follow_seq)
-		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
-	else
-		rbd_dev->header.snapc->seq = snap_seq;
-
 	ret = __rbd_init_snaps_header(rbd_dev);
 
 	up_write(&rbd_dev->header_rwsem);

From 78dc447d3ca3701206a1dd813c901556a3fad451 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 19 Jul 2012 08:49:18 -0500
Subject: [PATCH 093/132] rbd: preserve snapc->seq in rbd_header_set_snap()

In rbd_header_set_snap(), there is logic to make the snap context's
seq field get set to a particular snapshot id, or 0 if there is no
snapshot for the rbd image.

This seems to be an artifact of how the current snapshot id for an
rbd_dev was recorded before the rbd_dev->snap_id field began to be
used for that purpose.

There's no need to update the value of snapc->seq here any more, so
stop doing it.  Tidy up a few local variables in that function
while we're at it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8a4659997e05..ac8a83fc2ad9 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -588,29 +588,25 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
 
 static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
 {
-	struct rbd_image_header *header = &rbd_dev->header;
-	struct ceph_snap_context *snapc = header->snapc;
-	int ret = -ENOENT;
+	int ret;
 
 	down_write(&rbd_dev->header_rwsem);
 
 	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
 		    sizeof (RBD_SNAP_HEAD_NAME))) {
-		if (header->total_snaps)
-			snapc->seq = header->snap_seq;
-		else
-			snapc->seq = 0;
 		rbd_dev->snap_id = CEPH_NOSNAP;
 		rbd_dev->snap_exists = false;
 		rbd_dev->read_only = 0;
 		if (size)
-			*size = header->image_size;
+			*size = rbd_dev->header.image_size;
 	} else {
-		ret = snap_by_name(header, rbd_dev->snap_name,
-					&snapc->seq, size);
+		u64 snap_id = 0;
+
+		ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
+					&snap_id, size);
 		if (ret < 0)
 			goto done;
-		rbd_dev->snap_id = snapc->seq;
+		rbd_dev->snap_id = snap_id;
 		rbd_dev->snap_exists = true;
 		rbd_dev->read_only = 1;
 	}

From 505cbb9bedc8c609c31d86ff4f8f656e5a0f9c49 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 19 Jul 2012 08:49:18 -0500
Subject: [PATCH 094/132] rbd: set snapc->seq only when refreshing header

In rbd_header_add_snap() there is code to set snapc->seq to the
just-added snapshot id.  This is the only remnant left of the
use of that field for recording which snapshot an rbd_dev was
associated with.  That functionality is no longer supported,
so get rid of that final bit of code.

Doing so means we never actually set snapc->seq any more.  On the
server, the snapshot context's sequence value represents the highest
snapshot id ever issued for a particular rbd image.  So we'll make
it have that meaning here as well.  To do so, set this value
whenever the rbd header is (re-)read.  That way it will always be
consistent with the rest of the snapshot context we maintain.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ac8a83fc2ad9..c299a55e3ff1 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -537,6 +537,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 
 	atomic_set(&header->snapc->nref, 1);
 	header->snap_seq = le64_to_cpu(ondisk->snap_seq);
+	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
 	header->snapc->num_snaps = snap_count;
 	header->total_snaps = snap_count;
 
@@ -1685,14 +1686,7 @@ static int rbd_header_add_snap(struct rbd_device *rbd_dev,
 
 	kfree(data);
 
-	if (ret < 0)
-		return ret;
-
-	down_write(&rbd_dev->header_rwsem);
-	rbd_dev->header.snapc->seq = new_snapid;
-	up_write(&rbd_dev->header_rwsem);
-
-	return 0;
+	return ret < 0 ? ret : 0;
 bad:
 	return -ERANGE;
 }

From 9e15dc735a7a0418be14e2deab44ddee369af857 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 19 Jul 2012 08:49:18 -0500
Subject: [PATCH 095/132] rbd: kill rbd_image_header->snap_seq

The snap_seq field in an rbd_image_header structure held the value
from the rbd image header when it was last refreshed.  We now
maintain this value in the snapc->seq field.  So get rid of the
other one.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c299a55e3ff1..6df8c62c40f5 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -82,7 +82,6 @@ struct rbd_image_header {
 	__u8 comp_type;
 	struct ceph_snap_context *snapc;
 	size_t snap_names_len;
-	u64 snap_seq;
 	u32 total_snaps;
 
 	char *snap_names;
@@ -536,7 +535,6 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 	header->comp_type = ondisk->options.comp_type;
 
 	atomic_set(&header->snapc->nref, 1);
-	header->snap_seq = le64_to_cpu(ondisk->snap_seq);
 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
 	header->snapc->num_snaps = snap_count;
 	header->total_snaps = snap_count;

From a66f8c97a31fd7b2cfd7b86d4789858dbfbedffb Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 19 Jul 2012 09:09:27 -0500
Subject: [PATCH 096/132] rbd: drop extra header_rwsem init

In commit c666601a there was inadvertently added an extra
initialization of rbd_dev->header_rwsem.  This gets rid of the
duplicate.

Reported-by: Guangliang Zhao <gzhao@suse.com>
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 6df8c62c40f5..b9895feda5ee 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2458,8 +2458,6 @@ static ssize_t rbd_add(struct bus_type *bus,
 	INIT_LIST_HEAD(&rbd_dev->snaps);
 	init_rwsem(&rbd_dev->header_rwsem);
 
-	init_rwsem(&rbd_dev->header_rwsem);
-
 	/* generate unique id: find highest unique id, add one */
 	rbd_id_get(rbd_dev);
 

From a05932905695f8c6c06d353ecd52c8e5d607cc77 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 19 Jul 2012 09:09:27 -0500
Subject: [PATCH 097/132] rbd: simplify __rbd_remove_all_snaps()

This just replaces a while loop with list_for_each_entry_safe()
in __rbd_remove_all_snaps().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index b9895feda5ee..74e6a3329706 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1692,11 +1692,10 @@ bad:
 static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
 {
 	struct rbd_snap *snap;
+	struct rbd_snap *next;
 
-	while (!list_empty(&rbd_dev->snaps)) {
-		snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
+	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
 		__rbd_remove_snap_dev(rbd_dev, snap);
-	}
 }
 
 /*

From bd919d45aa61c19d9ed82548d6deb06bcae31153 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 13 Jul 2012 20:35:11 -0500
Subject: [PATCH 098/132] rbd: clean up a few dout() calls

There was a dout() call in rbd_do_request() that was reporting
the reporting the offset as the length and vice versa.  While
fixing that I did a quick scan of other dout() calls and fixed
a couple of other minor things.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 74e6a3329706..bb7f436b1765 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -734,9 +734,8 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
 			 * split_bio will BUG_ON if this is not the case
 			 */
 			dout("bio_chain_clone split! total=%d remaining=%d"
-			     "bi_size=%d\n",
-			     (int)total, (int)len-total,
-			     (int)old_chain->bi_size);
+			     "bi_size=%u\n",
+			     total, len - total, old_chain->bi_size);
 
 			/* split the bio. We'll release it either in the next
 			   call, or it will have to be released outside */
@@ -816,8 +815,8 @@ static void rbd_coll_end_req_index(struct request *rq,
 	struct request_queue *q;
 	int min, max, i;
 
-	dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
-	     coll, index, ret, len);
+	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
+	     coll, index, ret, (unsigned long long) len);
 
 	if (!rq)
 		return;
@@ -894,8 +893,8 @@ static int rbd_do_request(struct request *rq,
 		req_data->coll_index = coll_index;
 	}
 
-	dout("rbd_do_request object_name=%s ofs=%lld len=%lld\n",
-		object_name, len, ofs);
+	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
+		(unsigned long long) ofs, (unsigned long long) len);
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
 	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
@@ -948,8 +947,9 @@ static int rbd_do_request(struct request *rq,
 		ret = ceph_osdc_wait_request(osdc, req);
 		if (ver)
 			*ver = le64_to_cpu(req->r_reassert_version.version);
-		dout("reassert_ver=%lld\n",
-		     le64_to_cpu(req->r_reassert_version.version));
+		dout("reassert_ver=%llu\n",
+			(unsigned long long)
+				le64_to_cpu(req->r_reassert_version.version));
 		ceph_osdc_put_request(req);
 	}
 	return ret;
@@ -983,7 +983,8 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
 	bytes = le64_to_cpu(op->extent.length);
 	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
 
-	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
+	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
+		(unsigned long long) bytes, read_op, (int) rc);
 
 	if (rc == -ENOENT && read_op) {
 		zero_bio_chain(req_data->bio, 0);
@@ -1217,8 +1218,9 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 	if (!rbd_dev)
 		return;
 
-	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n",
-		rbd_dev->header_name, notify_id, (int) opcode);
+	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
+		rbd_dev->header_name, (unsigned long long) notify_id,
+		(unsigned int) opcode);
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 	rc = __rbd_refresh_header(rbd_dev);
 	hver = rbd_dev->header.obj_version;
@@ -1314,9 +1316,9 @@ static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 	if (!rbd_dev)
 		return;
 
-	dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n",
-				rbd_dev->header_name,
-		notify_id, (int)opcode);
+	dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
+			rbd_dev->header_name, (unsigned long long) notify_id,
+			(unsigned int) opcode);
 }
 
 /*
@@ -1437,7 +1439,8 @@ static void rbd_rq_fn(struct request_queue *q)
 		struct bio *bio;
 		struct bio *rq_bio, *next_bio = NULL;
 		bool do_write;
-		int size, op_size = 0;
+		unsigned int size;
+		u64 op_size = 0;
 		u64 ofs;
 		int num_segs, cur_seg = 0;
 		struct rbd_req_coll *coll;
@@ -1484,7 +1487,7 @@ static void rbd_rq_fn(struct request_queue *q)
 
 		dout("%s 0x%x bytes at 0x%llx\n",
 		     do_write ? "write" : "read",
-		     size, blk_rq_pos(rq) * SECTOR_SIZE);
+		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
 
 		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
 		coll = rbd_alloc_coll(num_segs);
@@ -1497,7 +1500,7 @@ static void rbd_rq_fn(struct request_queue *q)
 
 		do {
 			/* a bio clone to be passed down to OSD req */
-			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
+			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
 			op_size = rbd_get_segment(&rbd_dev->header,
 						  rbd_dev->header.object_prefix,
 						  ofs, size,
@@ -1664,7 +1667,7 @@ static int rbd_header_add_snap(struct rbd_device *rbd_dev,
 
 	monc = &rbd_dev->rbd_client->client->monc;
 	ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
-	dout("created snapid=%lld\n", new_snapid);
+	dout("created snapid=%llu\n", (unsigned long long) new_snapid);
 	if (ret < 0)
 		return ret;
 

From aa711ee3402ad10ffd5b70ce0417fadc9a95cccf Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 13 Jul 2012 20:35:11 -0500
Subject: [PATCH 099/132] ceph: define snap counts as u32 everywhere

There are two structures in which a count of snapshots are
maintained:

    struct ceph_snap_context {
	...
        u32 num_snaps;
	...
    }
and
    struct ceph_snap_realm {
	...
        u32 num_prior_parent_snaps;   /*  had prior to parent_since */
	...
        u32 num_snaps;
	...
    }

These fields never take on negative values (e.g., to hold special
meaning), and so are really inherently unsigned.  Furthermore they
take their value from over-the-wire or on-disk formatted 32-bit
values.

So change their definition to have type u32, and change some spots
elsewhere in the code to account for this change.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/snap.c               | 18 ++++++++++--------
 fs/ceph/super.h              |  4 ++--
 include/linux/ceph/libceph.h |  2 +-
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index e5206fc76562..cbb2f54a3019 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -296,8 +296,7 @@ static int build_snap_context(struct ceph_snap_realm *realm)
 	struct ceph_snap_realm *parent = realm->parent;
 	struct ceph_snap_context *snapc;
 	int err = 0;
-	int i;
-	int num = realm->num_prior_parent_snaps + realm->num_snaps;
+	u32 num = realm->num_prior_parent_snaps + realm->num_snaps;
 
 	/*
 	 * build parent context, if it hasn't been built.
@@ -321,11 +320,11 @@ static int build_snap_context(struct ceph_snap_realm *realm)
 	    realm->cached_context->seq == realm->seq &&
 	    (!parent ||
 	     realm->cached_context->seq >= parent->cached_context->seq)) {
-		dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
+		dout("build_snap_context %llx %p: %p seq %lld (%u snaps)"
 		     " (unchanged)\n",
 		     realm->ino, realm, realm->cached_context,
 		     realm->cached_context->seq,
-		     realm->cached_context->num_snaps);
+		     (unsigned int) realm->cached_context->num_snaps);
 		return 0;
 	}
 
@@ -342,6 +341,8 @@ static int build_snap_context(struct ceph_snap_realm *realm)
 	num = 0;
 	snapc->seq = realm->seq;
 	if (parent) {
+		u32 i;
+
 		/* include any of parent's snaps occurring _after_ my
 		   parent became my parent */
 		for (i = 0; i < parent->cached_context->num_snaps; i++)
@@ -361,8 +362,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
 
 	sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
 	snapc->num_snaps = num;
-	dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
-	     realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
+	dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n",
+	     realm->ino, realm, snapc, snapc->seq,
+	     (unsigned int) snapc->num_snaps);
 
 	if (realm->cached_context)
 		ceph_put_snap_context(realm->cached_context);
@@ -402,9 +404,9 @@ static void rebuild_snap_realms(struct ceph_snap_realm *realm)
  * helper to allocate and decode an array of snapids.  free prior
  * instance, if any.
  */
-static int dup_array(u64 **dst, __le64 *src, int num)
+static int dup_array(u64 **dst, __le64 *src, u32 num)
 {
-	int i;
+	u32 i;
 
 	kfree(*dst);
 	if (num) {
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index fc35036d258d..3ea48b7b98b3 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -612,9 +612,9 @@ struct ceph_snap_realm {
 	u64 parent_since;   /* snapid when our current parent became so */
 
 	u64 *prior_parent_snaps;      /* snaps inherited from any parents we */
-	int num_prior_parent_snaps;   /*  had prior to parent_since */
+	u32 num_prior_parent_snaps;   /*  had prior to parent_since */
 	u64 *snaps;                   /* snaps specific to this realm */
-	int num_snaps;
+	u32 num_snaps;
 
 	struct ceph_snap_realm *parent;
 	struct list_head children;       /* list of child realms */
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index ea072e1f9db9..42624789b06f 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -154,7 +154,7 @@ struct ceph_client {
 struct ceph_snap_context {
 	atomic_t nref;
 	u64 seq;
-	int num_snaps;
+	u32 num_snaps;
 	u64 snaps[];
 };
 

From 8e94af8e2b582e5915abc171a28130881d1c26e4 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 25 Jul 2012 09:32:40 -0500
Subject: [PATCH 100/132] rbd: encapsulate header validity test

If an rbd image header is read and it doesn't begin with the
expected magic information, a warning is displayed.  This is
a fairly simple test, but it could be extended at some point.
Fix the comparison so it actually looks at the "text" field
rather than the front of the structure.

In any case, encapsulate the validity test in its own function.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index bb7f436b1765..ae65fac2c42b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -481,6 +481,12 @@ static void rbd_coll_release(struct kref *kref)
 	kfree(coll);
 }
 
+static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
+{
+	return !memcmp(&ondisk->text,
+			RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT));
+}
+
 /*
  * Create a new header structure, translate header format from the on-disk
  * header.
@@ -492,7 +498,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 {
 	u32 i, snap_count;
 
-	if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
+	if (!rbd_dev_ondisk_valid(ondisk))
 		return -ENXIO;
 
 	snap_count = le32_to_cpu(ondisk->snap_count);

From de71a2970d57463d3d965025e33ec3adcf391248 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 3 Jul 2012 16:01:19 -0500
Subject: [PATCH 101/132] rbd: rename rbd_device->id

The "id" field of an rbd device structure represents the unique
client-local device id mapped to the underlying rbd image.  Each rbd
image will have another id--the image id--and each snapshot has its
own id as well.  The simple name "id" no longer conveys the
information one might like to have.

Rename the device "id" field in struct rbd_dev to be "dev_id" to
make it a little more obvious what we're dealing with without having
to think more about context.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ae65fac2c42b..fe7a9e15b6f2 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -147,7 +147,7 @@ struct rbd_snap {
  * a single device
  */
 struct rbd_device {
-	int			id;		/* blkdev unique id */
+	int			dev_id;		/* blkdev unique id */
 
 	int			major;		/* blkdev assigned major */
 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
@@ -1782,7 +1782,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 		goto out;
 
 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
-		 rbd_dev->id);
+		 rbd_dev->dev_id);
 	disk->major = rbd_dev->major;
 	disk->first_minor = 0;
 	disk->fops = &rbd_bd_ops;
@@ -2171,7 +2171,7 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
 	dev->type = &rbd_device_type;
 	dev->parent = &rbd_root_dev;
 	dev->release = rbd_dev_release;
-	dev_set_name(dev, "%d", rbd_dev->id);
+	dev_set_name(dev, "%d", rbd_dev->dev_id);
 	ret = device_register(dev);
 	if (ret < 0)
 		goto out;
@@ -2219,7 +2219,7 @@ static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
  */
 static void rbd_id_get(struct rbd_device *rbd_dev)
 {
-	rbd_dev->id = atomic64_inc_return(&rbd_id_max);
+	rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max);
 
 	spin_lock(&rbd_dev_list_lock);
 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
@@ -2233,7 +2233,7 @@ static void rbd_id_get(struct rbd_device *rbd_dev)
 static void rbd_id_put(struct rbd_device *rbd_dev)
 {
 	struct list_head *tmp;
-	int rbd_id = rbd_dev->id;
+	int rbd_id = rbd_dev->dev_id;
 	int max_id;
 
 	BUG_ON(rbd_id < 1);
@@ -2472,7 +2472,7 @@ static ssize_t rbd_add(struct bus_type *bus,
 	/* Fill in the device name, now that we have its id. */
 	BUILD_BUG_ON(DEV_NAME_LEN
 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
-	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id);
+	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
 
 	/* parse add command */
 	rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
@@ -2549,7 +2549,7 @@ err_nomem:
 	return (ssize_t) rc;
 }
 
-static struct rbd_device *__rbd_get_dev(unsigned long id)
+static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
 {
 	struct list_head *tmp;
 	struct rbd_device *rbd_dev;
@@ -2557,7 +2557,7 @@ static struct rbd_device *__rbd_get_dev(unsigned long id)
 	spin_lock(&rbd_dev_list_lock);
 	list_for_each(tmp, &rbd_dev_list) {
 		rbd_dev = list_entry(tmp, struct rbd_device, node);
-		if (rbd_dev->id == id) {
+		if (rbd_dev->dev_id == dev_id) {
 			spin_unlock(&rbd_dev_list_lock);
 			return rbd_dev;
 		}

From 9a5d690b08478fc2358d885703014853e44a357e Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 19 Jul 2012 09:09:27 -0500
Subject: [PATCH 102/132] rbd: snapc is unused in rbd_req_sync_read()

The "snapc" parameter to in rbd_req_sync_read() is not used, so
get rid of it.

Reported-by: Josh Durgin <josh.durgin@inktank.com>
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index fe7a9e15b6f2..eed58e99e18d 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1169,7 +1169,6 @@ static int rbd_req_read(struct request *rq,
  * Request sync osd read
  */
 static int rbd_req_sync_read(struct rbd_device *rbd_dev,
-			  struct ceph_snap_context *snapc,
 			  u64 snapid,
 			  const char *object_name,
 			  u64 ofs, u64 len,
@@ -1619,7 +1618,7 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
 			return -ENOMEM;
 
 		rc = rbd_req_sync_read(rbd_dev,
-				       NULL, CEPH_NOSNAP,
+				       CEPH_NOSNAP,
 				       rbd_dev->header_name,
 				       0, len,
 				       (char *)dh, &ver);

From ed63f4fd9a88218ee709e8f57c36c0c5f219a7ad Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 19 Jul 2012 09:09:27 -0500
Subject: [PATCH 103/132] rbd: drop rbd_header_from_disk() gfp_flags parameter

The function rbd_header_from_disk() is only called in one spot, and
it passes GFP_KERNEL as its value for the gfp_flags parameter.

Just drop that parameter and substitute GFP_KERNEL everywhere within
that function it had been used.  (If we find we need the parameter
again in the future it's easy enough to add back again.)

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index eed58e99e18d..34ca5c686b46 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -493,8 +493,7 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
  */
 static int rbd_header_from_disk(struct rbd_image_header *header,
 				 struct rbd_image_header_ondisk *ondisk,
-				 u32 allocated_snaps,
-				 gfp_t gfp_flags)
+				 u32 allocated_snaps)
 {
 	u32 i, snap_count;
 
@@ -507,18 +506,18 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 		return -EINVAL;
 	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
 				snap_count * sizeof(u64),
-				gfp_flags);
+				GFP_KERNEL);
 	if (!header->snapc)
 		return -ENOMEM;
 
 	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
 	if (snap_count) {
 		header->snap_names = kmalloc(header->snap_names_len,
-					     gfp_flags);
+					     GFP_KERNEL);
 		if (!header->snap_names)
 			goto err_snapc;
 		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
-					     gfp_flags);
+					     GFP_KERNEL);
 		if (!header->snap_sizes)
 			goto err_names;
 	} else {
@@ -527,7 +526,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 	}
 
 	header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
-					gfp_flags);
+					GFP_KERNEL);
 	if (!header->object_prefix)
 		goto err_sizes;
 
@@ -1625,7 +1624,7 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
 		if (rc < 0)
 			goto out_dh;
 
-		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
+		rc = rbd_header_from_disk(header, dh, snap_count);
 		if (rc < 0) {
 			if (rc == -ENXIO)
 				pr_warning("unrecognized header format"

From 14e7085d8460bf45e7145524a13802f1f4f9d81f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 19 Jul 2012 09:09:27 -0500
Subject: [PATCH 104/132] rbd: drop rbd_dev parameter in snap functions

Both rbd_register_snap_dev() and __rbd_remove_snap_dev() have
rbd_dev parameters that are unused.  Remove them.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 34ca5c686b46..9e960c3ce1c4 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -202,8 +202,7 @@ static ssize_t rbd_snap_add(struct device *dev,
 			    struct device_attribute *attr,
 			    const char *buf,
 			    size_t count);
-static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
-				  struct rbd_snap *snap);
+static void __rbd_remove_snap_dev(struct rbd_snap *snap);
 
 static ssize_t rbd_add(struct bus_type *bus, const char *buf,
 		       size_t count);
@@ -1702,7 +1701,7 @@ static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
 	struct rbd_snap *next;
 
 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
-		__rbd_remove_snap_dev(rbd_dev, snap);
+		__rbd_remove_snap_dev(snap);
 }
 
 /*
@@ -2010,15 +2009,13 @@ static struct device_type rbd_snap_device_type = {
 	.release	= rbd_snap_dev_release,
 };
 
-static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
-				  struct rbd_snap *snap)
+static void __rbd_remove_snap_dev(struct rbd_snap *snap)
 {
 	list_del(&snap->node);
 	device_unregister(&snap->dev);
 }
 
-static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
-				  struct rbd_snap *snap,
+static int rbd_register_snap_dev(struct rbd_snap *snap,
 				  struct device *parent)
 {
 	struct device *dev = &snap->dev;
@@ -2045,8 +2042,7 @@ static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
 	snap->size = rbd_dev->header.snap_sizes[i];
 	snap->id = rbd_dev->header.snapc->snaps[i];
 	if (device_is_registered(&rbd_dev->dev)) {
-		ret = rbd_register_snap_dev(rbd_dev, snap,
-					     &rbd_dev->dev);
+		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
 		if (ret < 0)
 			goto err;
 	}
@@ -2111,7 +2107,7 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
 			 */
 			if (rbd_dev->snap_id == old_snap->id)
 				rbd_dev->snap_exists = false;
-			__rbd_remove_snap_dev(rbd_dev, old_snap);
+			__rbd_remove_snap_dev(old_snap);
 			continue;
 		}
 		if (old_snap->id == cur_id) {
@@ -2175,8 +2171,7 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
 		goto out;
 
 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
-		ret = rbd_register_snap_dev(rbd_dev, snap,
-					     &rbd_dev->dev);
+		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
 		if (ret < 0)
 			break;
 	}

From 0e6f322d550a104b2065288c9f6426d5c0414b76 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 25 Jul 2012 09:32:40 -0500
Subject: [PATCH 105/132] rbd: drop "object_name" from rbd_req_sync_watch()

rbd_req_sync_watch() is only called in one place, and in that place
it passes rbd_dev->header_name as the value of the "object_name"
parameter.  This value is available within the function already.

Having the extra parameter leaves the impression the object name
could take on different values, but it does not.

So get rid of the parameter.  We can always add it back again if
we find we want to watch some other object in the future.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 9e960c3ce1c4..b8e557fbf73b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1238,9 +1238,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 /*
  * Request sync osd watch
  */
-static int rbd_req_sync_watch(struct rbd_device *rbd_dev,
-			      const char *object_name,
-			      u64 ver)
+static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
 {
 	struct ceph_osd_req_op *ops;
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
@@ -1254,7 +1252,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev,
 	if (ret < 0)
 		goto fail;
 
-	ops[0].watch.ver = cpu_to_le64(ver);
+	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
 	ops[0].watch.flag = 1;
 
@@ -1263,7 +1261,8 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev,
 			      0,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      ops,
-			      object_name, 0, 0, NULL,
+			      rbd_dev->header_name,
+			      0, 0, NULL,
 			      &rbd_dev->watch_request, NULL);
 
 	if (ret < 0)
@@ -2190,8 +2189,7 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
 	int ret, rc;
 
 	do {
-		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->header_name,
-					 rbd_dev->header.obj_version);
+		ret = rbd_req_sync_watch(rbd_dev);
 		if (ret == -ERANGE) {
 			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 			rc = __rbd_refresh_header(rbd_dev);

From 4cb162508afade6d24d58e30be2bbaed80cf84d5 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 25 Jul 2012 09:32:40 -0500
Subject: [PATCH 106/132] rbd: drop "object_name" from rbd_req_sync_notify()

rbd_req_sync_notify() only ever uses rbd_dev->header_name as the
value of its "object_name" parameter, and that value is available
within the function already.  So get rid of the parameter.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index b8e557fbf73b..d187d087a505 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1326,8 +1326,7 @@ static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 /*
  * Request sync osd notify
  */
-static int rbd_req_sync_notify(struct rbd_device *rbd_dev,
-		          const char *object_name)
+static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
 {
 	struct ceph_osd_req_op *ops;
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
@@ -1358,7 +1357,8 @@ static int rbd_req_sync_notify(struct rbd_device *rbd_dev,
 			       0,
 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			       ops,
-			       object_name, 0, 0, NULL, NULL, NULL);
+			       rbd_dev->header_name,
+			       0, 0, NULL, NULL, NULL);
 	if (ret < 0)
 		goto fail_event;
 
@@ -2651,7 +2651,7 @@ static ssize_t rbd_snap_add(struct device *dev,
 	mutex_unlock(&ctl_mutex);
 
 	/* make a best effort, don't error if failed */
-	rbd_req_sync_notify(rbd_dev, rbd_dev->header_name);
+	rbd_req_sync_notify(rbd_dev);
 
 	ret = count;
 	kfree(name);

From 7f0a24d8552db422640e810414c43579bb3d9fb9 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 25 Jul 2012 09:32:40 -0500
Subject: [PATCH 107/132] rbd: drop "object_name" from
 rbd_req_sync_notify_ack()

rbd_req_sync_notify_ack() only ever uses rbd_dev->header_name as the
value of its "object_name" parameter, and that value is available
within the function already.  So get rid of the parameter.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d187d087a505..7884cb0f7ce1 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1186,8 +1186,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
  */
 static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 				   u64 ver,
-				   u64 notify_id,
-				   const char *object_name)
+				   u64 notify_id)
 {
 	struct ceph_osd_req_op *ops;
 	int ret;
@@ -1201,7 +1200,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 	ops[0].watch.flag = 0;
 
 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
-			  object_name, 0, 0, NULL,
+			  rbd_dev->header_name, 0, 0, NULL,
 			  NULL, 0,
 			  CEPH_OSD_FLAG_READ,
 			  ops,
@@ -1232,7 +1231,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
 			   " update snaps: %d\n", rbd_dev->major, rc);
 
-	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id, rbd_dev->header_name);
+	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
 }
 
 /*

From 070c633f60c23a89c226eb696f4a17b08a164b10 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 25 Jul 2012 09:32:41 -0500
Subject: [PATCH 108/132] rbd: drop "object_name" from rbd_req_sync_unwatch()

rbd_req_sync_unwatch() only ever uses rbd_dev->header_name as the
value of its "object_name" parameter, and that value is available
within the function already.  So get rid of the parameter.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 7884cb0f7ce1..30eb01e300a4 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1281,8 +1281,7 @@ fail:
 /*
  * Request sync osd unwatch
  */
-static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev,
-				const char *object_name)
+static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
 {
 	struct ceph_osd_req_op *ops;
 
@@ -1299,7 +1298,9 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev,
 			      0,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      ops,
-			      object_name, 0, 0, NULL, NULL, NULL);
+			      rbd_dev->header_name,
+			      0, 0, NULL, NULL, NULL);
+
 
 	rbd_destroy_ops(ops);
 	ceph_osdc_cancel_event(rbd_dev->watch_event);
@@ -2567,7 +2568,7 @@ static void rbd_dev_release(struct device *dev)
 						    rbd_dev->watch_request);
 	}
 	if (rbd_dev->watch_event)
-		rbd_req_sync_unwatch(rbd_dev, rbd_dev->header_name);
+		rbd_req_sync_unwatch(rbd_dev);
 
 	rbd_put_client(rbd_dev);
 

From 3b5ede07b55b52c3be27749d183d87257d032065 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Fri, 20 Jul 2012 15:22:53 -0700
Subject: [PATCH 109/132] libceph: fix fault locking; close socket on lossy
 fault

If we fault on a lossy connection, we should still close the socket
immediately, and do so under the con mutex.

We should also take the con mutex before printing out the state bits in
the debug output.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 07204f19e856..9aaf539942ac 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2330,22 +2330,23 @@ fault:
  */
 static void ceph_fault(struct ceph_connection *con)
 {
+	mutex_lock(&con->mutex);
+
 	pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
 	       ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
 	dout("fault %p state %lu to peer %s\n",
 	     con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
 
-	if (test_bit(LOSSYTX, &con->flags)) {
-		dout("fault on LOSSYTX channel\n");
-		goto out;
-	}
-
-	mutex_lock(&con->mutex);
 	if (test_bit(CLOSED, &con->state))
 		goto out_unlock;
 
 	con_close_socket(con);
 
+	if (test_bit(LOSSYTX, &con->flags)) {
+		dout("fault on LOSSYTX channel\n");
+		goto out_unlock;
+	}
+
 	if (con->in_msg) {
 		BUG_ON(con->in_msg->con != con);
 		con->in_msg->con = NULL;
@@ -2392,7 +2393,6 @@ static void ceph_fault(struct ceph_connection *con)
 
 out_unlock:
 	mutex_unlock(&con->mutex);
-out:
 	/*
 	 * in case we faulted due to authentication, invalidate our
 	 * current tickets so that we can get new ones.

From 00650931e52e97fe64096bec167f5a6780dfd94a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Fri, 20 Jul 2012 15:33:04 -0700
Subject: [PATCH 110/132] libceph: move msgr clear_standby under con mutex
 protection

Avoid dropping and retaking con->mutex in the ceph_con_send() case by
leaving locking up to the caller.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 9aaf539942ac..1a3cb4a4f180 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2441,12 +2441,10 @@ static void clear_standby(struct ceph_connection *con)
 {
 	/* come back from STANDBY? */
 	if (test_and_clear_bit(STANDBY, &con->state)) {
-		mutex_lock(&con->mutex);
 		dout("clear_standby %p and ++connect_seq\n", con);
 		con->connect_seq++;
 		WARN_ON(test_bit(WRITE_PENDING, &con->flags));
 		WARN_ON(test_bit(KEEPALIVE_PENDING, &con->flags));
-		mutex_unlock(&con->mutex);
 	}
 }
 
@@ -2483,11 +2481,12 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 	     le32_to_cpu(msg->hdr.front_len),
 	     le32_to_cpu(msg->hdr.middle_len),
 	     le32_to_cpu(msg->hdr.data_len));
+
+	clear_standby(con);
 	mutex_unlock(&con->mutex);
 
 	/* if there wasn't anything waiting to send before, queue
 	 * new work */
-	clear_standby(con);
 	if (test_and_set_bit(WRITE_PENDING, &con->flags) == 0)
 		queue_con(con);
 }
@@ -2574,7 +2573,9 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg)
 void ceph_con_keepalive(struct ceph_connection *con)
 {
 	dout("con_keepalive %p\n", con);
+	mutex_lock(&con->mutex);
 	clear_standby(con);
+	mutex_unlock(&con->mutex);
 	if (test_and_set_bit(KEEPALIVE_PENDING, &con->flags) == 0 &&
 	    test_and_set_bit(WRITE_PENDING, &con->flags) == 0)
 		queue_con(con);

From a59b55a602b6c741052d79c1e3643f8440cddd27 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Fri, 20 Jul 2012 15:34:04 -0700
Subject: [PATCH 111/132] libceph: move ceph_con_send() closed check under the
 con mutex

Take the con mutex before checking whether the connection is closed to
avoid racing with someone else closing it.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 1a3cb4a4f180..20e60a80bc29 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2453,22 +2453,20 @@ static void clear_standby(struct ceph_connection *con)
  */
 void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 {
+	/* set src+dst */
+	msg->hdr.src = con->msgr->inst.name;
+	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
+	msg->needs_out_seq = true;
+
+	mutex_lock(&con->mutex);
+
 	if (test_bit(CLOSED, &con->state)) {
 		dout("con_send %p closed, dropping %p\n", con, msg);
 		ceph_msg_put(msg);
+		mutex_unlock(&con->mutex);
 		return;
 	}
 
-	/* set src+dst */
-	msg->hdr.src = con->msgr->inst.name;
-
-	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
-
-	msg->needs_out_seq = true;
-
-	/* queue */
-	mutex_lock(&con->mutex);
-
 	BUG_ON(msg->con != NULL);
 	msg->con = con->ops->get(con);
 	BUG_ON(msg->con == NULL);

From 2e8cb10063820af7ed7638e3fd9013eee21266e7 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Fri, 20 Jul 2012 15:40:04 -0700
Subject: [PATCH 112/132] libceph: drop gratuitous socket close calls in
 con_work

If the state is CLOSED or OPENING, we shouldn't have a socket.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 20e60a80bc29..32ab7cd089a3 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2284,15 +2284,15 @@ restart:
 		dout("con_work %p STANDBY\n", con);
 		goto done;
 	}
-	if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
-		dout("con_work CLOSED\n");
-		con_close_socket(con);
+	if (test_bit(CLOSED, &con->state)) {
+		dout("con_work %p CLOSED\n", con);
+		BUG_ON(con->sock);
 		goto done;
 	}
 	if (test_and_clear_bit(OPENING, &con->state)) {
 		/* reopen w/ new peer */
 		dout("con_work OPENING\n");
-		con_close_socket(con);
+		BUG_ON(con->sock);
 	}
 
 	ret = try_read(con);

From ee76e0736db8455e3b11827d6899bd2a4e1d0584 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Fri, 20 Jul 2012 16:45:49 -0700
Subject: [PATCH 113/132] libceph: close socket directly from ceph_con_close()

It is simpler to do this immediately, since we already hold the con mutex.
It also avoids the need to deal with a not-quite-CLOSED socket in con_work.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 32ab7cd089a3..46ce113732e6 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -519,14 +519,8 @@ void ceph_con_close(struct ceph_connection *con)
 	reset_connection(con);
 	con->peer_global_seq = 0;
 	cancel_delayed_work(&con->work);
+	con_close_socket(con);
 	mutex_unlock(&con->mutex);
-
-	/*
-	 * We cannot close the socket directly from here because the
-	 * work threads use it without holding the mutex.  Instead, let
-	 * con_work() do it.
-	 */
-	queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_close);
 

From d7353dd5aaf22ed611fbcd0d4a4a12fb30659290 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Fri, 20 Jul 2012 17:19:43 -0700
Subject: [PATCH 114/132] libceph: drop unnecessary CLOSED check in socket
 state change callback

If we are CLOSED, the socket is closed and we won't get these.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 46ce113732e6..e7320cd5fdbc 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -296,9 +296,6 @@ static void ceph_sock_state_change(struct sock *sk)
 	dout("%s %p state = %lu sk_state = %u\n", __func__,
 	     con, con->state, sk->sk_state);
 
-	if (test_bit(CLOSED, &con->state))
-		return;
-
 	switch (sk->sk_state) {
 	case TCP_CLOSE:
 		dout("%s TCP_CLOSE\n", __func__);

From 8dacc7da69a491c515851e68de6036f21b5663ce Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Fri, 20 Jul 2012 17:24:40 -0700
Subject: [PATCH 115/132] libceph: replace connection state bits with states

Use a simple set of 6 enumerated values for the socket states (CON_STATE_*)
and use those instead of the state bits.  All of the con->state checks are
now under the protection of the con mutex, so this is safe.  It also
simplifies many of the state checks because we can check for anything other
than the expected state instead of various bits for races we can think of.

This appears to hold up well to stress testing both with and without socket
failure injection on the server side.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/messenger.h |  12 ---
 net/ceph/messenger.c           | 134 +++++++++++++++++----------------
 2 files changed, 70 insertions(+), 76 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index a310d7fe6e29..d9c2b8f5abde 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -116,18 +116,6 @@ struct ceph_msg_pos {
 #define SOCK_CLOSED	11 /* socket state changed to closed */
 #define BACKOFF         15
 
-/*
- * ceph_connection states
- */
-#define CONNECTING	1
-#define NEGOTIATING	2
-#define CONNECTED	5
-#define STANDBY		8  /* no outgoing messages, socket closed.  we keep
-			    * the ceph_connection around to maintain shared
-			    * state with the peer. */
-#define CLOSED		10 /* we've closed the connection */
-#define OPENING         13 /* open connection w/ (possibly new) peer */
-
 /*
  * A single connection with another host.
  *
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index e7320cd5fdbc..563e46aa4d6d 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -77,6 +77,17 @@
 #define CON_SOCK_STATE_CONNECTED	3	/* -> CLOSING or -> CLOSED */
 #define CON_SOCK_STATE_CLOSING		4	/* -> CLOSED */
 
+/*
+ * connection states
+ */
+#define CON_STATE_CLOSED        1  /* -> PREOPEN */
+#define CON_STATE_PREOPEN       2  /* -> CONNECTING, CLOSED */
+#define CON_STATE_CONNECTING    3  /* -> NEGOTIATING, CLOSED */
+#define CON_STATE_NEGOTIATING   4  /* -> OPEN, CLOSED */
+#define CON_STATE_OPEN          5  /* -> STANDBY, CLOSED */
+#define CON_STATE_STANDBY       6  /* -> PREOPEN, CLOSED */
+
+
 /* static tag bytes (protocol control messages) */
 static char tag_msg = CEPH_MSGR_TAG_MSG;
 static char tag_ack = CEPH_MSGR_TAG_ACK;
@@ -503,11 +514,7 @@ void ceph_con_close(struct ceph_connection *con)
 	mutex_lock(&con->mutex);
 	dout("con_close %p peer %s\n", con,
 	     ceph_pr_addr(&con->peer_addr.in_addr));
-	clear_bit(NEGOTIATING, &con->state);
-	clear_bit(CONNECTING, &con->state);
-	clear_bit(CONNECTED, &con->state);
-	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
-	set_bit(CLOSED, &con->state);
+	con->state = CON_STATE_CLOSED;
 
 	clear_bit(LOSSYTX, &con->flags);  /* so we retry next connect */
 	clear_bit(KEEPALIVE_PENDING, &con->flags);
@@ -530,8 +537,9 @@ void ceph_con_open(struct ceph_connection *con,
 {
 	mutex_lock(&con->mutex);
 	dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
-	set_bit(OPENING, &con->state);
-	WARN_ON(!test_and_clear_bit(CLOSED, &con->state));
+
+	BUG_ON(con->state != CON_STATE_CLOSED);
+	con->state = CON_STATE_PREOPEN;
 
 	con->peer_name.type = (__u8) entity_type;
 	con->peer_name.num = cpu_to_le64(entity_num);
@@ -571,7 +579,7 @@ void ceph_con_init(struct ceph_connection *con, void *private,
 	INIT_LIST_HEAD(&con->out_sent);
 	INIT_DELAYED_WORK(&con->work, con_work);
 
-	set_bit(CLOSED, &con->state);
+	con->state = CON_STATE_CLOSED;
 }
 EXPORT_SYMBOL(ceph_con_init);
 
@@ -809,27 +817,21 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection
 	if (!con->ops->get_authorizer) {
 		con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
 		con->out_connect.authorizer_len = 0;
-
 		return NULL;
 	}
 
 	/* Can't hold the mutex while getting authorizer */
-
 	mutex_unlock(&con->mutex);
-
 	auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry);
-
 	mutex_lock(&con->mutex);
 
 	if (IS_ERR(auth))
 		return auth;
-	if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->flags))
+	if (con->state != CON_STATE_NEGOTIATING)
 		return ERR_PTR(-EAGAIN);
 
 	con->auth_reply_buf = auth->authorizer_reply_buf;
 	con->auth_reply_buf_len = auth->authorizer_reply_buf_len;
-
-
 	return auth;
 }
 
@@ -1484,7 +1486,8 @@ static int process_banner(struct ceph_connection *con)
 static void fail_protocol(struct ceph_connection *con)
 {
 	reset_connection(con);
-	set_bit(CLOSED, &con->state);  /* in case there's queued work */
+	BUG_ON(con->state != CON_STATE_NEGOTIATING);
+	con->state = CON_STATE_CLOSED;
 }
 
 static int process_connect(struct ceph_connection *con)
@@ -1558,8 +1561,7 @@ static int process_connect(struct ceph_connection *con)
 		if (con->ops->peer_reset)
 			con->ops->peer_reset(con);
 		mutex_lock(&con->mutex);
-		if (test_bit(CLOSED, &con->state) ||
-		    test_bit(OPENING, &con->state))
+		if (con->state != CON_STATE_NEGOTIATING)
 			return -EAGAIN;
 		break;
 
@@ -1605,8 +1607,10 @@ static int process_connect(struct ceph_connection *con)
 			fail_protocol(con);
 			return -1;
 		}
-		clear_bit(NEGOTIATING, &con->state);
-		set_bit(CONNECTED, &con->state);
+
+		BUG_ON(con->state != CON_STATE_NEGOTIATING);
+		con->state = CON_STATE_OPEN;
+
 		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
 		con->connect_seq++;
 		con->peer_features = server_feat;
@@ -1994,8 +1998,9 @@ more:
 	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
 
 	/* open the socket first? */
-	if (con->sock == NULL) {
-		set_bit(CONNECTING, &con->state);
+	if (con->state == CON_STATE_PREOPEN) {
+		BUG_ON(con->sock);
+		con->state = CON_STATE_CONNECTING;
 
 		con_out_kvec_reset(con);
 		prepare_write_banner(con);
@@ -2046,8 +2051,7 @@ more_kvec:
 	}
 
 do_next:
-	if (!test_bit(CONNECTING, &con->state) &&
-			!test_bit(NEGOTIATING, &con->state)) {
+	if (con->state == CON_STATE_OPEN) {
 		/* is anything else pending? */
 		if (!list_empty(&con->out_queue)) {
 			prepare_write_message(con);
@@ -2081,29 +2085,19 @@ static int try_read(struct ceph_connection *con)
 {
 	int ret = -1;
 
-	if (!con->sock)
-		return 0;
-
-	if (test_bit(STANDBY, &con->state))
-		return 0;
-
-	dout("try_read start on %p\n", con);
-
 more:
+	dout("try_read start on %p state %lu\n", con, con->state);
+	if (con->state != CON_STATE_CONNECTING &&
+	    con->state != CON_STATE_NEGOTIATING &&
+	    con->state != CON_STATE_OPEN)
+		return 0;
+
+	BUG_ON(!con->sock);
+
 	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
 	     con->in_base_pos);
 
-	/*
-	 * process_connect and process_message drop and re-take
-	 * con->mutex.  make sure we handle a racing close or reopen.
-	 */
-	if (test_bit(CLOSED, &con->state) ||
-	    test_bit(OPENING, &con->state)) {
-		ret = -EAGAIN;
-		goto out;
-	}
-
-	if (test_bit(CONNECTING, &con->state)) {
+	if (con->state == CON_STATE_CONNECTING) {
 		dout("try_read connecting\n");
 		ret = read_partial_banner(con);
 		if (ret <= 0)
@@ -2112,8 +2106,8 @@ more:
 		if (ret < 0)
 			goto out;
 
-		clear_bit(CONNECTING, &con->state);
-		set_bit(NEGOTIATING, &con->state);
+		BUG_ON(con->state != CON_STATE_CONNECTING);
+		con->state = CON_STATE_NEGOTIATING;
 
 		/* Banner is good, exchange connection info */
 		ret = prepare_write_connect(con);
@@ -2125,7 +2119,7 @@ more:
 		goto out;
 	}
 
-	if (test_bit(NEGOTIATING, &con->state)) {
+	if (con->state == CON_STATE_NEGOTIATING) {
 		dout("try_read negotiating\n");
 		ret = read_partial_connect(con);
 		if (ret <= 0)
@@ -2136,6 +2130,8 @@ more:
 		goto more;
 	}
 
+	BUG_ON(con->state != CON_STATE_OPEN);
+
 	if (con->in_base_pos < 0) {
 		/*
 		 * skipping + discarding content.
@@ -2169,8 +2165,8 @@ more:
 			prepare_read_ack(con);
 			break;
 		case CEPH_MSGR_TAG_CLOSE:
-			clear_bit(CONNECTED, &con->state);
-			set_bit(CLOSED, &con->state);   /* fixme */
+			con_close_socket(con);
+			con->state = CON_STATE_CLOSED;
 			goto out;
 		default:
 			goto bad_tag;
@@ -2246,14 +2242,21 @@ static void con_work(struct work_struct *work)
 	mutex_lock(&con->mutex);
 restart:
 	if (test_and_clear_bit(SOCK_CLOSED, &con->flags)) {
-		if (test_and_clear_bit(CONNECTED, &con->state))
-			con->error_msg = "socket closed";
-		else if (test_and_clear_bit(NEGOTIATING, &con->state))
-			con->error_msg = "negotiation failed";
-		else if (test_and_clear_bit(CONNECTING, &con->state))
+		switch (con->state) {
+		case CON_STATE_CONNECTING:
 			con->error_msg = "connection failed";
-		else
+			break;
+		case CON_STATE_NEGOTIATING:
+			con->error_msg = "negotiation failed";
+			break;
+		case CON_STATE_OPEN:
+			con->error_msg = "socket closed";
+			break;
+		default:
+			dout("unrecognized con state %d\n", (int)con->state);
 			con->error_msg = "unrecognized con state";
+			BUG();
+		}
 		goto fault;
 	}
 
@@ -2271,17 +2274,16 @@ restart:
 		}
 	}
 
-	if (test_bit(STANDBY, &con->state)) {
+	if (con->state == CON_STATE_STANDBY) {
 		dout("con_work %p STANDBY\n", con);
 		goto done;
 	}
-	if (test_bit(CLOSED, &con->state)) {
+	if (con->state == CON_STATE_CLOSED) {
 		dout("con_work %p CLOSED\n", con);
 		BUG_ON(con->sock);
 		goto done;
 	}
-	if (test_and_clear_bit(OPENING, &con->state)) {
-		/* reopen w/ new peer */
+	if (con->state == CON_STATE_PREOPEN) {
 		dout("con_work OPENING\n");
 		BUG_ON(con->sock);
 	}
@@ -2328,13 +2330,15 @@ static void ceph_fault(struct ceph_connection *con)
 	dout("fault %p state %lu to peer %s\n",
 	     con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
 
-	if (test_bit(CLOSED, &con->state))
-		goto out_unlock;
+	BUG_ON(con->state != CON_STATE_CONNECTING &&
+	       con->state != CON_STATE_NEGOTIATING &&
+	       con->state != CON_STATE_OPEN);
 
 	con_close_socket(con);
 
 	if (test_bit(LOSSYTX, &con->flags)) {
-		dout("fault on LOSSYTX channel\n");
+		dout("fault on LOSSYTX channel, marking CLOSED\n");
+		con->state = CON_STATE_CLOSED;
 		goto out_unlock;
 	}
 
@@ -2355,9 +2359,10 @@ static void ceph_fault(struct ceph_connection *con)
 	    !test_bit(KEEPALIVE_PENDING, &con->flags)) {
 		dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
 		clear_bit(WRITE_PENDING, &con->flags);
-		set_bit(STANDBY, &con->state);
+		con->state = CON_STATE_STANDBY;
 	} else {
 		/* retry after a delay. */
+		con->state = CON_STATE_PREOPEN;
 		if (con->delay == 0)
 			con->delay = BASE_DELAY_INTERVAL;
 		else if (con->delay < MAX_DELAY_INTERVAL)
@@ -2431,8 +2436,9 @@ EXPORT_SYMBOL(ceph_messenger_init);
 static void clear_standby(struct ceph_connection *con)
 {
 	/* come back from STANDBY? */
-	if (test_and_clear_bit(STANDBY, &con->state)) {
+	if (con->state == CON_STATE_STANDBY) {
 		dout("clear_standby %p and ++connect_seq\n", con);
+		con->state = CON_STATE_PREOPEN;
 		con->connect_seq++;
 		WARN_ON(test_bit(WRITE_PENDING, &con->flags));
 		WARN_ON(test_bit(KEEPALIVE_PENDING, &con->flags));
@@ -2451,7 +2457,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 
 	mutex_lock(&con->mutex);
 
-	if (test_bit(CLOSED, &con->state)) {
+	if (con->state == CON_STATE_CLOSED) {
 		dout("con_send %p closed, dropping %p\n", con, msg);
 		ceph_msg_put(msg);
 		mutex_unlock(&con->mutex);

From 4a8616920860920abaa51193146fe36b38ef09aa Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Fri, 20 Jul 2012 17:29:55 -0700
Subject: [PATCH 116/132] libceph: clean up con flags

Rename flags with CON_FLAG prefix, move the definitions into the c file,
and (better) document their meaning.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/messenger.h | 10 ------
 net/ceph/messenger.c           | 62 ++++++++++++++++++++--------------
 2 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index d9c2b8f5abde..189ae0637634 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -106,16 +106,6 @@ struct ceph_msg_pos {
 #define BASE_DELAY_INTERVAL	(HZ/2)
 #define MAX_DELAY_INTERVAL	(5 * 60 * HZ)
 
-/*
- * ceph_connection flag bits
- */
-
-#define LOSSYTX         0  /* we can close channel or drop messages on errors */
-#define KEEPALIVE_PENDING      3
-#define WRITE_PENDING	4  /* we have data ready to send */
-#define SOCK_CLOSED	11 /* socket state changed to closed */
-#define BACKOFF         15
-
 /*
  * A single connection with another host.
  *
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 563e46aa4d6d..b872db5c4989 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -87,6 +87,15 @@
 #define CON_STATE_OPEN          5  /* -> STANDBY, CLOSED */
 #define CON_STATE_STANDBY       6  /* -> PREOPEN, CLOSED */
 
+/*
+ * ceph_connection flag bits
+ */
+#define CON_FLAG_LOSSYTX           0  /* we can close channel or drop
+				       * messages on errors */
+#define CON_FLAG_KEEPALIVE_PENDING 1  /* we need to send a keepalive */
+#define CON_FLAG_WRITE_PENDING	   2  /* we have data ready to send */
+#define CON_FLAG_SOCK_CLOSED	   3  /* socket state changed to closed */
+#define CON_FLAG_BACKOFF           4  /* need to retry queuing delayed work */
 
 /* static tag bytes (protocol control messages) */
 static char tag_msg = CEPH_MSGR_TAG_MSG;
@@ -288,7 +297,7 @@ static void ceph_sock_write_space(struct sock *sk)
 	 * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
 	 * and net/core/stream.c:sk_stream_write_space().
 	 */
-	if (test_bit(WRITE_PENDING, &con->flags)) {
+	if (test_bit(CON_FLAG_WRITE_PENDING, &con->flags)) {
 		if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
 			dout("%s %p queueing write work\n", __func__, con);
 			clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
@@ -313,7 +322,7 @@ static void ceph_sock_state_change(struct sock *sk)
 	case TCP_CLOSE_WAIT:
 		dout("%s TCP_CLOSE_WAIT\n", __func__);
 		con_sock_state_closing(con);
-		set_bit(SOCK_CLOSED, &con->flags);
+		set_bit(CON_FLAG_SOCK_CLOSED, &con->flags);
 		queue_con(con);
 		break;
 	case TCP_ESTABLISHED:
@@ -449,12 +458,12 @@ static int con_close_socket(struct ceph_connection *con)
 	con->sock = NULL;
 
 	/*
-	 * Forcibly clear the SOCK_CLOSE flag.  It gets set
+	 * Forcibly clear the SOCK_CLOSED flag.  It gets set
 	 * independent of the connection mutex, and we could have
 	 * received a socket close event before we had the chance to
 	 * shut the socket down.
 	 */
-	clear_bit(SOCK_CLOSED, &con->flags);
+	clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags);
 	con_sock_state_closed(con);
 	return rc;
 }
@@ -516,9 +525,9 @@ void ceph_con_close(struct ceph_connection *con)
 	     ceph_pr_addr(&con->peer_addr.in_addr));
 	con->state = CON_STATE_CLOSED;
 
-	clear_bit(LOSSYTX, &con->flags);  /* so we retry next connect */
-	clear_bit(KEEPALIVE_PENDING, &con->flags);
-	clear_bit(WRITE_PENDING, &con->flags);
+	clear_bit(CON_FLAG_LOSSYTX, &con->flags); /* so we retry next connect */
+	clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags);
+	clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
 
 	reset_connection(con);
 	con->peer_global_seq = 0;
@@ -770,7 +779,7 @@ static void prepare_write_message(struct ceph_connection *con)
 		/* no, queue up footer too and be done */
 		prepare_write_message_footer(con);
 
-	set_bit(WRITE_PENDING, &con->flags);
+	set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
 }
 
 /*
@@ -791,7 +800,7 @@ static void prepare_write_ack(struct ceph_connection *con)
 				&con->out_temp_ack);
 
 	con->out_more = 1;  /* more will follow.. eventually.. */
-	set_bit(WRITE_PENDING, &con->flags);
+	set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
 }
 
 /*
@@ -802,7 +811,7 @@ static void prepare_write_keepalive(struct ceph_connection *con)
 	dout("prepare_write_keepalive %p\n", con);
 	con_out_kvec_reset(con);
 	con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
-	set_bit(WRITE_PENDING, &con->flags);
+	set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
 }
 
 /*
@@ -845,7 +854,7 @@ static void prepare_write_banner(struct ceph_connection *con)
 					&con->msgr->my_enc_addr);
 
 	con->out_more = 0;
-	set_bit(WRITE_PENDING, &con->flags);
+	set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
 }
 
 static int prepare_write_connect(struct ceph_connection *con)
@@ -896,7 +905,7 @@ static int prepare_write_connect(struct ceph_connection *con)
 					auth->authorizer_buf);
 
 	con->out_more = 0;
-	set_bit(WRITE_PENDING, &con->flags);
+	set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
 
 	return 0;
 }
@@ -1622,7 +1631,7 @@ static int process_connect(struct ceph_connection *con)
 			le32_to_cpu(con->in_reply.connect_seq));
 
 		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
-			set_bit(LOSSYTX, &con->flags);
+			set_bit(CON_FLAG_LOSSYTX, &con->flags);
 
 		con->delay = 0;      /* reset backoff memory */
 
@@ -2061,14 +2070,15 @@ do_next:
 			prepare_write_ack(con);
 			goto more;
 		}
-		if (test_and_clear_bit(KEEPALIVE_PENDING, &con->flags)) {
+		if (test_and_clear_bit(CON_FLAG_KEEPALIVE_PENDING,
+				       &con->flags)) {
 			prepare_write_keepalive(con);
 			goto more;
 		}
 	}
 
 	/* Nothing to do! */
-	clear_bit(WRITE_PENDING, &con->flags);
+	clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
 	dout("try_write nothing else to write.\n");
 	ret = 0;
 out:
@@ -2241,7 +2251,7 @@ static void con_work(struct work_struct *work)
 
 	mutex_lock(&con->mutex);
 restart:
-	if (test_and_clear_bit(SOCK_CLOSED, &con->flags)) {
+	if (test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) {
 		switch (con->state) {
 		case CON_STATE_CONNECTING:
 			con->error_msg = "connection failed";
@@ -2260,7 +2270,7 @@ restart:
 		goto fault;
 	}
 
-	if (test_and_clear_bit(BACKOFF, &con->flags)) {
+	if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) {
 		dout("con_work %p backing off\n", con);
 		if (queue_delayed_work(ceph_msgr_wq, &con->work,
 				       round_jiffies_relative(con->delay))) {
@@ -2336,7 +2346,7 @@ static void ceph_fault(struct ceph_connection *con)
 
 	con_close_socket(con);
 
-	if (test_bit(LOSSYTX, &con->flags)) {
+	if (test_bit(CON_FLAG_LOSSYTX, &con->flags)) {
 		dout("fault on LOSSYTX channel, marking CLOSED\n");
 		con->state = CON_STATE_CLOSED;
 		goto out_unlock;
@@ -2356,9 +2366,9 @@ static void ceph_fault(struct ceph_connection *con)
 	/* If there are no messages queued or keepalive pending, place
 	 * the connection in a STANDBY state */
 	if (list_empty(&con->out_queue) &&
-	    !test_bit(KEEPALIVE_PENDING, &con->flags)) {
+	    !test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)) {
 		dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
-		clear_bit(WRITE_PENDING, &con->flags);
+		clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
 		con->state = CON_STATE_STANDBY;
 	} else {
 		/* retry after a delay. */
@@ -2383,7 +2393,7 @@ static void ceph_fault(struct ceph_connection *con)
 			 * that when con_work restarts we schedule the
 			 * delay then.
 			 */
-			set_bit(BACKOFF, &con->flags);
+			set_bit(CON_FLAG_BACKOFF, &con->flags);
 		}
 	}
 
@@ -2440,8 +2450,8 @@ static void clear_standby(struct ceph_connection *con)
 		dout("clear_standby %p and ++connect_seq\n", con);
 		con->state = CON_STATE_PREOPEN;
 		con->connect_seq++;
-		WARN_ON(test_bit(WRITE_PENDING, &con->flags));
-		WARN_ON(test_bit(KEEPALIVE_PENDING, &con->flags));
+		WARN_ON(test_bit(CON_FLAG_WRITE_PENDING, &con->flags));
+		WARN_ON(test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags));
 	}
 }
 
@@ -2482,7 +2492,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 
 	/* if there wasn't anything waiting to send before, queue
 	 * new work */
-	if (test_and_set_bit(WRITE_PENDING, &con->flags) == 0)
+	if (test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0)
 		queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_send);
@@ -2571,8 +2581,8 @@ void ceph_con_keepalive(struct ceph_connection *con)
 	mutex_lock(&con->mutex);
 	clear_standby(con);
 	mutex_unlock(&con->mutex);
-	if (test_and_set_bit(KEEPALIVE_PENDING, &con->flags) == 0 &&
-	    test_and_set_bit(WRITE_PENDING, &con->flags) == 0)
+	if (test_and_set_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags) == 0 &&
+	    test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0)
 		queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_keepalive);

From 43c7427d100769451601b8a36988ac0528ce0124 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Fri, 20 Jul 2012 17:30:40 -0700
Subject: [PATCH 117/132] libceph: clear all flags on con_close

Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index b872db5c4989..fa16f2cc35fe 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -528,6 +528,8 @@ void ceph_con_close(struct ceph_connection *con)
 	clear_bit(CON_FLAG_LOSSYTX, &con->flags); /* so we retry next connect */
 	clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags);
 	clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
+	clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags);
+	clear_bit(CON_FLAG_BACKOFF, &con->flags);
 
 	reset_connection(con);
 	con->peer_global_seq = 0;

From 756a16a5d53c072cd1f937b261641c1a3b53fdd7 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 16:26:13 -0700
Subject: [PATCH 118/132] libceph: be less chatty about stray replies

There are many (normal) conditions that can lead to us getting
unexpected replies, include cluster topology changes, osd failures,
and timeouts.  There's no need to spam the console about it.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/osd_client.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ad427e698890..42119c05e82c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2037,8 +2037,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	if (!req) {
 		*skip = 1;
 		m = NULL;
-		pr_info("get_reply unknown tid %llu from osd%d\n", tid,
-			osd->o_osd);
+		dout("get_reply unknown tid %llu from osd%d\n", tid,
+		     osd->o_osd);
 		goto out;
 	}
 

From 09d9032751bfdabcba8752ab4b01a30fedea3d67 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 16:27:48 -0700
Subject: [PATCH 119/132] ceph: update MAINTAINERS file

 * shiny new inktank.com email addresses
 * add include/linux/crush directory (previous oversight)

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 MAINTAINERS | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 55f0fda602ec..3afd002ef95a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1761,15 +1761,16 @@ F:	arch/powerpc/oprofile/*cell*
 F:	arch/powerpc/platforms/cell/
 
 CEPH DISTRIBUTED FILE SYSTEM CLIENT
-M:	Sage Weil <sage@newdream.net>
+M:	Sage Weil <sage@inktank.com>
 L:	ceph-devel@vger.kernel.org
-W:	http://ceph.newdream.net/
+W:	http://ceph.com/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
 S:	Supported
 F:	Documentation/filesystems/ceph.txt
 F:	fs/ceph
 F:	net/ceph
 F:	include/linux/ceph
+F:	include/linux/crush
 
 CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
 L:	linux-usb@vger.kernel.org
@@ -5573,10 +5574,12 @@ S:	Supported
 F:	arch/hexagon/
 
 RADOS BLOCK DEVICE (RBD)
-F:	include/linux/qnxtypes.h
-M:	Yehuda Sadeh <yehuda@hq.newdream.net>
-M:	Sage Weil <sage@newdream.net>
+M:	Yehuda Sadeh <yehuda@inktank.com>
+M:	Sage Weil <sage@inktank.com>
+M:	Alex Elder <elder@inktank.com>
 M:	ceph-devel@vger.kernel.org
+W:	http://ceph.com/
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
 S:	Supported
 F:	drivers/block/rbd.c
 F:	drivers/block/rbd_types.h

From 8007b8d626b49c34fb146ec16dc639d8b10c862f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 18:16:16 -0700
Subject: [PATCH 120/132] libceph: fix handling of immediate socket connect
 failure

If the connect() call immediately fails such that sock == NULL, we
still need con_close_socket() to reset our socket state to CLOSED.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/messenger.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index fa16f2cc35fe..a6a0c7a0a979 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -224,6 +224,8 @@ static void con_sock_state_init(struct ceph_connection *con)
 	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
 	if (WARN_ON(old_state != CON_SOCK_STATE_NEW))
 		printk("%s: unexpected old state %d\n", __func__, old_state);
+	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+	     CON_SOCK_STATE_CLOSED);
 }
 
 static void con_sock_state_connecting(struct ceph_connection *con)
@@ -233,6 +235,8 @@ static void con_sock_state_connecting(struct ceph_connection *con)
 	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING);
 	if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED))
 		printk("%s: unexpected old state %d\n", __func__, old_state);
+	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+	     CON_SOCK_STATE_CONNECTING);
 }
 
 static void con_sock_state_connected(struct ceph_connection *con)
@@ -242,6 +246,8 @@ static void con_sock_state_connected(struct ceph_connection *con)
 	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED);
 	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING))
 		printk("%s: unexpected old state %d\n", __func__, old_state);
+	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+	     CON_SOCK_STATE_CONNECTED);
 }
 
 static void con_sock_state_closing(struct ceph_connection *con)
@@ -253,6 +259,8 @@ static void con_sock_state_closing(struct ceph_connection *con)
 			old_state != CON_SOCK_STATE_CONNECTED &&
 			old_state != CON_SOCK_STATE_CLOSING))
 		printk("%s: unexpected old state %d\n", __func__, old_state);
+	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+	     CON_SOCK_STATE_CLOSING);
 }
 
 static void con_sock_state_closed(struct ceph_connection *con)
@@ -262,8 +270,11 @@ static void con_sock_state_closed(struct ceph_connection *con)
 	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
 	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED &&
 		    old_state != CON_SOCK_STATE_CLOSING &&
-		    old_state != CON_SOCK_STATE_CONNECTING))
+		    old_state != CON_SOCK_STATE_CONNECTING &&
+		    old_state != CON_SOCK_STATE_CLOSED))
 		printk("%s: unexpected old state %d\n", __func__, old_state);
+	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+	     CON_SOCK_STATE_CLOSED);
 }
 
 /*
@@ -448,14 +459,14 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
  */
 static int con_close_socket(struct ceph_connection *con)
 {
-	int rc;
+	int rc = 0;
 
 	dout("con_close_socket on %p sock %p\n", con, con->sock);
-	if (!con->sock)
-		return 0;
-	rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
-	sock_release(con->sock);
-	con->sock = NULL;
+	if (con->sock) {
+		rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
+		sock_release(con->sock);
+		con->sock = NULL;
+	}
 
 	/*
 	 * Forcibly clear the SOCK_CLOSED flag.  It gets set
@@ -464,6 +475,7 @@ static int con_close_socket(struct ceph_connection *con)
 	 * shut the socket down.
 	 */
 	clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags);
+
 	con_sock_state_closed(con);
 	return rc;
 }

From 4f471e4a9c7db0256834e1b376ea50c82e345c3c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 18:16:40 -0700
Subject: [PATCH 121/132] libceph: revoke mon_client messages on session
 restart

Revoke all mon_client messages when we shut down the old connection.
This is mostly moot since we are re-using the same ceph_connection,
but it is cleaner.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/mon_client.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index bfd21a891d69..105d533b55f3 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -118,6 +118,9 @@ static void __close_session(struct ceph_mon_client *monc)
 {
 	dout("__close_session closing mon%d\n", monc->cur_mon);
 	ceph_msg_revoke(monc->m_auth);
+	ceph_msg_revoke_incoming(monc->m_auth_reply);
+	ceph_msg_revoke(monc->m_subscribe);
+	ceph_msg_revoke_incoming(monc->m_subscribe_ack);
 	ceph_con_close(&monc->con);
 	monc->cur_mon = -1;
 	monc->pending_auth = 0;
@@ -685,6 +688,7 @@ static void __resend_generic_request(struct ceph_mon_client *monc)
 	for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
 		req = rb_entry(p, struct ceph_mon_generic_request, node);
 		ceph_msg_revoke(req->request);
+		ceph_msg_revoke_incoming(req->reply);
 		ceph_con_send(&monc->con, ceph_msg_get(req->request));
 	}
 }

From 7b862e07b1a4d5c963d19027f10ea78085f27f9b Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 18:16:56 -0700
Subject: [PATCH 122/132] libceph: verify state after retaking con lock after
 dispatch

We drop the con mutex when delivering a message.  When we retake the
lock, we need to verify we are still in the OPEN state before
preparing to read the next tag, or else we risk stepping on a
connection that has been closed.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/messenger.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index a6a0c7a0a979..feb5a2ac724c 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2003,7 +2003,6 @@ static void process_message(struct ceph_connection *con)
 	con->ops->dispatch(con, msg);
 
 	mutex_lock(&con->mutex);
-	prepare_read_tag(con);
 }
 
 
@@ -2213,6 +2212,8 @@ more:
 		if (con->in_tag == CEPH_MSGR_TAG_READY)
 			goto more;
 		process_message(con);
+		if (con->state == CON_STATE_OPEN)
+			prepare_read_tag(con);
 		goto more;
 	}
 	if (con->in_tag == CEPH_MSGR_TAG_ACK) {

From 8636ea672f0c5ab7478c42c5b6705ebd1db7eb6a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 18:17:13 -0700
Subject: [PATCH 123/132] libceph: avoid dropping con mutex before fault

The ceph_fault() function takes the con mutex, so we should avoid
dropping it before calling it.  This fixes a potential race with
another thread calling ceph_con_close(), or _open(), or similar (we
don't reverify con->state after retaking the lock).

Add annotation so that lockdep realizes we will drop the mutex before
returning.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/messenger.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index feb5a2ac724c..c3b628c76194 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2336,7 +2336,6 @@ done_unlocked:
 	return;
 
 fault:
-	mutex_unlock(&con->mutex);
 	ceph_fault(con);     /* error/fault path */
 	goto done_unlocked;
 }
@@ -2347,9 +2346,8 @@ fault:
  * exponential backoff
  */
 static void ceph_fault(struct ceph_connection *con)
+	__releases(con->mutex)
 {
-	mutex_lock(&con->mutex);
-
 	pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
 	       ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
 	dout("fault %p state %lu to peer %s\n",

From 4740a623d20c51d167da7f752b63e2b8714b2543 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 18:19:30 -0700
Subject: [PATCH 124/132] libceph: change ceph_con_in_msg_alloc convention to
 be less weird

This function's calling convention is very limiting.  In particular,
we can't return any error other than ENOMEM (and only implicitly),
which is a problem (see next patch).

Instead, return an normal 0 or error code, and make the skip a pointer
output parameter.  Drop the useless in_hdr argument (we have the con
pointer).

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/messenger.c | 56 ++++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 25 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index c3b628c76194..13b549b6b1bf 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1733,9 +1733,7 @@ static int read_partial_message_section(struct ceph_connection *con,
 	return 1;
 }
 
-static bool ceph_con_in_msg_alloc(struct ceph_connection *con,
-				struct ceph_msg_header *hdr);
-
+static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip);
 
 static int read_partial_message_pages(struct ceph_connection *con,
 				      struct page **pages,
@@ -1864,9 +1862,14 @@ static int read_partial_message(struct ceph_connection *con)
 
 	/* allocate message? */
 	if (!con->in_msg) {
+		int skip = 0;
+
 		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
 		     con->in_hdr.front_len, con->in_hdr.data_len);
-		if (ceph_con_in_msg_alloc(con, &con->in_hdr)) {
+		ret = ceph_con_in_msg_alloc(con, &skip);
+		if (ret < 0)
+			return ret;
+		if (skip) {
 			/* skip this message */
 			dout("alloc_msg said skip message\n");
 			BUG_ON(con->in_msg);
@@ -1876,12 +1879,8 @@ static int read_partial_message(struct ceph_connection *con)
 			con->in_seq++;
 			return 0;
 		}
-		if (!con->in_msg) {
-			con->error_msg =
-				"error allocating memory for incoming message";
-			return -ENOMEM;
-		}
 
+		BUG_ON(!con->in_msg);
 		BUG_ON(con->in_msg->con != con);
 		m = con->in_msg;
 		m->front.iov_len = 0;    /* haven't read it yet */
@@ -2715,43 +2714,50 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
  * connection, and save the result in con->in_msg.  Uses the
  * connection's private alloc_msg op if available.
  *
- * Returns true if the message should be skipped, false otherwise.
- * If true is returned (skip message), con->in_msg will be NULL.
- * If false is returned, con->in_msg will contain a pointer to the
- * newly-allocated message, or NULL in case of memory exhaustion.
+ * Returns 0 on success, or a negative error code.
+ *
+ * On success, if we set *skip = 1:
+ *  - the next message should be skipped and ignored.
+ *  - con->in_msg == NULL
+ * or if we set *skip = 0:
+ *  - con->in_msg is non-null.
+ * On error (ENOMEM, EAGAIN, ...),
+ *  - con->in_msg == NULL
  */
-static bool ceph_con_in_msg_alloc(struct ceph_connection *con,
-				struct ceph_msg_header *hdr)
+static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
 {
+	struct ceph_msg_header *hdr = &con->in_hdr;
 	int type = le16_to_cpu(hdr->type);
 	int front_len = le32_to_cpu(hdr->front_len);
 	int middle_len = le32_to_cpu(hdr->middle_len);
-	int ret;
+	int ret = 0;
 
 	BUG_ON(con->in_msg != NULL);
 
 	if (con->ops->alloc_msg) {
-		int skip = 0;
-
 		mutex_unlock(&con->mutex);
-		con->in_msg = con->ops->alloc_msg(con, hdr, &skip);
+		con->in_msg = con->ops->alloc_msg(con, hdr, skip);
 		mutex_lock(&con->mutex);
 		if (con->in_msg) {
 			con->in_msg->con = con->ops->get(con);
 			BUG_ON(con->in_msg->con == NULL);
 		}
-		if (skip)
+		if (*skip) {
 			con->in_msg = NULL;
-
-		if (!con->in_msg)
-			return skip != 0;
+			return 0;
+		}
+		if (!con->in_msg) {
+			con->error_msg =
+				"error allocating memory for incoming message";
+			return -ENOMEM;
+		}
 	}
 	if (!con->in_msg) {
 		con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
 		if (!con->in_msg) {
 			pr_err("unable to allocate msg type %d len %d\n",
 			       type, front_len);
-			return false;
+			return -ENOMEM;
 		}
 		con->in_msg->con = con->ops->get(con);
 		BUG_ON(con->in_msg->con == NULL);
@@ -2767,7 +2773,7 @@ static bool ceph_con_in_msg_alloc(struct ceph_connection *con,
 		}
 	}
 
-	return false;
+	return ret;
 }
 
 
From 6139919133377652992a5fe134e22abce3e9c25e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 30 Jul 2012 18:19:45 -0700
Subject: [PATCH 125/132] libceph: recheck con state after allocating incoming
 message

We drop the lock when calling the ->alloc_msg() con op, which means
we need to (a) not clobber con->in_msg without the mutex held, and (b)
we need to verify that we are still in the OPEN state when we retake
it to avoid causing any mayhem.  If the state does change, -EAGAIN
will get us back to con_work() and loop.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/messenger.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 13b549b6b1bf..b6655b131558 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2735,9 +2735,16 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
 	BUG_ON(con->in_msg != NULL);
 
 	if (con->ops->alloc_msg) {
+		struct ceph_msg *msg;
+
 		mutex_unlock(&con->mutex);
-		con->in_msg = con->ops->alloc_msg(con, hdr, skip);
+		msg = con->ops->alloc_msg(con, hdr, skip);
 		mutex_lock(&con->mutex);
+		if (con->state != CON_STATE_OPEN) {
+			ceph_msg_put(msg);
+			return -EAGAIN;
+		}
+		con->in_msg = msg;
 		if (con->in_msg) {
 			con->in_msg->con = con->ops->get(con);
 			BUG_ON(con->in_msg->con == NULL);

From 4e891e0af0f0011c90067373c46d7228568ec079 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 10 Jul 2012 20:30:10 -0500
Subject: [PATCH 126/132] rbd: have __rbd_add_snap_dev() return a pointer

It's not obvious whether the snapshot pointer whose address is
provided to __rbd_add_snap_dev() will be assigned by that function.
Change it to return the snapshot, or a pointer-coded errno in the
event of a failure.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 30eb01e300a4..aba0d71a0345 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2029,15 +2029,21 @@ static int rbd_register_snap_dev(struct rbd_snap *snap,
 	return ret;
 }
 
-static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
-			      int i, const char *name,
-			      struct rbd_snap **snapp)
+static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
+					      int i, const char *name)
 {
+	struct rbd_snap *snap;
 	int ret;
-	struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
+
+	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
 	if (!snap)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
+
+	ret = -ENOMEM;
 	snap->name = kstrdup(name, GFP_KERNEL);
+	if (!snap->name)
+		goto err;
+
 	snap->size = rbd_dev->header.snap_sizes[i];
 	snap->id = rbd_dev->header.snapc->snaps[i];
 	if (device_is_registered(&rbd_dev->dev)) {
@@ -2045,12 +2051,14 @@ static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
 		if (ret < 0)
 			goto err;
 	}
-	*snapp = snap;
-	return 0;
+
+	return snap;
+
 err:
 	kfree(snap->name);
 	kfree(snap);
-	return ret;
+
+	return ERR_PTR(ret);
 }
 
 /*
@@ -2083,7 +2091,6 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
 	const char *name, *first_name;
 	int i = rbd_dev->header.total_snaps;
 	struct rbd_snap *snap, *old_snap = NULL;
-	int ret;
 	struct list_head *p, *n;
 
 	first_name = rbd_dev->header.snap_names;
@@ -2126,9 +2133,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
 			if (cur_id >= old_snap->id)
 				break;
 			/* a new snapshot */
-			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
-			if (ret < 0)
-				return ret;
+			snap = __rbd_add_snap_dev(rbd_dev, i - 1, name);
+			if (IS_ERR(snap))
+				return PTR_ERR(snap);
 
 			/* note that we add it backward so using n and not p */
 			list_add(&snap->node, n);
@@ -2142,9 +2149,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
 			WARN_ON(1);
 			return -EINVAL;
 		}
-		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
-		if (ret < 0)
-			return ret;
+		snap = __rbd_add_snap_dev(rbd_dev, i - 1, name);
+		if (IS_ERR(snap))
+			return PTR_ERR(snap);
 		list_add(&snap->node, &rbd_dev->snaps);
 	}
 

From 57cfc1060f35ac2345cb37ea474f9644ac5cfd75 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 26 Jun 2012 12:57:03 -0700
Subject: [PATCH 127/132] rbd: make rbd_create_rw_ops() return a pointer

Either rbd_create_rw_ops() will succeed, or it will fail because a
memory allocation failed.  Have it just return a valid pointer or
null rather than stuffing a pointer into a provided address and
returning an errno.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 68 +++++++++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 30 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index aba0d71a0345..28670c0c68d5 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -788,22 +788,24 @@ err_out:
 /*
  * helpers for osd request op vectors.
  */
-static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
-			    int num_ops,
-			    int opcode,
-			    u32 payload_len)
+static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
+					int opcode, u32 payload_len)
 {
-	*ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
-		       GFP_NOIO);
-	if (!*ops)
-		return -ENOMEM;
-	(*ops)[0].op = opcode;
+	struct ceph_osd_req_op *ops;
+
+	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
+	if (!ops)
+		return NULL;
+
+	ops[0].op = opcode;
+
 	/*
 	 * op extent offset and length will be set later on
 	 * in calc_raw_layout()
 	 */
-	(*ops)[0].payload_len = payload_len;
-	return 0;
+	ops[0].payload_len = payload_len;
+
+	return ops;
 }
 
 static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
@@ -1040,8 +1042,9 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 
 	if (!orig_ops) {
 		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
-		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
-		if (ret < 0)
+		ret = -ENOMEM;
+		ops = rbd_create_rw_ops(1, opcode, payload_len);
+		if (!ops)
 			goto done;
 
 		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
@@ -1104,8 +1107,9 @@ static int rbd_do_op(struct request *rq,
 
 	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
 
-	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
-	if (ret < 0)
+	ret = -ENOMEM;
+	ops = rbd_create_rw_ops(1, opcode, payload_len);
+	if (!ops)
 		goto done;
 
 	/* we've taken care of segment sizes earlier when we
@@ -1191,9 +1195,9 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 	struct ceph_osd_req_op *ops;
 	int ret;
 
-	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
-	if (ret < 0)
-		return ret;
+	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
+	if (!ops)
+		return -ENOMEM;
 
 	ops[0].watch.ver = cpu_to_le64(ver);
 	ops[0].watch.cookie = notify_id;
@@ -1241,10 +1245,11 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
 {
 	struct ceph_osd_req_op *ops;
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	int ret;
 
-	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
-	if (ret < 0)
-		return ret;
+	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
+	if (!ops)
+		return -ENOMEM;
 
 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
 				     (void *)rbd_dev, &rbd_dev->watch_event);
@@ -1284,10 +1289,11 @@ fail:
 static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
 {
 	struct ceph_osd_req_op *ops;
+	int ret;
 
-	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
-	if (ret < 0)
-		return ret;
+	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
+	if (!ops)
+		return -ENOMEM;
 
 	ops[0].watch.ver = 0;
 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
@@ -1335,9 +1341,9 @@ static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
 	int payload_len = sizeof(u32) + sizeof(u32);
 	int ret;
 
-	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
-	if (ret < 0)
-		return ret;
+	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
+	if (!ops)
+		return -ENOMEM;
 
 	info.rbd_dev = rbd_dev;
 
@@ -1388,10 +1394,12 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 	struct ceph_osd_req_op *ops;
 	int class_name_len = strlen(class_name);
 	int method_name_len = strlen(method_name);
-	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
+	int ret;
+
+	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
 				    class_name_len + method_name_len + len);
-	if (ret < 0)
-		return ret;
+	if (!ops)
+		return -ENOMEM;
 
 	ops[0].cls.class_name = class_name;
 	ops[0].cls.class_len = (__u8) class_name_len;

From d67d4be56a3ec8d07b1f29aab6095b363085b028 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 13 Jul 2012 20:35:11 -0500
Subject: [PATCH 128/132] rbd: pass null version pointer in add_snap()

rbd_header_add_snap() passes the address of a version variable to
rbd_req_sync_exec(), but it ignores the result.  Just pass a null
pointer instead.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 28670c0c68d5..e1fa12b2ae2e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1668,7 +1668,6 @@ static int rbd_header_add_snap(struct rbd_device *rbd_dev,
 	u64 new_snapid;
 	int ret;
 	void *data, *p, *e;
-	u64 ver;
 	struct ceph_mon_client *monc;
 
 	/* we should create a snapshot only if we're pointing at the head */
@@ -1693,7 +1692,7 @@ static int rbd_header_add_snap(struct rbd_device *rbd_dev,
 
 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
 				"rbd", "snap_add",
-				data, p - data, &ver);
+				data, p - data, NULL);
 
 	kfree(data);
 

From 913d2fdcf60576cbbd82969fcb2bc78a4d59ba33 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 26 Jun 2012 12:57:03 -0700
Subject: [PATCH 129/132] rbd: always pass ops array to rbd_req_sync_op()

All of the callers of rbd_req_sync_op() except one pass a non-null
"ops" pointer.  The only one that does not is rbd_req_sync_read(),
which passes CEPH_OSD_OP_READ as its "opcode" and, CEPH_OSD_FLAG_READ
for "flags".

By allocating the ops array in rbd_req_sync_read() and moving the
special case code for the null ops pointer into it, it becomes
clear that much of that code is not even necessary.

In addition, the "opcode" argument to rbd_req_sync_op() is never
actually used, so get rid of that.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 46 ++++++++++++++++-----------------------------
 1 file changed, 16 insertions(+), 30 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index e1fa12b2ae2e..1222d583ac2c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1020,9 +1020,8 @@ static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg
 static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			   struct ceph_snap_context *snapc,
 			   u64 snapid,
-			   int opcode,
 			   int flags,
-			   struct ceph_osd_req_op *orig_ops,
+			   struct ceph_osd_req_op *ops,
 			   const char *object_name,
 			   u64 ofs, u64 len,
 			   char *buf,
@@ -1032,28 +1031,14 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 	int ret;
 	struct page **pages;
 	int num_pages;
-	struct ceph_osd_req_op *ops = orig_ops;
-	u32 payload_len;
+
+	BUG_ON(ops == NULL);
 
 	num_pages = calc_pages_for(ofs , len);
 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
-	if (!orig_ops) {
-		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
-		ret = -ENOMEM;
-		ops = rbd_create_rw_ops(1, opcode, payload_len);
-		if (!ops)
-			goto done;
-
-		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
-			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
-			if (ret < 0)
-				goto done_ops;
-		}
-	}
-
 	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
 			  object_name, ofs, len, NULL,
 			  pages, num_pages,
@@ -1063,14 +1048,11 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			  NULL,
 			  linger_req, ver);
 	if (ret < 0)
-		goto done_ops;
+		goto done;
 
 	if ((flags & CEPH_OSD_FLAG_READ) && buf)
 		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
 
-done_ops:
-	if (!orig_ops)
-		rbd_destroy_ops(ops);
 done:
 	ceph_release_page_vector(pages, num_pages);
 	return ret;
@@ -1177,12 +1159,20 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 			  char *buf,
 			  u64 *ver)
 {
-	return rbd_req_sync_op(rbd_dev, NULL,
+	struct ceph_osd_req_op *ops;
+	int ret;
+
+	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
+	if (!ops)
+		return -ENOMEM;
+
+	ret = rbd_req_sync_op(rbd_dev, NULL,
 			       snapid,
-			       CEPH_OSD_OP_READ,
 			       CEPH_OSD_FLAG_READ,
-			       NULL,
-			       object_name, ofs, len, buf, NULL, ver);
+			       ops, object_name, ofs, len, buf, NULL, ver);
+	rbd_destroy_ops(ops);
+
+	return ret;
 }
 
 /*
@@ -1262,7 +1252,6 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
 
 	ret = rbd_req_sync_op(rbd_dev, NULL,
 			      CEPH_NOSNAP,
-			      0,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      ops,
 			      rbd_dev->header_name,
@@ -1301,7 +1290,6 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
 
 	ret = rbd_req_sync_op(rbd_dev, NULL,
 			      CEPH_NOSNAP,
-			      0,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      ops,
 			      rbd_dev->header_name,
@@ -1360,7 +1348,6 @@ static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
 
 	ret = rbd_req_sync_op(rbd_dev, NULL,
 			       CEPH_NOSNAP,
-			       0,
 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			       ops,
 			       rbd_dev->header_name,
@@ -1411,7 +1398,6 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 
 	ret = rbd_req_sync_op(rbd_dev, NULL,
 			       CEPH_NOSNAP,
-			       0,
 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			       ops,
 			       object_name, 0, 0, NULL, NULL, ver);

From ccece235d3737221e7a1118fdbd8474112adac84 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 10 Jul 2012 20:30:10 -0500
Subject: [PATCH 130/132] rbd: fixes in rbd_header_from_disk()

This fixes a few issues in rbd_header_from_disk():
    - There is a check intended to catch overflow, but it's wrong in
      two ways.
	- First, the type we don't want to overflow is size_t, not
	  unsigned int, and there is now a SIZE_MAX we can use for
	  use with that type.
	- Second, we're allocating the snapshot ids and snapshot
	  image sizes separately (each has type u64; on disk they
          grouped together as a rbd_image_header_ondisk structure).
	  So we can use the size of u64 in this overflow check.
    - If there are no snapshots, then there should be no snapshot
      names.  Enforce this, and issue a warning if we encounter a
      header with no snapshots but a non-zero snap_names_len.
    - When saving the snapshot names into the header, be more direct
      in defining the offset in the on-disk structure from which
      they're being copied by using "snap_count" rather than "i"
      in the array index.
    - If an error occurs, the "snapc" and "snap_names" fields are
      freed at the end of the function.  Make those fields be null
      pointers after they're freed, to be explicit that they are
      no longer valid.
    - Finally, move the definition of the local variable "i" to the
      innermost scope in which it's needed.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 1222d583ac2c..34676937d2d2 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -494,14 +494,14 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 				 struct rbd_image_header_ondisk *ondisk,
 				 u32 allocated_snaps)
 {
-	u32 i, snap_count;
+	u32 snap_count;
 
 	if (!rbd_dev_ondisk_valid(ondisk))
 		return -ENXIO;
 
 	snap_count = le32_to_cpu(ondisk->snap_count);
-	if (snap_count > (UINT_MAX - sizeof(struct ceph_snap_context))
-			 / sizeof (*ondisk))
+	if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context))
+				 / sizeof (u64))
 		return -EINVAL;
 	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
 				snap_count * sizeof(u64),
@@ -509,8 +509,8 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 	if (!header->snapc)
 		return -ENOMEM;
 
-	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
 	if (snap_count) {
+		header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
 		header->snap_names = kmalloc(header->snap_names_len,
 					     GFP_KERNEL);
 		if (!header->snap_names)
@@ -520,6 +520,8 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 		if (!header->snap_sizes)
 			goto err_names;
 	} else {
+		WARN_ON(ondisk->snap_names_len);
+		header->snap_names_len = 0;
 		header->snap_names = NULL;
 		header->snap_sizes = NULL;
 	}
@@ -544,6 +546,8 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 	header->total_snaps = snap_count;
 
 	if (snap_count && allocated_snaps == snap_count) {
+		int i;
+
 		for (i = 0; i < snap_count; i++) {
 			header->snapc->snaps[i] =
 				le64_to_cpu(ondisk->snaps[i].id);
@@ -552,7 +556,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 		}
 
 		/* copy snapshot names */
-		memcpy(header->snap_names, &ondisk->snaps[i],
+		memcpy(header->snap_names, &ondisk->snaps[snap_count],
 			header->snap_names_len);
 	}
 
@@ -560,10 +564,14 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 
 err_sizes:
 	kfree(header->snap_sizes);
+	header->snap_sizes = NULL;
 err_names:
 	kfree(header->snap_names);
+	header->snap_names = NULL;
 err_snapc:
 	kfree(header->snapc);
+	header->snapc = NULL;
+
 	return -ENOMEM;
 }
 

From b813623ab95d0b4bbeb22e160bd5461965d0c571 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 25 Jul 2012 09:32:41 -0500
Subject: [PATCH 131/132] rbd: return obj version in __rbd_refresh_header()

Add a new parameter to __rbd_refresh_header() through which the
version of the header object is passed back to the caller.  In most
cases this isn't needed.  The main motivation is to normalize
(almost) all calls to __rbd_refresh_header() so they are all
wrapped immediately by mutex_lock()/mutex_unlock().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 34676937d2d2..de981ac72362 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -240,7 +240,7 @@ static void rbd_put_dev(struct rbd_device *rbd_dev)
 	put_device(&rbd_dev->dev);
 }
 
-static int __rbd_refresh_header(struct rbd_device *rbd_dev);
+static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
 
 static int rbd_open(struct block_device *bdev, fmode_t mode)
 {
@@ -1226,8 +1226,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 		rbd_dev->header_name, (unsigned long long) notify_id,
 		(unsigned int) opcode);
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-	rc = __rbd_refresh_header(rbd_dev);
-	hver = rbd_dev->header.obj_version;
+	rc = __rbd_refresh_header(rbd_dev, &hver);
 	mutex_unlock(&ctl_mutex);
 	if (rc)
 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
@@ -1707,7 +1706,7 @@ static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
 /*
  * only read the first part of the ondisk header, without the snaps info
  */
-static int __rbd_refresh_header(struct rbd_device *rbd_dev)
+static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
 {
 	int ret;
 	struct rbd_image_header h;
@@ -1732,6 +1731,8 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev)
 	/* osd requests may still refer to snapc */
 	ceph_put_snap_context(rbd_dev->header.snapc);
 
+	if (hver)
+		*hver = h.obj_version;
 	rbd_dev->header.obj_version = h.obj_version;
 	rbd_dev->header.image_size = h.image_size;
 	rbd_dev->header.total_snaps = h.total_snaps;
@@ -1901,17 +1902,13 @@ static ssize_t rbd_image_refresh(struct device *dev,
 				 size_t size)
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
-	int rc;
-	int ret = size;
+	int ret;
 
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
-	rc = __rbd_refresh_header(rbd_dev);
-	if (rc < 0)
-		ret = rc;
-
+	ret = __rbd_refresh_header(rbd_dev, NULL);
 	mutex_unlock(&ctl_mutex);
-	return ret;
+
+	return ret < 0 ? ret : size;
 }
 
 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
@@ -2200,7 +2197,7 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
 		ret = rbd_req_sync_watch(rbd_dev);
 		if (ret == -ERANGE) {
 			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-			rc = __rbd_refresh_header(rbd_dev);
+			rc = __rbd_refresh_header(rbd_dev, NULL);
 			mutex_unlock(&ctl_mutex);
 			if (rc < 0)
 				return rc;
@@ -2650,7 +2647,7 @@ static ssize_t rbd_snap_add(struct device *dev,
 	if (ret < 0)
 		goto err_unlock;
 
-	ret = __rbd_refresh_header(rbd_dev);
+	ret = __rbd_refresh_header(rbd_dev, NULL);
 	if (ret < 0)
 		goto err_unlock;
 

From 1fe5e9932156f6122c3b1ff6ba7541c27c86718c Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 25 Jul 2012 09:32:41 -0500
Subject: [PATCH 132/132] rbd: create rbd_refresh_helper()

Create a simple helper that handles the common case of calling
__rbd_refresh_header() while holding the ctl_mutex.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index de981ac72362..9917943a3572 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -240,7 +240,7 @@ static void rbd_put_dev(struct rbd_device *rbd_dev)
 	put_device(&rbd_dev->dev);
 }
 
-static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
+static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
 
 static int rbd_open(struct block_device *bdev, fmode_t mode)
 {
@@ -1225,9 +1225,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
 		rbd_dev->header_name, (unsigned long long) notify_id,
 		(unsigned int) opcode);
-	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-	rc = __rbd_refresh_header(rbd_dev, &hver);
-	mutex_unlock(&ctl_mutex);
+	rc = rbd_refresh_header(rbd_dev, &hver);
 	if (rc)
 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
 			   " update snaps: %d\n", rbd_dev->major, rc);
@@ -1751,6 +1749,17 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
 	return ret;
 }
 
+static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
+{
+	int ret;
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	ret = __rbd_refresh_header(rbd_dev, hver);
+	mutex_unlock(&ctl_mutex);
+
+	return ret;
+}
+
 static int rbd_init_disk(struct rbd_device *rbd_dev)
 {
 	struct gendisk *disk;
@@ -1904,9 +1913,7 @@ static ssize_t rbd_image_refresh(struct device *dev,
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 	int ret;
 
-	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-	ret = __rbd_refresh_header(rbd_dev, NULL);
-	mutex_unlock(&ctl_mutex);
+	ret = rbd_refresh_header(rbd_dev, NULL);
 
 	return ret < 0 ? ret : size;
 }
@@ -2196,9 +2203,7 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
 	do {
 		ret = rbd_req_sync_watch(rbd_dev);
 		if (ret == -ERANGE) {
-			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-			rc = __rbd_refresh_header(rbd_dev, NULL);
-			mutex_unlock(&ctl_mutex);
+			rc = rbd_refresh_header(rbd_dev, NULL);
 			if (rc < 0)
 				return rc;
 		}