Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2

* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: (27 commits)
  ocfs2: Cache extent records
  ocfs2: Remember rw lock level during direct io
  ocfs2: Fix up i_blocks calculation to know about holes
  ocfs2: Fix extent lookup to return true size of holes
  ocfs2: Read from an unwritten extent returns zeros
  ocfs2: make room for unwritten extents flag
  ocfs2: Use own splice write actor
  ocfs2: Use do_sync_mapping_range() in ocfs2_zero_tail_for_truncate()
  [PATCH] Turn do_sync_file_range() into do_sync_mapping_range()
  ocfs2: zero tail of sparse files on truncate
  ocfs2: Teach ocfs2_get_block() about holes
  ocfs2: remove ocfs2_prepare_write() and ocfs2_commit_write()
  ocfs2: teach ocfs2_file_aio_write() about sparse files
  ocfs2: Turn off shared writeable mmap for local files systems with holes.
  ocfs2: abstract out allocation locking
  ocfs2: teach extend/truncate about sparse files
  ocfs2: temporarily remove extent map caching
  ocfs2: sparse b-tree support
  ocfs2: small cleanup of ocfs2_request_delete()
  ocfs2: remove unused code
  ...
2007-04-27 10:29:56 -07:00 · 2007-04-27 10:29:56 -07:00 · ea6db58f3e
parent c58b8e4a25 8341897882
commit ea6db58f3e
31 changed files with 4786 additions and 2326 deletions
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@ -31,7 +31,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 			handle_t *handle,
 			struct inode *inode,
 			struct buffer_head *fe_bh,
-			u64 blkno,
+			u32 cpos,
+			u64 start_blk,
 			u32 new_clusters,
 			struct ocfs2_alloc_context *meta_ac);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
@ -70,6 +71,8 @@ struct ocfs2_truncate_context {
 	struct buffer_head *tc_last_eb_bh;
 };

+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+				 u64 new_i_size);
 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct buffer_head *fe_bh,
@ -79,4 +82,26 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 			  struct buffer_head *fe_bh,
 			  struct ocfs2_truncate_context *tc);

+int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
+		    u32 cpos, struct buffer_head **leaf_bh);
+
+/*
+ * Report how many clusters a single extent record covers.
+ */
+static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
+					      struct ocfs2_extent_rec *rec)
+{
+	/*
+	 * Leaf records (l_tree_depth == 0) keep their count in the
+	 * narrower e_leaf_clusters field, interior records keep it in
+	 * e_int_clusters. The leaf field is smaller because leaf
+	 * records also need room for a flags field, which is what
+	 * makes unwritten extents possible.
+	 */
+	if (!el->l_tree_depth)
+		return le16_to_cpu(rec->e_leaf_clusters);
+
+	return le32_to_cpu(rec->e_int_clusters);
+}
+
 #endif /* OCFS2_ALLOC_H */
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@ -30,12 +30,83 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 							 unsigned from,
 							 unsigned to);

+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+			  struct inode *inode, unsigned int from,
+			  unsigned int to, int new);
+
+int walk_page_buffers(	handle_t *handle,
+			struct buffer_head *head,
+			unsigned from,
+			unsigned to,
+			int *partial,
+			int (*fn)(	handle_t *handle,
+					struct buffer_head *bh));
+
+struct ocfs2_write_ctxt;
+typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
+				u64 *, unsigned int *, unsigned int *);
+
+ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
+				     size_t count, ocfs2_page_writer *actor,
+				     void *priv);
+
+/*
+ * State handed to an ocfs2_page_writer callback.
+ * NOTE(review): presumably set up by ocfs2_buffered_write_cluster();
+ * its implementation is not visible from this header - confirm.
+ */
+struct ocfs2_write_ctxt {
+	size_t				w_count;
+	loff_t				w_pos;
+	u32				w_cpos;
+	unsigned int			w_finished_copy;
+
+	/* This is true if page_size > cluster_size */
+	unsigned int			w_large_pages;
+
+	/* Filler callback and private data */
+	ocfs2_page_writer		*w_write_data_page;
+	void				*w_private;
+
+	/* Only valid for the filler callback */
+	struct page			*w_this_page;
+	unsigned int			w_this_page_new;
+};
+
+/*
+ * Cursor over the source buffer of a buffered write coming from an
+ * iovec array.
+ * NOTE(review): presumably the private data consumed by
+ * ocfs2_map_and_write_user_data() - confirm against its definition.
+ */
+struct ocfs2_buffered_write_priv {
+	/* Set by ocfs2_get_write_source(): the kmap()ed user page, or
+	 * the buffer address itself when running under KERNEL_DS */
+	char				*b_src_buf;
+	const struct iovec		*b_cur_iov; /* Current iovec */
+	size_t				b_cur_off; /* Offset in the
+						    * current iovec */
+};
+int ocfs2_map_and_write_user_data(struct inode *inode,
+				  struct ocfs2_write_ctxt *wc,
+				  u64 *p_blkno,
+				  unsigned int *ret_from,
+				  unsigned int *ret_to);
+
+/*
+ * Splice-side description of the data being written: the splice
+ * descriptor, the pipe and the pipe buffer currently in use.
+ * NOTE(review): presumably the private data consumed by
+ * ocfs2_map_and_write_splice_data() - confirm against its definition.
+ */
+struct ocfs2_splice_write_priv {
+	struct splice_desc		*s_sd;
+	struct pipe_buffer		*s_buf;
+	struct pipe_inode_info		*s_pipe;
+	/* Neither offset value is ever larger than one page */
+	unsigned int			s_offset;
+	unsigned int			s_buf_offset;
+};
+int ocfs2_map_and_write_splice_data(struct inode *inode,
+				    struct ocfs2_write_ctxt *wc,
+				    u64 *p_blkno,
+				    unsigned int *ret_from,
+				    unsigned int *ret_to);
+
 /* all ocfs2_dio_end_io()'s fault */
 #define ocfs2_iocb_is_rw_locked(iocb) \
 	test_bit(0, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_set_rw_locked(iocb) \
-	set_bit(0, (unsigned long *)&iocb->private)
+/*
+ * Mark the iocb as holding the rw lock and remember the lock level.
+ * Bit 0 of iocb->private is the "locked" flag tested by
+ * ocfs2_iocb_is_rw_locked(); bit 1 is set when level is non-zero and
+ * cleared otherwise.
+ */
+static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
+{
+	unsigned long *bits = (unsigned long *)&iocb->private;
+
+	set_bit(0, bits);
+	if (!level)
+		clear_bit(1, bits);
+	else
+		set_bit(1, bits);
+}
 #define ocfs2_iocb_clear_rw_locked(iocb) \
 	clear_bit(0, (unsigned long *)&iocb->private)
-
+#define ocfs2_iocb_rw_locked_level(iocb) \
+	test_bit(1, (unsigned long *)&iocb->private)
 #endif /* OCFS2_FILE_H */
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@ -46,6 +46,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
+#include <linux/reboot.h>

 #include "heartbeat.h"
 #include "nodemanager.h"
@ -72,7 +73,9 @@ static void o2quo_fence_self(void)
 	/* panic spins with interrupts enabled.  with preempt
 	 * threads can still schedule, etc, etc */
 	o2hb_stop_all_regions();
-	panic("ocfs2 is very sorry to be fencing this system by panicing\n");
+
+	printk("ocfs2 is very sorry to be fencing this system by restarting\n");
+	emergency_restart();
 }

 /* Indicate that a timeout occured on a hearbeat region write. The
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@ -38,6 +38,9 @@
 * locking semantics of the file system using the protocol.  It should 
 * be somewhere else, I'm sure, but right now it isn't.
 *
+ * New in version 8:
+ * 	- Replace delete inode votes with a cluster lock
+ *
 * New in version 7:
 * 	- DLM join domain includes the live nodemap
 *
@ -57,7 +60,7 @@
 * 	- full 64 bit i_size in the metadata lock lvbs
 * 	- introduction of "rw" lock and pushing meta/data locking down
 */
-#define O2NET_PROTOCOL_VERSION 7ULL
+#define O2NET_PROTOCOL_VERSION 8ULL
 struct o2net_handshake {
 	__be64	protocol_version;
 	__be64	connector_id;
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@ -358,15 +358,17 @@ int ocfs2_do_extend_dir(struct super_block *sb,
 {
 	int status;
 	int extend;
-	u64 p_blkno;
+	u64 p_blkno, v_blkno;

 	spin_lock(&OCFS2_I(dir)->ip_lock);
 	extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
 	spin_unlock(&OCFS2_I(dir)->ip_lock);

 	if (extend) {
-		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
-						    parent_fe_bh, handle,
+		u32 offset = OCFS2_I(dir)->ip_clusters;
+
+		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
+						    1, parent_fe_bh, handle,
 						    data_ac, meta_ac, NULL);
 		BUG_ON(status == -EAGAIN);
 		if (status < 0) {
@ -375,9 +377,8 @@ int ocfs2_do_extend_dir(struct super_block *sb,
 		}
 	}

-	status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
-						   (sb->s_blocksize_bits - 9)),
-					     1, &p_blkno, NULL);
+	v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
+	status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@ -486,7 +487,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,

 	dir_i_size += dir->i_sb->s_blocksize;
 	i_size_write(dir, dir_i_size);
-	dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
+	dir->i_blocks = ocfs2_inode_sector_count(dir);
 	status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
 	if (status < 0) {
 		mlog_errno(status);
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@ -430,11 +430,10 @@ redo_bucket:

 			dlm_lockres_put(res);

-			cond_resched_lock(&dlm->spinlock);
-
 			if (dropped)
 				goto redo_bucket;
 		}
+		cond_resched_lock(&dlm->spinlock);
 		num += n;
 		mlog(0, "%s: touched %d lockreses in bucket %d "
 		     "(tot=%d)\n", dlm->name, n, i, num);
@ -1035,7 +1034,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
 {
 	int status = 0, tmpstat, node;
 	struct domain_join_ctxt *ctxt;
-	enum dlm_query_join_response response;
+	enum dlm_query_join_response response = JOIN_DISALLOW;

 	mlog_entry("%p", dlm);

--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@ -611,6 +611,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 			}
 		} while (status != 0);

+		spin_lock(&dlm_reco_state_lock);
 		switch (ndata->state) {
 			case DLM_RECO_NODE_DATA_INIT:
 			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
@ -641,6 +642,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 				     ndata->node_num, dead_node);
 				break;
 		}
+		spin_unlock(&dlm_reco_state_lock);
 	}

 	mlog(0, "done requesting all lock info\n");
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
 	.flags		= 0,
 };

+/*
+ * Lock operations for an inode's open lock: only get_osb is provided
+ * and no flags are set.
+ */
+static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
+	.get_osb	= ocfs2_get_inode_osb,
+	.flags		= 0,
+};
+
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
 	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
 		lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
-		lockres->l_type == OCFS2_LOCK_TYPE_RW;
+		lockres->l_type == OCFS2_LOCK_TYPE_RW ||
+		lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
 }

 static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
 		case OCFS2_LOCK_TYPE_DATA:
 			ops = &ocfs2_inode_data_lops;
 			break;
+		case OCFS2_LOCK_TYPE_OPEN:
+			ops = &ocfs2_inode_open_lops;
+			break;
 		default:
 			mlog_bug_on_msg(1, "type: %d\n", type);
 			ops = NULL; /* thanks, gcc */
@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
 		goto bail;
 	}

+	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
 bail:
 	mlog_exit(ret);
 	return ret;
@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
 	mlog_exit_void();
 }

+/*
+ * Take the inode's open lock at PR level.
+ *
+ * Returns 0 without touching the lock when ocfs2_mount_local()
+ * reports a local mount. Otherwise returns the status of
+ * ocfs2_cluster_lock(); negative values are logged via mlog_errno().
+ */
+int ocfs2_open_lock(struct inode *inode)
+{
+	int status = 0;
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_super *osb;
+
+	/* Must run before anything below dereferences inode. */
+	BUG_ON(!inode);
+
+	osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %llu take PRMODE open lock\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	if (ocfs2_mount_local(osb))
+		goto out;
+
+	lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+	status = ocfs2_cluster_lock(osb, lockres, LKM_PRMODE, 0, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Try to take the inode's open lock without waiting on other nodes.
+ *
+ * @write: non-zero requests EXMODE, zero requests PRMODE.
+ *
+ * Returns 0 without touching the lock when ocfs2_mount_local()
+ * reports a local mount. Otherwise returns the status of
+ * ocfs2_cluster_lock(); failures are deliberately not logged here
+ * because -EAGAIN is an expected answer (see below).
+ */
+int ocfs2_try_open_lock(struct inode *inode, int write)
+{
+	int status = 0, level;
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_super *osb;
+
+	/* Must run before anything below dereferences inode. */
+	BUG_ON(!inode);
+
+	osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %llu try to take %s open lock\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	if (ocfs2_mount_local(osb))
+		goto out;
+
+	lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+	level = write ? LKM_EXMODE : LKM_PRMODE;
+
+	/*
+	 * The file system may already be holding a PRMODE/EXMODE open
+	 * lock. Since we pass LKM_NOQUEUE, the request won't block
+	 * waiting on other nodes and the -EAGAIN will indicate to the
+	 * caller that this inode is still in use.
+	 */
+	status = ocfs2_cluster_lock(osb, lockres, level, LKM_NOQUEUE, 0);
+
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Release the inode's open lock at whichever levels currently have
+ * holders: PR if l_ro_holders is non-zero, EX if l_ex_holders is
+ * non-zero. Nothing is released on a local mount.
+ */
+void ocfs2_open_unlock(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+	mlog_entry_void();
+
+	mlog(0, "inode %llu drop open lock\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	if (!ocfs2_mount_local(osb)) {
+		if (lockres->l_ro_holders)
+			ocfs2_cluster_unlock(osb, lockres, LKM_PRMODE);
+		if (lockres->l_ex_holders)
+			ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
+	}
+
+	mlog_exit_void();
+}
+
 int ocfs2_data_lock_full(struct inode *inode,
 			 int write,
 			 int arg_flags)
@ -1387,8 +1495,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 	if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
 		inode->i_blocks = 0;
 	else
-		inode->i_blocks =
-			ocfs2_align_bytes_to_sectors(i_size_read(inode));
+		inode->i_blocks = ocfs2_inode_sector_count(inode);

 	inode->i_uid     = be32_to_cpu(lvb->lvb_iuid);
 	inode->i_gid     = be32_to_cpu(lvb->lvb_igid);
@ -1479,12 +1586,15 @@ static int ocfs2_meta_lock_update(struct inode *inode,
 {
 	int status = 0;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_lock_res *lockres = NULL;
+	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
 	struct ocfs2_dinode *fe;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

 	mlog_entry_void();

+	if (ocfs2_mount_local(osb))
+		goto bail;
+
 	spin_lock(&oi->ip_lock);
 	if (oi->ip_flags & OCFS2_INODE_DELETED) {
 		mlog(0, "Orphaned inode %llu was deleted while we "
@ -1496,22 +1606,16 @@ static int ocfs2_meta_lock_update(struct inode *inode,
 	}
 	spin_unlock(&oi->ip_lock);

-	if (!ocfs2_mount_local(osb)) {
-		lockres = &oi->ip_meta_lockres;
-
-		if (!ocfs2_should_refresh_lock_res(lockres))
-			goto bail;
-	}
+	if (!ocfs2_should_refresh_lock_res(lockres))
+		goto bail;

 	/* This will discard any caching information we might have had
 	 * for the inode metadata. */
 	ocfs2_metadata_cache_purge(inode);

-	/* will do nothing for inode types that don't use the extent
-	 * map (directories, bitmap files, etc) */
 	ocfs2_extent_map_trunc(inode, 0);

-	if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) {
+	if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
 		mlog(0, "Trusting LVB on inode %llu\n",
 		     (unsigned long long)oi->ip_blkno);
 		ocfs2_refresh_inode_from_lvb(inode);
@ -1558,8 +1662,7 @@ static int ocfs2_meta_lock_update(struct inode *inode,

 	status = 0;
 bail_refresh:
-	if (lockres)
-		ocfs2_complete_lock_res_refresh(lockres, status);
+	ocfs2_complete_lock_res_refresh(lockres, status);
 bail:
 	mlog_exit(status);
 	return status;
@ -1630,7 +1733,6 @@ int ocfs2_meta_lock_full(struct inode *inode,
 		wait_event(osb->recovery_event,
 			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));

-	acquired = 0;
 	lockres = &OCFS2_I(inode)->ip_meta_lockres;
 	level = ex ? LKM_EXMODE : LKM_PRMODE;
 	dlm_flags = 0;
@ -2458,12 +2560,19 @@ int ocfs2_drop_inode_locks(struct inode *inode)
 	 * ocfs2_clear_inode has done it for us. */

 	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
-			      &OCFS2_I(inode)->ip_data_lockres);
+			      &OCFS2_I(inode)->ip_open_lockres);
 	if (err < 0)
 		mlog_errno(err);

 	status = err;

+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_data_lockres);
+	if (err < 0)
+		mlog_errno(err);
+	if (err < 0 && !status)
+		status = err;
+
 	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
 			      &OCFS2_I(inode)->ip_meta_lockres);
 	if (err < 0)
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode,
 		       int write);
 int ocfs2_rw_lock(struct inode *inode, int write);
 void ocfs2_rw_unlock(struct inode *inode, int write);
+int ocfs2_open_lock(struct inode *inode);
+int ocfs2_try_open_lock(struct inode *inode, int write);
+void ocfs2_open_unlock(struct inode *inode);
 int ocfs2_meta_lock_atime(struct inode *inode,
 			  struct vfsmount *vfsmnt,
 			  int *level);
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@ -25,22 +25,29 @@
 #ifndef _EXTENT_MAP_H
 #define _EXTENT_MAP_H

-int init_ocfs2_extent_maps(void);
-void exit_ocfs2_extent_maps(void);
+struct ocfs2_extent_map_item {
+	unsigned int			ei_cpos;
+	unsigned int			ei_phys;
+	unsigned int			ei_clusters;
+	unsigned int			ei_flags;

-/*
- * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
- * to be held.  The allocation cannot change at all while the map is
- * in the process of being updated.
- */
-int ocfs2_extent_map_init(struct inode *inode);
-int ocfs2_extent_map_append(struct inode *inode,
-			    struct ocfs2_extent_rec *rec,
-			    u32 new_clusters);
-int ocfs2_extent_map_get_blocks(struct inode *inode,
-				u64 v_blkno, int count,
-				u64 *p_blkno, int *ret_count);
-int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
-int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
+	struct list_head		ei_list;
+};
+
+#define OCFS2_MAX_EXTENT_MAP_ITEMS			3
+struct ocfs2_extent_map {
+	unsigned int			em_num_items;
+	struct list_head		em_list;
+};
+
+void ocfs2_extent_map_init(struct inode *inode);
+void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster);
+void ocfs2_extent_map_insert_rec(struct inode *inode,
+				 struct ocfs2_extent_rec *rec);
+
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
+		       u32 *num_clusters, unsigned int *extent_flags);
+int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
+				u64 *ret_count, unsigned int *extent_flags);

 #endif  /* _EXTENT_MAP_H */
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@ -33,6 +33,7 @@
 #include <linux/sched.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/mount.h>
+#include <linux/writeback.h>

 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@ -215,7 +216,7 @@ int ocfs2_set_inode_size(handle_t *handle,

 	mlog_entry_void();
 	i_size_write(inode, new_i_size);
-	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;

 	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 {
 	int status;
 	handle_t *handle;
+	struct ocfs2_dinode *di;

 	mlog_entry_void();

@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 		goto out;
 	}

-	status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	/*
+	 * Do this before setting i_size.
+	 */
+	status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
+	if (status) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	i_size_write(inode, new_i_size);
+	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	di = (struct ocfs2_dinode *) fe_bh->b_data;
+	di->i_size = cpu_to_le64(new_i_size);
+	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
 	if (status < 0)
 		mlog_errno(status);

+out_commit:
 	ocfs2_commit_trans(osb, handle);
 out:
+
 	mlog_exit(status);
 	return status;
 }
@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
 		mlog_errno(status);
 		goto bail;
 	}
-	ocfs2_data_unlock(inode, 1);
-
-	if (le32_to_cpu(fe->i_clusters) ==
-	    ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
-		mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
-		     fe->i_clusters);
-		/* No allocation change is required, so lets fast path
-		 * this truncate. */
-		status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
-		if (status < 0)
-			mlog_errno(status);
-		goto bail;
-	}

 	/* alright, we're going to need to do a full blown alloc size
 	 * change. Orphan the inode so that recovery can complete the
@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
 	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail;
+		goto bail_unlock_data;
 	}

 	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail;
+		goto bail_unlock_data;
 	}

 	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail;
+		goto bail_unlock_data;
 	}

 	/* TODO: orphan dir cleanup here. */
+bail_unlock_data:
+	ocfs2_data_unlock(inode, 1);
+
 bail:

 	mlog_exit(status);
@ -397,6 +416,7 @@ bail:
 */
 int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       struct inode *inode,
+			       u32 *logical_offset,
 			       u32 clusters_to_add,
 			       struct buffer_head *fe_bh,
 			       handle_t *handle,
@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
 	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
-	status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
-				     num_bits, meta_ac);
+	status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
+				     *logical_offset, block, num_bits,
+				     meta_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}

-	le32_add_cpu(&fe->i_clusters, num_bits);
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
 	status = ocfs2_journal_dirty(handle, fe_bh);
 	if (status < 0) {
 		mlog_errno(status);
@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	}

 	clusters_to_add -= num_bits;
+	*logical_offset += num_bits;

 	if (clusters_to_add) {
 		mlog(0, "need to alloc once more, clusters = %u, wanted = "
@ -494,14 +511,87 @@ leave:
 	return status;
 }

+/*
+ * Work out which allocators a pending allocation needs and reserve
+ * from them: clusters_to_add bits of data, plus new metadata when the
+ * inode may not have enough free extent records.
+ *
+ * Called from ocfs2_extend_allocation() for file systems which don't
+ * support holes, and from ocfs2_write() for file systems which
+ * understand sparse inodes.
+ *
+ * Both *data_ac and *meta_ac start out NULL. On a non-zero return any
+ * metadata context that was reserved is freed and *meta_ac is NULL
+ * again. -ENOSPC from either reservation is returned without being
+ * logged.
+ */
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
+			  u32 clusters_to_add,
+			  struct ocfs2_alloc_context **data_ac,
+			  struct ocfs2_alloc_context **meta_ac)
+{
+	int ret, num_free_extents;
+	int want_meta;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	*data_ac = NULL;
+	*meta_ac = NULL;
+
+	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
+	     "clusters_to_add = %u\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
+	     le32_to_cpu(di->i_clusters), clusters_to_add);
+
+	num_free_extents = ocfs2_num_free_extents(osb, inode, di);
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Sparse allocation file systems have to reserve room for
+	 * expansion more conservatively: the allocation itself runs
+	 * with a journal handle open, so going back for a cluster
+	 * lock at that point (because there was no room for another
+	 * extent) would violate ordering rules.
+	 *
+	 * Most of the time we'll only be seeing this 1 cluster at a
+	 * time anyway.
+	 */
+	want_meta = !num_free_extents ||
+		    (ocfs2_sparse_alloc(osb) &&
+		     num_free_extents < clusters_to_add);
+	if (want_meta) {
+		ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
+		if (ret < 0) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
+	if (ret < 0 && ret != -ENOSPC)
+		mlog_errno(ret);
+
+out:
+	/*
+	 * Only *meta_ac can need cleaning up: we cannot have an error
+	 * and a non null *data_ac.
+	 */
+	if (ret && *meta_ac) {
+		ocfs2_free_alloc_context(*meta_ac);
+		*meta_ac = NULL;
+	}
+
+	return ret;
+}
+
 static int ocfs2_extend_allocation(struct inode *inode,
 				   u32 clusters_to_add)
 {
 	int status = 0;
 	int restart_func = 0;
 	int drop_alloc_sem = 0;
-	int credits, num_free_extents;
-	u32 prev_clusters;
+	int credits;
+	u32 prev_clusters, logical_start;
 	struct buffer_head *bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	handle_t *handle = NULL;
@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode,

 	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);

+	/*
+	 * This function only exists for file systems which don't
+	 * support holes.
+	 */
+	BUG_ON(ocfs2_sparse_alloc(osb));
+
 	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
 				  OCFS2_BH_CACHED, inode);
 	if (status < 0) {
@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode,
 		goto leave;
 	}

+	logical_start = OCFS2_I(inode)->ip_clusters;
+
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);

-	mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
-	     "clusters_to_add = %u\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
-	     fe->i_clusters, clusters_to_add);
-
-	num_free_extents = ocfs2_num_free_extents(osb,
-						  inode,
-						  fe);
-	if (num_free_extents < 0) {
-		status = num_free_extents;
-		mlog_errno(status);
-		goto leave;
-	}
-
-	if (!num_free_extents) {
-		status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
-		if (status < 0) {
-			if (status != -ENOSPC)
-				mlog_errno(status);
-			goto leave;
-		}
-	}
-
-	status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac);
-	if (status < 0) {
-		if (status != -ENOSPC)
-			mlog_errno(status);
-		goto leave;
-	}
-
 	/* blocks peope in read/write from reading our allocation
 	 * until we're done changing it. We depend on i_mutex to block
 	 * other extend/truncate calls while we're here. Ordering wrt
@ -566,6 +634,13 @@ restart_all:
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 	drop_alloc_sem = 1;

+	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
+				       &meta_ac);
+	if (status) {
+		mlog_errno(status);
+		goto leave;
+	}
+
 	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
@ -590,6 +665,7 @@ restarted_transaction:

 	status = ocfs2_do_extend_allocation(osb,
 					    inode,
+					    &logical_start,
 					    clusters_to_add,
 					    bh,
 					    handle,
@ -778,7 +854,7 @@ static int ocfs2_extend_file(struct inode *inode,
 			     size_t tail_to_skip)
 {
 	int ret = 0;
-	u32 clusters_to_add;
+	u32 clusters_to_add = 0;

 	BUG_ON(!tail_to_skip && !di_bh);

@ -790,6 +866,11 @@ static int ocfs2_extend_file(struct inode *inode,
  		goto out;
 	BUG_ON(new_i_size < i_size_read(inode));

+	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+		BUG_ON(tail_to_skip != 0);
+		goto out_update_size;
+	}
+
 	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 
 		OCFS2_I(inode)->ip_clusters;

@ -825,6 +906,7 @@ static int ocfs2_extend_file(struct inode *inode,
 		goto out_unlock;
 	}

+out_update_size:
 	if (!tail_to_skip) {
 		/* We're being called from ocfs2_setattr() which wants
 		 * us to update i_size */
@ -834,7 +916,8 @@ static int ocfs2_extend_file(struct inode *inode,
 	}

 out_unlock:
-	ocfs2_data_unlock(inode, 1);
+	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+		ocfs2_data_unlock(inode, 1);

 out:
 	return ret;
@ -972,7 +1055,8 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)

 	ret = ocfs2_meta_lock(inode, NULL, 0);
 	if (ret) {
-		mlog_errno(ret);
+		if (ret != -ENOENT)
+			mlog_errno(ret);
 		goto out;
 	}

@ -1035,10 +1119,49 @@ out:
 	return ret;
 }

+/*
+ * Scan the clusters backing the byte range starting at pos for count
+ * bytes (inclusive), looking for holes and unwritten extents.
+ *
+ * Returns 1 as soon as a cluster with no physical mapping
+ * (phys_cpos == 0) or an extent flagged OCFS2_EXT_UNWRITTEN is found,
+ * a negative value if ocfs2_get_clusters() fails, and otherwise the
+ * result of the last lookup (0 when no lookup was needed).
+ */
+static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
+				       size_t count)
+{
+	struct super_block *sb = inode->i_sb;
+	u32 cpos, clusters, extent_len, phys_cpos;
+	unsigned int extent_flags;
+	int ret = 0;
+
+	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
+
+	for (; clusters; clusters -= extent_len, cpos += extent_len) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+					 &extent_flags);
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+
+		if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
+			ret = 1;
+			break;
+		}
+
+		/* Never step past the end of the requested range. */
+		if (extent_len > clusters)
+			extent_len = clusters;
+	}
+
+	return ret;
+}
+
+
 static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 					 loff_t *ppos,
 					 size_t count,
-					 int appending)
+					 int appending,
+					 int *direct_io)
 {
 	int ret = 0, meta_level = appending;
 	struct inode *inode = dentry->d_inode;
@ -1089,6 +1212,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 		} else {
 			saved_pos = *ppos;
 		}
+
+		if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+			loff_t end = saved_pos + count;
+
+			/*
+			 * Skip the O_DIRECT checks if we don't need
+			 * them.
+			 */
+			if (!direct_io || !(*direct_io))
+				break;
+
+			/*
+			 * Allowing concurrent direct writes means
+			 * i_size changes wouldn't be synchronized, so
+			 * one node could wind up truncating another
+			 * nodes writes.
+			 */
+			if (end > i_size_read(inode)) {
+				*direct_io = 0;
+				break;
+			}
+
+			/*
+			 * We don't fill holes during direct io, so
+			 * check for them here. If any are found, the
+			 * caller will have to retake some cluster
+			 * locks and initiate the io as buffered.
+			 */
+			ret = ocfs2_check_range_for_holes(inode, saved_pos,
+							  count);
+			if (ret == 1) {
+				*direct_io = 0;
+				ret = 0;
+			} else if (ret < 0)
+				mlog_errno(ret);
+			break;
+		}
+
+		/*
+		 * The rest of this loop is concerned with legacy file
+		 * systems which don't support sparse files.
+		 */
+
 		newsize = count + saved_pos;

 		mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
@ -1141,55 +1307,264 @@ out:
 	return ret;
 }

+/*
+ * Advance an iovec cursor by @bytes bytes.
+ *
+ * @iovp:  in/out pointer to the current iovec segment
+ * @basep: in/out byte offset already consumed within *iovp
+ * @bytes: number of bytes to skip over
+ *
+ * A segment that becomes fully consumed is stepped over and the offset
+ * is reset to 0.  The loop body always runs at least once, so a zero
+ * @bytes still steps past a segment that is already exhausted.
+ */
+static inline void
+ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
+{
+	const struct iovec *iov = *iovp;
+	size_t base = *basep;
+
+	do {
+		/*
+		 * Both operands are size_t; keep the result in a size_t
+		 * so that large values are not truncated or turned
+		 * negative by a narrowing conversion to int.
+		 */
+		size_t copy = min(bytes, iov->iov_len - base);
+
+		bytes -= copy;
+		base += copy;
+		if (iov->iov_len == base) {
+			iov++;
+			base = 0;
+		}
+	} while (bytes);
+	*iovp = iov;
+	*basep = base;
+}
+
+/*
+ * Find the source buffer for the next piece of a buffered write and
+ * store its address in bp->b_src_buf.
+ *
+ * Under KERNEL_DS the iovec address is used as-is and NULL is returned.
+ * Otherwise a single user page is pinned with get_user_pages() and
+ * mapped with kmap(); that page is returned to the caller.  If the
+ * page cannot be pinned, ERR_PTR(-EFAULT) is returned.
+ */
+static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
+					    const struct iovec *cur_iov,
+					    size_t iov_offset)
+{
+	struct page *src_page = NULL;
+	char *buf = cur_iov->iov_base + iov_offset;
+	int nr_pinned;
+
+	if (segment_eq(get_fs(), KERNEL_DS)) {
+		bp->b_src_buf = buf;
+		return src_page;
+	}
+
+	/*
+	 * Fault the user page in now, before any meta data locks are
+	 * taken, so that a page fault cannot invert the locking order.
+	 */
+	nr_pinned = get_user_pages(current, current->mm,
+				   (unsigned long)buf & PAGE_CACHE_MASK, 1,
+				   0, 0, &src_page, NULL);
+	if (nr_pinned != 1)
+		return ERR_PTR(-EFAULT);
+
+	bp->b_src_buf = kmap(src_page);
+	return src_page;
+}
+
+/*
+ * Unmap and release a source page.  Passing a NULL page is a no-op.
+ */
+static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
+				   struct page *page)
+{
+	if (!page)
+		return;
+
+	kunmap(page);
+	page_cache_release(page);
+}
+
+/*
+ * Write @count bytes described by @iov to @file at *ppos through
+ * ocfs2_buffered_write_cluster(), one call per loop iteration.
+ *
+ * @o_direct_written is the number of bytes of @iov to skip before the
+ * first buffered write.  *ppos is advanced by the amount written.
+ *
+ * Returns the total number of bytes written if that is non-zero,
+ * otherwise the error code (or 0).
+ */
+static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
+					 const struct iovec *iov,
+					 unsigned long nr_segs,
+					 size_t count,
+					 ssize_t o_direct_written)
+{
+	struct ocfs2_buffered_write_priv bp;
+	const struct iovec *cur_iov = iov;
+	size_t iov_offset = 0;
+	ssize_t nr, total = 0;
+	struct page *src;
+	int ret = 0;
+
+	/* Step over the part of the iovec a partial DIO write consumed. */
+	ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
+
+	for (;;) {
+		bp.b_cur_off = iov_offset;
+		bp.b_cur_iov = cur_iov;
+
+		src = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
+		if (IS_ERR(src)) {
+			ret = PTR_ERR(src);
+			break;
+		}
+
+		nr = ocfs2_buffered_write_cluster(file, *ppos, count,
+						  ocfs2_map_and_write_user_data,
+						  &bp);
+
+		ocfs2_put_write_source(&bp, src);
+
+		if (nr < 0) {
+			mlog_errno(nr);
+			ret = nr;
+			break;
+		}
+
+		total += nr;
+		*ppos += nr;
+		count -= nr;
+
+		ocfs2_set_next_iovec(&cur_iov, &iov_offset, nr);
+
+		if (!count)
+			break;
+	}
+
+	if (total)
+		return total;
+	return ret;
+}
+
+/*
+ * Validate an iovec array before a write.
+ *
+ * @iov:     the segments to check
+ * @counted: on success, receives the total length of the usable segments
+ * @nr_segs: in/out segment count; shortened if a later segment fails
+ *           access_ok()
+ *
+ * Returns 0 on success, -EINVAL if a length or the running total is
+ * negative when viewed as ssize_t, or -EFAULT if the very first
+ * segment fails access_ok().
+ */
+static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted,
+			     unsigned long *nr_segs)
+{
+	unsigned long i;
+	size_t total = 0;
+
+	for (i = 0; i < *nr_segs; i++) {
+		const struct iovec *vec = &iov[i];
+
+		total += vec->iov_len;
+
+		/* Reject a negative segment length or a wrapped total. */
+		if (unlikely((ssize_t)(total|vec->iov_len) < 0))
+			return -EINVAL;
+
+		if (!access_ok(VERIFY_READ, vec->iov_base, vec->iov_len)) {
+			if (i == 0)
+				return -EFAULT;
+
+			/* Keep only the segments before this bad one. */
+			*nr_segs = i;
+			total -= vec->iov_len;
+			break;
+		}
+	}
+
+	*counted = total;
+	return 0;
+}
+
 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 				    const struct iovec *iov,
 				    unsigned long nr_segs,
 				    loff_t pos)
 {
-	int ret, rw_level, have_alloc_sem = 0;
-	struct file *filp = iocb->ki_filp;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	int appending = filp->f_flags & O_APPEND ? 1 : 0;
+	int ret, direct_io, appending, rw_level, have_alloc_sem  = 0;
+	int can_do_direct, sync = 0;
+	ssize_t written = 0;
+	size_t ocount;		/* original count */
+	size_t count;		/* after file limit checks */
+	loff_t *ppos = &iocb->ki_pos;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_path.dentry->d_inode;

-	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
+	mlog_entry("(0x%p, %u, '%.*s')\n", file,
 		   (unsigned int)nr_segs,
-		   filp->f_path.dentry->d_name.len,
-		   filp->f_path.dentry->d_name.name);
+		   file->f_path.dentry->d_name.len,
+		   file->f_path.dentry->d_name.name);

-	/* happy write of zero bytes */
 	if (iocb->ki_left == 0)
 		return 0;

+	ret = ocfs2_check_iovec(iov, &ocount, &nr_segs);
+	if (ret)
+		return ret;
+
+	count = ocount;
+
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+	appending = file->f_flags & O_APPEND ? 1 : 0;
+	direct_io = file->f_flags & O_DIRECT ? 1 : 0;
+
 	mutex_lock(&inode->i_mutex);
+
+relock:
 	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
-	if (filp->f_flags & O_DIRECT) {
-		have_alloc_sem = 1;
+	if (direct_io) {
 		down_read(&inode->i_alloc_sem);
+		have_alloc_sem = 1;
 	}

 	/* concurrent O_DIRECT writes are allowed */
-	rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
+	rw_level = !direct_io;
 	ret = ocfs2_rw_lock(inode, rw_level);
 	if (ret < 0) {
-		rw_level = -1;
 		mlog_errno(ret);
-		goto out;
+		goto out_sems;
 	}

-	ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos,
-					    iocb->ki_left, appending);
+	can_do_direct = direct_io;
+	ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
+					    iocb->ki_left, appending,
+					    &can_do_direct);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}

+	/*
+	 * We can't complete the direct I/O as requested, fall back to
+	 * buffered I/O.
+	 */
+	if (direct_io && !can_do_direct) {
+		ocfs2_rw_unlock(inode, rw_level);
+		up_read(&inode->i_alloc_sem);
+
+		have_alloc_sem = 0;
+		rw_level = -1;
+
+		direct_io = 0;
+		sync = 1;
+		goto relock;
+	}
+
+	if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
+		sync = 1;
+
+	/*
+	 * XXX: Is it ok to execute these checks a second time?
+	 */
+	ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
+	if (ret)
+		goto out;
+
+	/*
+	 * Set pos so that sync_page_range_nolock() below understands
+	 * where to start from. We might've moved it around via the
+	 * calls above. The range we want to actually sync starts from
+	 * *ppos here.
+	 *
+	 */
+	pos = *ppos;
+
 	/* communicate with ocfs2_dio_end_io */
-	ocfs2_iocb_set_rw_locked(iocb);
+	ocfs2_iocb_set_rw_locked(iocb, rw_level);

-	ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos);
+	if (direct_io) {
+		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
+						    ppos, count, ocount);
+		if (written < 0) {
+			ret = written;
+			goto out_dio;
+		}
+	} else {
+		written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
+						    count, written);
+		if (written < 0) {
+			ret = written;
+			/*
+			 * Only log errors other than -EFAULT and -ENOSPC.
+			 * The test must use &&: with || it is true for
+			 * every value of ret and nothing is filtered.
+			 */
+			if (ret != -EFAULT && ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+	}

+out_dio:
 	/* buffered aio wouldn't have proper lock coverage today */
-	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+	BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));

 	/* 
 	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@ -1207,13 +1582,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	}

 out:
+	if (rw_level != -1)
+		ocfs2_rw_unlock(inode, rw_level);
+
+out_sems:
 	if (have_alloc_sem)
 		up_read(&inode->i_alloc_sem);
-	if (rw_level != -1) 
-		ocfs2_rw_unlock(inode, rw_level);
+
+	if (written > 0 && sync) {
+		ssize_t err;
+
+		err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
+		if (err < 0)
+			written = err;
+	}
+
 	mutex_unlock(&inode->i_mutex);

 	mlog_exit(ret);
+	return written ? written : ret;
+}
+
+/*
+ * Splice actor: copy the contents of one pipe buffer into the file
+ * through ocfs2_buffered_write_cluster().
+ *
+ * At most one page worth of data (bounded by the page offset of
+ * sd->pos) is written per invocation.  Returns the number of bytes
+ * written if that is non-zero, otherwise the error code (or 0).
+ */
+static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf,
+				    struct splice_desc *sd)
+{
+	struct ocfs2_splice_write_priv sp;
+	ssize_t copied = 0;
+	int ret, count, total = 0;
+
+	ret = buf->ops->pin(pipe, buf);
+	if (ret)
+		return ret;
+
+	sp.s_sd = sd;
+	sp.s_buf = buf;
+	sp.s_pipe = pipe;
+	sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
+	sp.s_buf_offset = buf->offset;
+
+	/* Never cross the end of the current page. */
+	count = sd->len;
+	if (count + sp.s_offset > PAGE_CACHE_SIZE)
+		count = PAGE_CACHE_SIZE - sp.s_offset;
+
+	for (;;) {
+		/*
+		 * One page may span several clusters when the page
+		 * size exceeds the cluster size, so this call can be
+		 * needed more than once; "total" tracks how far into
+		 * the request we are.
+		 */
+		copied = ocfs2_buffered_write_cluster(sd->file,
+						      (loff_t)sd->pos + total,
+						      count,
+						      ocfs2_map_and_write_splice_data,
+						      &sp);
+		if (copied < 0) {
+			mlog_errno(copied);
+			ret = copied;
+			break;
+		}
+
+		count -= copied;
+		sp.s_offset += copied;
+		sp.s_buf_offset += copied;
+		total += copied;
+
+		if (!count)
+			break;
+	}
+
+	/* ret is still 0 from the successful pin unless a write failed. */
+	if (total)
+		return total;
+	return ret;
+}
+
+/*
+ * Splice data from @pipe into @out using ocfs2_splice_write_actor().
+ *
+ * On a positive result *ppos is advanced by the number of bytes
+ * spliced.  If the file is open O_SYNC or the inode is IS_SYNC, data
+ * and metadata are then synced via generic_osync_inode(), and a sync
+ * error replaces the byte count as the return value.
+ *
+ * Returns whatever __splice_from_pipe() returned, or the sync error.
+ */
+static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
+					 struct file *out,
+					 loff_t *ppos,
+					 size_t len,
+					 unsigned int flags)
+{
+	/* ssize_t, matching this function's return type, to avoid narrowing. */
+	ssize_t ret;
+	int err;
+	struct address_space *mapping = out->f_mapping;
+	struct inode *inode = mapping->host;
+
+	ret = __splice_from_pipe(pipe, out, ppos, len, flags,
+				 ocfs2_splice_write_actor);
+	if (ret > 0) {
+		*ppos += ret;
+
+		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
+			err = generic_osync_inode(inode, mapping,
+						  OSYNC_METADATA|OSYNC_DATA);
+			if (err)
+				ret = err;
+		}
+	}
+
 	return ret;
 }

@ -1239,14 +1703,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 		goto out;
 	}

-	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0);
+	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
+					    NULL);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_unlock;
 	}

 	/* ok, we're done with i_size and alloc work */
-	ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
+	ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags);

 out_unlock:
 	ocfs2_rw_unlock(inode, 1);
@ -1323,7 +1788,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 		}
 		rw_level = 0;
 		/* communicate with ocfs2_dio_end_io */
-		ocfs2_iocb_set_rw_locked(iocb);
+		ocfs2_iocb_set_rw_locked(iocb, rw_level);
 	}

 	/*
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@ -39,12 +39,17 @@ enum ocfs2_alloc_restarted {
 };
 int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       struct inode *inode,
+			       u32 *cluster_start,
 			       u32 clusters_to_add,
 			       struct buffer_head *fe_bh,
 			       handle_t *handle,
 			       struct ocfs2_alloc_context *data_ac,
 			       struct ocfs2_alloc_context *meta_ac,
 			       enum ocfs2_alloc_restarted *reason);
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
+			  u32 clusters_to_add,
+			  struct ocfs2_alloc_context **data_ac,
+			  struct ocfs2_alloc_context **meta_ac);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		  struct kstat *stat);
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@ -89,24 +89,6 @@ void ocfs2_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_DIRSYNC;
 }

-struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
-				     u64 blkno,
-				     int delete_vote)
-{
-	struct ocfs2_find_inode_args args;
-
-	/* ocfs2_ilookup_for_vote should *only* be called from the
-	 * vote thread */
-	BUG_ON(current != osb->vote_task);
-
-	args.fi_blkno = blkno;
-	args.fi_flags = OCFS2_FI_FLAG_NOWAIT;
-	if (delete_vote)
-		args.fi_flags |= OCFS2_FI_FLAG_DELETE;
-	args.fi_ino = ino_from_blkno(osb->sb, blkno);
-	return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
-}
-
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
 {
 	struct inode *inode = NULL;
@ -182,28 +164,6 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
 	if (oi->ip_blkno != args->fi_blkno)
 		goto bail;

-	/* OCFS2_FI_FLAG_NOWAIT is *only* set from
-	 * ocfs2_ilookup_for_vote which won't create an inode for one
-	 * that isn't found. The vote thread which doesn't want to get
-	 * an inode which is in the process of going away - otherwise
-	 * the call to __wait_on_freeing_inode in find_inode_fast will
-	 * cause it to deadlock on an inode which may be waiting on a
-	 * vote (or lock release) in delete_inode */
-	if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
-	    (inode->i_state & (I_FREEING|I_CLEAR))) {
-		/* As stated above, we're not going to return an
-		 * inode.  In the case of a delete vote, the voting
-		 * code is going to signal the other node to go
-		 * ahead. Mark that state here, so this freeing inode
-		 * has the state when it gets to delete_inode. */
-		if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
-			spin_lock(&oi->ip_lock);
-			ocfs2_mark_inode_remotely_deleted(inode);
-			spin_unlock(&oi->ip_lock);
-		}
-		goto bail;
-	}
-
 	ret = 1;
 bail:
 	mlog_exit(ret);
@ -261,6 +221,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		goto bail;
 	}

+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
+
 	inode->i_version = 1;
 	inode->i_generation = le32_to_cpu(fe->i_generation);
 	inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
@ -272,8 +235,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
 		inode->i_blocks = 0;
 	else
-		inode->i_blocks =
-			ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
+		inode->i_blocks = ocfs2_inode_sector_count(inode);
 	inode->i_mapping->a_ops = &ocfs2_aops;
 	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
 	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
@ -288,10 +250,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 		     (unsigned long long)fe->i_blkno);

-	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
-	OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
-	OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
-
 	inode->i_nlink = le16_to_cpu(fe->i_links_count);

 	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
@ -347,6 +305,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,

 		ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
 					  OCFS2_LOCK_TYPE_META, 0, inode);
+
+		ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+					  OCFS2_LOCK_TYPE_OPEN, 0, inode);
 	}

 	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
@ -421,7 +382,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	 * cluster lock before trusting anything anyway.
 	 */
 	can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
-		&& !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK)
+		&& !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
 		&& !ocfs2_mount_local(osb);

 	/*
@ -438,7 +399,17 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 				  OCFS2_LOCK_TYPE_META,
 				  generation, inode);

+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+				  OCFS2_LOCK_TYPE_OPEN,
+				  0, inode);
+
 	if (can_lock) {
+		status = ocfs2_open_lock(inode);
+		if (status) {
+			make_bad_inode(inode);
+			mlog_errno(status);
+			return status;
+		}
 		status = ocfs2_meta_lock(inode, NULL, 0);
 		if (status) {
 			make_bad_inode(inode);
@ -447,6 +418,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 		}
 	}

+	if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
+		status = ocfs2_try_open_lock(inode, 0);
+		if (status) {
+			make_bad_inode(inode);	
+			return status;
+		}
+	}
+
 	status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
 				  can_lock ? inode : NULL);
 	if (status < 0) {
@ -507,50 +486,56 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 				     struct buffer_head *fe_bh)
 {
 	int status = 0;
-	handle_t *handle = NULL;
 	struct ocfs2_truncate_context *tc = NULL;
 	struct ocfs2_dinode *fe;
+	handle_t *handle = NULL;

 	mlog_entry_void();

 	fe = (struct ocfs2_dinode *) fe_bh->b_data;

-	/* zero allocation, zero truncate :) */
-	if (!fe->i_clusters)
-		goto bail;
+	if (fe->i_clusters) {
+		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			/*
+			 * Clear the error pointer so that the cleanup
+			 * code at "out" does not try to commit it.
+			 */
+			handle = NULL;
+			mlog_errno(status);
+			goto out;
+		}

-	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
+		status = ocfs2_journal_access(handle, inode, fe_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out;
+		}
+
+		i_size_write(inode, 0);
+
+		status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out;
+		}
+
+		ocfs2_commit_trans(osb, handle);
 		handle = NULL;
-		mlog_errno(status);
-		goto bail;
+
+		status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out;
+		}
+
+		status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out;
+		}
 	}

-	status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	ocfs2_commit_trans(osb, handle);
-	handle = NULL;
-
-	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-bail:
+out:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
-
 	mlog_exit(status);
 	return status;
 }
@ -678,10 +663,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
 	struct inode *orphan_dir_inode = NULL;
 	struct buffer_head *orphan_dir_bh = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di;

-	/* We've already voted on this so it should be readonly - no
-	 * spinlock needed. */
-	orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+	di = (struct ocfs2_dinode *) di_bh->b_data;
+	orphaned_slot = le16_to_cpu(di->i_orphaned_slot);

 	status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
 	if (status)
@ -839,11 +824,20 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 		goto bail;
 	}

-	status = ocfs2_request_delete_vote(inode);
-	/* -EBUSY means that other nodes are still using the
-	 * inode. We're done here though, so avoid doing anything on
-	 * disk and let them worry about deleting it. */
-	if (status == -EBUSY) {
+	/*
+	 * This is how ocfs2 determines whether an inode is still live
+	 * within the cluster. Every node takes a shared read lock on
+	 * the inode open lock in ocfs2_read_locked_inode(). When we
+	 * get to ->delete_inode(), each node tries to convert its
+	 * lock to an exclusive. Trylocks are serialized by the inode
+	 * meta data lock. If the upconvert succeeds, we know the inode
+	 * is no longer live and can be deleted.
+	 *
+	 * Though we call this with the meta data lock held, the
+	 * trylock keeps us from ABBA deadlock.
+	 */
+	status = ocfs2_try_open_lock(inode, 1);
+	if (status == -EAGAIN) {
 		status = 0;
 		mlog(0, "Skipping delete of %llu because it is in use on"
 		     "other nodes\n", (unsigned long long)oi->ip_blkno);
@ -854,21 +848,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 		goto bail;
 	}

-	spin_lock(&oi->ip_lock);
-	if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
-		/* Nobody knew which slot this inode was orphaned
-		 * into. This may happen during node death and
-		 * recovery knows how to clean it up so we can safely
-		 * ignore this inode for now on. */
-		mlog(0, "Nobody knew where inode %llu was orphaned!\n",
-		     (unsigned long long)oi->ip_blkno);
-	} else {
-		*wipe = 1;
-
-		mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n",
-		     (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot);
-	}
-	spin_unlock(&oi->ip_lock);
+	*wipe = 1;
+	mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
+	     (unsigned long long)oi->ip_blkno,
+	     le16_to_cpu(di->i_orphaned_slot));

 bail:
 	return status;
@ -1001,11 +984,16 @@ void ocfs2_clear_inode(struct inode *inode)
 	mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
 			"Inode=%lu\n", inode->i_ino);

+	/* Now that the delete_inode vote is gone, we have been holding
+	 * the open lock; it is time to drop the PR and EX open locks. */
+	ocfs2_open_unlock(inode);
+
 	/* Do these before all the other work so that we don't bounce
 	 * the vote thread while waiting to destroy the locks. */
 	ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
 	ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
 	ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
+	ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);

 	/* We very well may get a clear_inode before all an inodes
 	 * metadata has hit disk. Of course, we can't drop any cluster
@ -1020,8 +1008,7 @@ void ocfs2_clear_inode(struct inode *inode)
 			"Clear inode of %llu, inode has io markers\n",
 			(unsigned long long)oi->ip_blkno);

-	ocfs2_extent_map_drop(inode, 0);
-	ocfs2_extent_map_init(inode);
+	ocfs2_extent_map_trunc(inode, 0);

 	status = ocfs2_drop_inode_locks(inode);
 	if (status < 0)
@ -1030,6 +1017,7 @@ void ocfs2_clear_inode(struct inode *inode)
 	ocfs2_lock_res_free(&oi->ip_rw_lockres);
 	ocfs2_lock_res_free(&oi->ip_meta_lockres);
 	ocfs2_lock_res_free(&oi->ip_data_lockres);
+	ocfs2_lock_res_free(&oi->ip_open_lockres);

 	ocfs2_metadata_cache_purge(inode);

@ -1086,9 +1074,6 @@ void ocfs2_drop_inode(struct inode *inode)
 	mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
 	     (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);

-	/* Testing ip_orphaned_slot here wouldn't work because we may
-	 * not have gotten a delete_inode vote from any other nodes
-	 * yet. */
 	if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
 		generic_delete_inode(inode);
 	else
@ -1121,8 +1106,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
 		return NULL;
 	}

-	tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
-					     &p_blkno, NULL);
+	tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
+					     NULL);
 	if (tmperr < 0) {
 		mlog_errno(tmperr);
 		goto fail;
@ -1259,7 +1244,7 @@ void ocfs2_refresh_inode(struct inode *inode,
 	if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
 		inode->i_blocks = 0;
 	else
-		inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode));
+		inode->i_blocks = ocfs2_inode_sector_count(inode);
 	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
 	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
 	inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@ -26,6 +26,8 @@
 #ifndef OCFS2_INODE_H
 #define OCFS2_INODE_H

+#include "extent_map.h"
+
 /* OCFS2 Inode Private Data */
 struct ocfs2_inode_info
 {
@ -34,6 +36,7 @@ struct ocfs2_inode_info
 	struct ocfs2_lock_res		ip_rw_lockres;
 	struct ocfs2_lock_res		ip_meta_lockres;
 	struct ocfs2_lock_res		ip_data_lockres;
+	struct ocfs2_lock_res		ip_open_lockres;

 	/* protects allocation changes on this inode. */
 	struct rw_semaphore		ip_alloc_sem;
@ -42,9 +45,7 @@ struct ocfs2_inode_info
 	spinlock_t			ip_lock;
 	u32				ip_open_count;
 	u32				ip_clusters;
-	struct ocfs2_extent_map		ip_map;
 	struct list_head		ip_io_markers;
-	int				ip_orphaned_slot;

 	struct mutex			ip_io_mutex;

@ -64,6 +65,8 @@ struct ocfs2_inode_info

 	struct ocfs2_caching_info	ip_metadata_cache;

+	struct ocfs2_extent_map		ip_extent_map;
+
 	struct inode			vfs_inode;
 };

@ -117,14 +120,9 @@ void ocfs2_delete_inode(struct inode *inode);
 void ocfs2_drop_inode(struct inode *inode);

 /* Flags for ocfs2_iget() */
-#define OCFS2_FI_FLAG_NOWAIT	0x1
-#define OCFS2_FI_FLAG_DELETE	0x2
-#define OCFS2_FI_FLAG_SYSFILE	0x4
-#define OCFS2_FI_FLAG_NOLOCK	0x8
+#define OCFS2_FI_FLAG_SYSFILE		0x4
+#define OCFS2_FI_FLAG_ORPHAN_RECOVERY	0x8
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
-struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
-				     u64 blkno,
-				     int delete_vote);
 int ocfs2_inode_init_private(struct inode *inode);
 int ocfs2_inode_revalidate(struct dentry *dentry);
 int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
@ -144,4 +142,11 @@ int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);

 void ocfs2_set_inode_flags(struct inode *inode);

+/*
+ * Convert the inode's cluster count (ip_clusters) into a count of
+ * 512-byte sectors, suitable for inode->i_blocks.
+ */
+static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
+{
+	int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
+
+	/*
+	 * Widen to blkcnt_t before shifting: ip_clusters is a u32, and
+	 * shifting it first would do the arithmetic in 32 bits and drop
+	 * the high bits for large allocations.
+	 */
+	return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits;
+}
+
 #endif /* OCFS2_INODE_H */
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@ -649,29 +649,20 @@ bail:
 static int ocfs2_force_read_journal(struct inode *inode)
 {
 	int status = 0;
-	int i, p_blocks;
-	u64 v_blkno, p_blkno;
-#define CONCURRENT_JOURNAL_FILL 32
+	int i;
+	u64 v_blkno, p_blkno, p_blocks, num_blocks;
+#define CONCURRENT_JOURNAL_FILL 32ULL
 	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];

 	mlog_entry_void();

-	BUG_ON(inode->i_blocks !=
-		     ocfs2_align_bytes_to_sectors(i_size_read(inode)));
-
 	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);

-	mlog(0, "Force reading %llu blocks\n",
-		(unsigned long long)(inode->i_blocks >>
-			(inode->i_sb->s_blocksize_bits - 9)));
-
+	num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
 	v_blkno = 0;
-	while (v_blkno <
-	       (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
-
+	while (v_blkno < num_blocks) {
 		status = ocfs2_extent_map_get_blocks(inode, v_blkno,
-						     1, &p_blkno,
-						     &p_blocks);
+						     &p_blkno, &p_blocks, NULL);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@ -1306,7 +1297,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
 				continue;

 			iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
-					  OCFS2_FI_FLAG_NOLOCK);
+					  OCFS2_FI_FLAG_ORPHAN_RECOVERY);
 			if (IS_ERR(iter))
 				continue;

@ -1418,7 +1409,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 		/* Set the proper information to get us going into
 		 * ocfs2_delete_inode. */
 		oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
-		oi->ip_orphaned_slot = slot;
 		spin_unlock(&oi->ip_lock);

 		iput(inode);
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 	/* We may be deleting metadata blocks, so metadata alloc dinode +
 	   one desc. block for each possible delete. */
 	if (tree_depth && next_free == 1 &&
-	    le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
+	    ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
 		credits += 1 + tree_depth;

 	/* update to the truncate log. */
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@ -85,8 +85,11 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
 	int ret = 0, lock_level = 0;
 	struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb);

-	/* We don't want to support shared writable mappings yet. */
-	if (!ocfs2_mount_local(osb) &&
+	/*
+	 * Only support shared writeable mmap for local mounts which
+	 * don't know about holes.
+	 */
+	if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) &&
 	    ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) &&
 	    ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
 		mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@ -175,8 +175,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,

 	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
 	if (IS_ERR(inode)) {
-		mlog(ML_ERROR, "Unable to create inode %llu\n",
-		     (unsigned long long)blkno);
 		ret = ERR_PTR(-EACCES);
 		goto bail_unlock;
 	}
@ -189,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
 	 * unlink. */
 	spin_lock(&oi->ip_lock);
 	oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
-	oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
 	spin_unlock(&oi->ip_lock);

 bail_add:
@ -288,7 +285,7 @@ static int ocfs2_fill_new_dir(struct ocfs2_super *osb,

 	i_size_write(inode, inode->i_sb->s_blocksize);
 	inode->i_nlink = 2;
-	inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
 	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
 	if (status < 0) {
 		mlog_errno(status);
@ -1486,8 +1483,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
 	struct buffer_head **bhs = NULL;
 	const char *c;
 	struct super_block *sb = osb->sb;
-	u64 p_blkno;
-	int p_blocks;
+	u64 p_blkno, p_blocks;
 	int virtual, blocks, status, i, bytes_left;

 	bytes_left = i_size_read(inode) + 1;
@ -1514,8 +1510,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
 		goto bail;
 	}

-	status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno,
-					     &p_blocks);
+	status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks,
+					     NULL);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@ -1674,8 +1670,11 @@ static int ocfs2_symlink(struct inode *dir,
 	inode->i_rdev = 0;
 	newsize = l - 1;
 	if (l > ocfs2_fast_symlink_chars(sb)) {
+		u32 offset = 0;
+
 		inode->i_op = &ocfs2_symlink_inode_operations;
-		status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
+		status = ocfs2_do_extend_allocation(osb, inode, &offset, 1,
+						    new_fe_bh,
 						    handle, data_ac, NULL,
 						    NULL);
 		if (status < 0) {
@ -1689,7 +1688,7 @@ static int ocfs2_symlink(struct inode *dir,
 			goto bail;
 		}
 		i_size_write(inode, newsize);
-		inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
+		inode->i_blocks = ocfs2_inode_sector_count(inode);
 	} else {
 		inode->i_op = &ocfs2_fast_symlink_inode_operations;
 		memcpy((char *) fe->id2.i_symlink, symname, l);
@ -2222,9 +2221,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	/* Record which orphan dir our inode now resides
 	 * in. delete_inode will use this to determine which orphan
 	 * dir to lock. */
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);

 	mlog(0, "Inode %llu orphaned in slot %d\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@ -46,11 +46,6 @@
 #include "endian.h"
 #include "ocfs2_lockid.h"

-struct ocfs2_extent_map {
-	u32		em_clusters;
-	struct rb_root	em_extents;
-};
-
 /* Most user visible OCFS2 inodes will have very few pieces of
 * metadata, but larger files (including bitmaps, etc) must be taken
 * into account when designing an access scheme. We allow a small
@ -303,6 +298,13 @@ static inline int ocfs2_should_order_data(struct inode *inode)
 	return 1;
 }

+/*
+ * Returns 1 if OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC is set in the
+ * superblock's incompat feature mask, 0 otherwise.
+ */
+static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
+{
+	return (osb->s_feature_incompat &
+		OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) ? 1 : 0;
+}
+
 /* set / clear functions because cluster events can make these happen
 * in parallel so we want the transitions to be atomic. this also
 * means that any future flags osb_flags must be protected by spinlock
@ -461,6 +463,49 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
 	return (unsigned long)((bytes + 511) >> 9);
 }

+/* Scale a page index to clusters by the page/cluster shift difference. */
+static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
+							unsigned long pg_index)
+{
+	unsigned int bits = OCFS2_SB(sb)->s_clustersize_bits;
+
+	if (unlikely(PAGE_CACHE_SHIFT > bits))
+		return (u32)(pg_index << (PAGE_CACHE_SHIFT - bits));
+	if (PAGE_CACHE_SHIFT < bits)
+		return (u32)(pg_index >> (bits - PAGE_CACHE_SHIFT));
+
+	return (u32)pg_index;
+}
+
+/*
+ * Find the 1st page index which covers the given clusters. Shift the
+ * unsigned long copy, not the u32, so a left shift keeps its high bits.
+ */
+static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb,
+							u32 clusters)
+{
+	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+	unsigned long index = clusters;
+
+	if (PAGE_CACHE_SHIFT > cbits)
+		index >>= PAGE_CACHE_SHIFT - cbits;
+	else if (PAGE_CACHE_SHIFT < cbits)
+		index <<= cbits - PAGE_CACHE_SHIFT;
+
+	return index;
+}
+
+/* Number of page cache pages per cluster; 1 if a cluster fits in a page. */
+static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
+{
+	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+
+	if (PAGE_CACHE_SHIFT >= cbits)
+		return 1;
+
+	return 1 << (cbits - PAGE_CACHE_SHIFT);
+}
+
 #define ocfs2_set_bit ext2_set_bit
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@ -86,7 +86,8 @@
 	OCFS2_SB(sb)->s_feature_incompat &= ~(mask)

 #define OCFS2_FEATURE_COMPAT_SUPP	OCFS2_FEATURE_COMPAT_BACKUP_SB
-#define OCFS2_FEATURE_INCOMPAT_SUPP	OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT
+#define OCFS2_FEATURE_INCOMPAT_SUPP	(OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
+					 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	0

 /*
@ -154,6 +155,12 @@
 #define OCFS2_FL_VISIBLE	(0x000100FF)	/* User visible flags */
 #define OCFS2_FL_MODIFIABLE	(0x000100FF)	/* User modifiable flags */

+/*
+ * Extent record flags (ocfs2_extent_rec.e_flags)
+ */
+#define OCFS2_EXT_UNWRITTEN	(0x01)	/* Extent is allocated but
+					 * unwritten */
+
 /*
 * ioctl commands
 */
@ -282,10 +289,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
 /*
 * On disk extent record for OCFS2
 * It describes a range of clusters on disk.
+ *
+ * Length fields are divided into interior and leaf node versions.
+ * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes.
 */
 struct ocfs2_extent_rec {
 /*00*/	__le32 e_cpos;		/* Offset into the file, in clusters */
-	__le32 e_clusters;	/* Clusters covered by this extent */
+	union {
+		__le32 e_int_clusters; /* Clusters covered by all children */
+		struct {
+			__le16 e_leaf_clusters; /* Clusters covered by this
+						   extent */
+			__u8 e_reserved1;
+			__u8 e_flags; /* Extent flags */
+		};
+	};
 	__le64 e_blkno;		/* Physical disk offset, in blocks */
 /*10*/
 };
@ -311,7 +329,10 @@ struct ocfs2_extent_list {
 /*00*/	__le16 l_tree_depth;		/* Extent tree depth from this
 					   point.  0 means data extents
 					   hang directly off this
-					   header (a leaf) */
+					   header (a leaf)
+					   NOTE: The high 8 bits cannot be
+					   used - tree_depth is never that big.
+					*/
 	__le16 l_count;			/* Number of extent records */
 	__le16 l_next_free_rec;		/* Next unused extent slot */
 	__le16 l_reserved1;
@ -446,7 +467,9 @@ struct ocfs2_dinode {
 	__le32 i_ctime_nsec;
 	__le32 i_mtime_nsec;
 	__le32 i_attr;
-	__le32 i_reserved1;
+	__le16 i_orphaned_slot;		/* Only valid when OCFS2_ORPHANED_FL
+					   was set in i_flags */
+	__le16 i_reserved1;
 /*70*/	__le64 i_reserved2[8];
 /*B8*/	union {
 		__le64 i_pad1;		/* Generic way to refer to this
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@ -44,6 +44,7 @@ enum ocfs2_lock_type {
 	OCFS2_LOCK_TYPE_RENAME,
 	OCFS2_LOCK_TYPE_RW,
 	OCFS2_LOCK_TYPE_DENTRY,
+	OCFS2_LOCK_TYPE_OPEN,
 	OCFS2_NUM_LOCK_TYPES
 };

@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
 		case OCFS2_LOCK_TYPE_DENTRY:
 			c = 'N';
 			break;
+		case OCFS2_LOCK_TYPE_OPEN:
+			c = 'O';
+			break;
 		default:
 			c = '\0';
 	}
@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = {
 	 * important job it does, anyway. */
 	[OCFS2_LOCK_TYPE_RW] = "Write/Read",
 	[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
+	[OCFS2_LOCK_TYPE_OPEN] = "Open",
 };

 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
 		goto bail;
 	}

-	status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
+	status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@ -381,8 +381,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 					     le32_to_cpu(fe->i_clusters)));
 	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
 	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
-	alloc_inode->i_blocks =
-		ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
+	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);

 	status = 0;
 bail:
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@ -806,9 +806,6 @@ static int __init ocfs2_init(void)

 	ocfs2_print_version();

-	if (init_ocfs2_extent_maps())
-		return -ENOMEM;
-
 	status = init_ocfs2_uptodate_cache();
 	if (status < 0) {
 		mlog_errno(status);
@ -837,7 +834,6 @@ leave:
 	if (status < 0) {
 		ocfs2_free_mem_caches();
 		exit_ocfs2_uptodate_cache();
-		exit_ocfs2_extent_maps();
 	}

 	mlog_exit(status);
@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void)

 	unregister_filesystem(&ocfs2_fs_type);

-	exit_ocfs2_extent_maps();
-
 	exit_ocfs2_uptodate_cache();

 	mlog_exit_void();
@ -963,6 +957,7 @@ static void ocfs2_inode_init_once(void *data,
 		ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
 		ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
 		ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+		ocfs2_lock_res_init_once(&oi->ip_open_lockres);

 		ocfs2_metadata_cache_init(&oi->vfs_inode);

--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@ -63,17 +63,10 @@ struct ocfs2_msg_hdr
 	__be32 h_node_num;    /* node sending this particular message. */
 };

-/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
- * for the network. */
-#define OCFS2_VOTE_FILENAME_LEN 256
 struct ocfs2_vote_msg
 {
 	struct ocfs2_msg_hdr v_hdr;
-	union {
-		__be32 v_generic1;
-		__be32 v_orphaned_slot;	/* Used during delete votes */
-		__be32 v_nlink;		/* Used during unlink votes */
-	} md1;				/* Message type dependant 1 */
+	__be32 v_reserved1;
 };

 /* Responses are given these values to maintain backwards
@ -86,7 +79,6 @@ struct ocfs2_response_msg
 {
 	struct ocfs2_msg_hdr r_hdr;
 	__be32 r_response;
-	__be32 r_orphaned_slot;
 };

 struct ocfs2_vote_work {
@ -96,7 +88,6 @@ struct ocfs2_vote_work {

 enum ocfs2_vote_request {
 	OCFS2_VOTE_REQ_INVALID = 0,
-	OCFS2_VOTE_REQ_DELETE,
 	OCFS2_VOTE_REQ_MOUNT,
 	OCFS2_VOTE_REQ_UMOUNT,
 	OCFS2_VOTE_REQ_LAST
@ -151,135 +142,23 @@ static void ocfs2_process_umount_request(struct ocfs2_super *osb,
 	ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
 }

-void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
-{
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-
-	assert_spin_locked(&oi->ip_lock);
-	/* We set the SKIP_DELETE flag on the inode so we don't try to
-	 * delete it in delete_inode ourselves, thus avoiding
-	 * unecessary lock pinging. If the other node failed to wipe
-	 * the inode as a result of a crash, then recovery will pick
-	 * up the slack. */
-	oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE;
-}
-
-static int ocfs2_process_delete_request(struct inode *inode,
-					int *orphaned_slot)
-{
-	int response = OCFS2_RESPONSE_BUSY;
-
-	mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
-	     inode->i_ino, inode->i_nlink, *orphaned_slot);
-
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-
-	/* Whatever our vote response is, we want to make sure that
-	 * the orphaned slot is recorded properly on this node *and*
-	 * on the requesting node. Technically, if the requesting node
-	 * did not know which slot the inode is orphaned in but we
-	 * respond with BUSY he doesn't actually need the orphaned
-	 * slot, but it doesn't hurt to do it here anyway. */
-	if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
-		mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
-				OCFS2_INVALID_SLOT &&
-				OCFS2_I(inode)->ip_orphaned_slot !=
-				(*orphaned_slot),
-				"Inode %llu: This node thinks it's "
-				"orphaned in slot %d, messaged it's in %d\n",
-				(unsigned long long)OCFS2_I(inode)->ip_blkno,
-				OCFS2_I(inode)->ip_orphaned_slot,
-				*orphaned_slot);
-
-		mlog(0, "Setting orphaned slot for inode %llu to %d\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-		     *orphaned_slot);
-
-		OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
-	} else {
-		mlog(0, "Sending back orphaned slot %d for inode %llu\n",
-		     OCFS2_I(inode)->ip_orphaned_slot,
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
-
-		*orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
-	}
-
-	/* vote no if the file is still open. */
-	if (OCFS2_I(inode)->ip_open_count) {
-		mlog(0, "open count = %u\n",
-		     OCFS2_I(inode)->ip_open_count);
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-		goto done;
-	}
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-	/* directories are a bit ugly... What if someone is sitting in
-	 * it? We want to make sure the inode is removed completely as
-	 * a result of the iput in process_vote. */
-	if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
-		mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
-		goto done;
-	}
-
-	if (filemap_fdatawrite(inode->i_mapping)) {
-		mlog(ML_ERROR, "Could not sync inode %llu for delete!\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
-		goto done;
-	}
-	sync_mapping_buffers(inode->i_mapping);
-	truncate_inode_pages(inode->i_mapping, 0);
-	ocfs2_extent_map_trunc(inode, 0);
-
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	/* double check open count - someone might have raced this
-	 * thread into ocfs2_file_open while we were writing out
-	 * data. If we're to allow a wipe of this inode now, we *must*
-	 * hold the spinlock until we've marked it. */
-	if (OCFS2_I(inode)->ip_open_count) {
-		mlog(0, "Raced to wipe! open count = %u\n",
-		     OCFS2_I(inode)->ip_open_count);
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-		goto done;
-	}
-
-	/* Mark the inode as being wiped from disk. */
-	ocfs2_mark_inode_remotely_deleted(inode);
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-	/* Not sure this is necessary anymore. */
-	d_prune_aliases(inode);
-
-	/* If we get here, then we're voting 'yes', so commit the
-	 * delete on our side. */
-	response = OCFS2_RESPONSE_OK;
-done:
-	return response;
-}
-
 static void ocfs2_process_vote(struct ocfs2_super *osb,
 			       struct ocfs2_vote_msg *msg)
 {
 	int net_status, vote_response;
-	int orphaned_slot = 0;
-	unsigned int node_num, generation;
+	unsigned int node_num;
 	u64 blkno;
 	enum ocfs2_vote_request request;
-	struct inode *inode = NULL;
 	struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
 	struct ocfs2_response_msg response;

 	/* decode the network mumbo jumbo into local variables. */
 	request = be32_to_cpu(hdr->h_request);
 	blkno = be64_to_cpu(hdr->h_blkno);
-	generation = be32_to_cpu(hdr->h_generation);
 	node_num = be32_to_cpu(hdr->h_node_num);
-	if (request == OCFS2_VOTE_REQ_DELETE)
-		orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);

-	mlog(0, "processing vote: request = %u, blkno = %llu, "
-	     "generation = %u, node_num = %u, priv1 = %u\n", request,
-	     (unsigned long long)blkno, generation, node_num,
-	     be32_to_cpu(msg->md1.v_generic1));
+	mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
+	     request, (unsigned long long)blkno, node_num);

 	if (!ocfs2_is_valid_vote_request(request)) {
 		mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
@ -302,52 +181,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb,
 		break;
 	}

-	/* We cannot process the remaining message types before we're
-	 * fully mounted. It's perfectly safe however to send a 'yes'
-	 * response as we can't possibly have any of the state they're
-	 * asking us to modify yet. */
-	if (atomic_read(&osb->vol_state) == VOLUME_INIT)
-		goto respond;
-
-	/* If we get here, then the request is against an inode. */
-	inode = ocfs2_ilookup_for_vote(osb, blkno,
-				       request == OCFS2_VOTE_REQ_DELETE);
-
-	/* Not finding the inode is perfectly valid - it means we're
-	 * not interested in what the other node is about to do to it
-	 * so in those cases we automatically respond with an
-	 * affirmative. Cluster locking ensures that we won't race
-	 * interest in the inode with this vote request. */
-	if (!inode)
-		goto respond;
-
-	/* Check generation values. It's possible for us to get a
-	 * request against a stale inode. If so then we proceed as if
-	 * we had not found an inode in the first place. */
-	if (inode->i_generation != generation) {
-		mlog(0, "generation passed %u != inode generation = %u, "
-		     "ip_flags = %x, ip_blkno = %llu, msg %llu, i_count = %u, "
-		     "message type = %u\n", generation, inode->i_generation,
-		     OCFS2_I(inode)->ip_flags,
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-		     (unsigned long long)blkno, atomic_read(&inode->i_count),
-		     request);
-		iput(inode);
-		inode = NULL;
-		goto respond;
-	}
-
-	switch (request) {
-	case OCFS2_VOTE_REQ_DELETE:
-		vote_response = ocfs2_process_delete_request(inode,
-							     &orphaned_slot);
-		break;
-	default:
-		mlog(ML_ERROR, "node %u, invalid request: %u\n",
-		     node_num, request);
-		vote_response = OCFS2_RESPONSE_BAD_MSG;
-	}
-
 respond:
 	/* Response struture is small so we just put it on the stack
 	 * and stuff it inline. */
@ -357,7 +190,6 @@ respond:
 	response.r_hdr.h_generation = hdr->h_generation;
 	response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
 	response.r_response = cpu_to_be32(vote_response);
-	response.r_orphaned_slot = cpu_to_be32(orphaned_slot);

 	net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
 					osb->net_key,
@ -373,9 +205,6 @@ respond:
 	    && net_status != -ENOTCONN)
 		mlog(ML_ERROR, "message to node %u fails with error %d!\n",
 		     node_num, net_status);
-
-	if (inode)
-		iput(inode);
 }

 static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
@ -634,8 +463,7 @@ bail:
 static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
 						      u64 blkno,
 						      unsigned int generation,
-						      enum ocfs2_vote_request type,
-						      u32 priv)
+						      enum ocfs2_vote_request type)
 {
 	struct ocfs2_vote_msg *request;
 	struct ocfs2_msg_hdr *hdr;
@ -651,8 +479,6 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
 		hdr->h_request = cpu_to_be32(type);
 		hdr->h_blkno = cpu_to_be64(blkno);
 		hdr->h_generation = cpu_to_be32(generation);
-
-		request->md1.v_generic1 = cpu_to_be32(priv);
 	}

 	return request;
@ -664,7 +490,7 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb,
 				 struct ocfs2_vote_msg *request,
 				 struct ocfs2_net_response_cb *callback)
 {
-	int status, response;
+	int status, response = -EBUSY;
 	unsigned int response_id;
 	struct ocfs2_msg_hdr *hdr;

@ -686,109 +512,12 @@ bail:
 	return status;
 }

-static int ocfs2_request_vote(struct inode *inode,
-			      struct ocfs2_vote_msg *request,
-			      struct ocfs2_net_response_cb *callback)
-{
-	int status;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
-	if (ocfs2_inode_is_new(inode))
-		return 0;
-
-	status = -EAGAIN;
-	while (status == -EAGAIN) {
-		if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
-		    signal_pending(current))
-			return -ERESTARTSYS;
-
-		status = ocfs2_super_lock(osb, 0);
-		if (status < 0) {
-			mlog_errno(status);
-			break;
-		}
-
-		status = 0;
-		if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
-					   osb->node_num))
-			status = ocfs2_do_request_vote(osb, request, callback);
-
-		ocfs2_super_unlock(osb, 0);
-	}
-	return status;
-}
-
-static void ocfs2_delete_response_cb(void *priv,
-				     struct ocfs2_response_msg *resp)
-{
-	int orphaned_slot, node;
-	struct inode *inode = priv;
-
-	orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
-	node = be32_to_cpu(resp->r_hdr.h_node_num);
-	mlog(0, "node %d tells us that inode %llu is orphaned in slot %d\n",
-	     node, (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     orphaned_slot);
-
-	/* The other node may not actually know which slot the inode
-	 * is orphaned in. */
-	if (orphaned_slot == OCFS2_INVALID_SLOT)
-		return;
-
-	/* Ok, the responding node knows which slot this inode is
-	 * orphaned in. We verify that the information is correct and
-	 * then record this in the inode. ocfs2_delete_inode will use
-	 * this information to determine which lock to take. */
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
-			OCFS2_I(inode)->ip_orphaned_slot
-			!= OCFS2_INVALID_SLOT, "Inode %llu: Node %d says it's "
-			"orphaned in slot %d, we think it's in %d\n",
-			(unsigned long long)OCFS2_I(inode)->ip_blkno,
-			be32_to_cpu(resp->r_hdr.h_node_num),
-			orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);
-
-	OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-}
-
-int ocfs2_request_delete_vote(struct inode *inode)
-{
-	int orphaned_slot, status;
-	struct ocfs2_net_response_cb delete_cb;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_vote_msg *request;
-
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-	delete_cb.rc_cb = ocfs2_delete_response_cb;
-	delete_cb.rc_priv = inode;
-
-	mlog(0, "Inode %llu, we start thinking orphaned slot is %d\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno, orphaned_slot);
-
-	status = -ENOMEM;
-	request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
-					 inode->i_generation,
-					 OCFS2_VOTE_REQ_DELETE, orphaned_slot);
-	if (request) {
-		status = ocfs2_request_vote(inode, request, &delete_cb);
-
-		kfree(request);
-	}
-
-	return status;
-}
-
 int ocfs2_request_mount_vote(struct ocfs2_super *osb)
 {
 	int status;
 	struct ocfs2_vote_msg *request = NULL;

-	request = ocfs2_new_vote_request(osb, 0ULL, 0,
-					 OCFS2_VOTE_REQ_MOUNT, 0);
+	request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
 	if (!request) {
 		status = -ENOMEM;
 		goto bail;
@ -821,8 +550,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb)
 	int status;
 	struct ocfs2_vote_msg *request = NULL;

-	request = ocfs2_new_vote_request(osb, 0ULL, 0,
-					 OCFS2_VOTE_REQ_UMOUNT, 0);
+	request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
 	if (!request) {
 		status = -ENOMEM;
 		goto bail;
@ -969,7 +697,6 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg,
 	     be32_to_cpu(work->w_msg.v_hdr.h_generation));
 	mlog(0, "h_node_num = %u\n",
 	     be32_to_cpu(work->w_msg.v_hdr.h_node_num));
-	mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));

 	spin_lock(&osb->vote_task_lock);
 	list_add_tail(&work->w_list, &osb->vote_list);
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/vote.h
@ -38,14 +38,11 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
 	wake_up(&osb->vote_event);
 }

-int ocfs2_request_delete_vote(struct inode *inode);
 int ocfs2_request_mount_vote(struct ocfs2_super *osb);
 int ocfs2_request_umount_vote(struct ocfs2_super *osb);
 int ocfs2_register_net_handlers(struct ocfs2_super *osb);
 void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);

-void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
-
 void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
 					int node_num);
 #endif
--- a/fs/sync.c
+++ b/fs/sync.c
@ -239,13 +239,11 @@ out:
 /*
 * `endbyte' is inclusive
 */
-int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
-			unsigned int flags)
+int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
+			  loff_t endbyte, unsigned int flags)
 {
 	int ret;
-	struct address_space *mapping;

-	mapping = file->f_mapping;
 	if (!mapping) {
 		ret = -EINVAL;
 		goto out;
@ -275,4 +273,4 @@ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
 out:
 	return ret;
 }
-EXPORT_SYMBOL_GPL(do_sync_file_range);
+EXPORT_SYMBOL_GPL(do_sync_mapping_range);
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@ -843,8 +843,13 @@ extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
 extern int fcntl_getlease(struct file *filp);

 /* fs/sync.c */
-extern int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
-			unsigned int flags);
+extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
+			loff_t endbyte, unsigned int flags);
+/* Sync a byte range of @file by delegating to the file's f_mapping. */
+static inline int do_sync_file_range(struct file *file, loff_t offset,
+			loff_t endbyte, unsigned int flags)
+{
+	struct address_space *mapping = file->f_mapping;
+
+	return do_sync_mapping_range(mapping, offset, endbyte, flags);
+}

 /* fs/locks.c */
 extern void locks_init_lock(struct file_lock *);