Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2

* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: (27 commits)
  ocfs2: Cache extent records
  ocfs2: Remember rw lock level during direct io
  ocfs2: Fix up i_blocks calculation to know about holes
  ocfs2: Fix extent lookup to return true size of holes
  ocfs2: Read from an unwritten extent returns zeros
  ocfs2: make room for unwritten extents flag
  ocfs2: Use own splice write actor
  ocfs2: Use do_sync_mapping_range() in ocfs2_zero_tail_for_truncate()
  [PATCH] Turn do_sync_file_range() into do_sync_mapping_range()
  ocfs2: zero tail of sparse files on truncate
  ocfs2: Teach ocfs2_get_block() about holes
  ocfs2: remove ocfs2_prepare_write() and ocfs2_commit_write()
  ocfs2: teach ocfs2_file_aio_write() about sparse files
  ocfs2: Turn off shared writeable mmap for local files systems with holes.
  ocfs2: abstract out allocation locking
  ocfs2: teach extend/truncate about sparse files
  ocfs2: temporarily remove extent map caching
  ocfs2: sparse b-tree support
  ocfs2: small cleanup of ocfs2_request_delete()
  ocfs2: remove unused code
  ...
This commit is contained in:
Linus Torvalds 2007-04-27 10:29:56 -07:00
commit ea6db58f3e
31 changed files with 4786 additions and 2326 deletions

File diff suppressed because it is too large Load diff

View file

@ -31,7 +31,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
struct buffer_head *fe_bh,
u64 blkno,
u32 cpos,
u64 start_blk,
u32 new_clusters,
struct ocfs2_alloc_context *meta_ac);
int ocfs2_num_free_extents(struct ocfs2_super *osb,
@ -70,6 +71,8 @@ struct ocfs2_truncate_context {
struct buffer_head *tc_last_eb_bh;
};
int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
u64 new_i_size);
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
@ -79,4 +82,26 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct buffer_head *fe_bh,
struct ocfs2_truncate_context *tc);
int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
u32 cpos, struct buffer_head **leaf_bh);
/*
 * Return the number of clusters recorded in an extent record.
 *
 * Interior and leaf nodes store the count differently: leaf records
 * need room for a flags field (to support unwritten extents), which
 * leaves less space for the cluster count. The depth of the list the
 * record belongs to therefore decides which field is read.
 *
 * @el:  extent list that contains @rec; a zero l_tree_depth selects
 *       the leaf layout
 * @rec: the record to inspect
 */
static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
					      struct ocfs2_extent_rec *rec)
{
	/* Depth zero: use the narrower 16 bit leaf count. */
	if (!el->l_tree_depth)
		return le16_to_cpu(rec->e_leaf_clusters);

	/* Otherwise use the 32 bit interior node count. */
	return le32_to_cpu(rec->e_int_clusters);
}
#endif /* OCFS2_ALLOC_H */

File diff suppressed because it is too large Load diff

View file

@ -30,12 +30,83 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
unsigned from,
unsigned to);
int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
struct inode *inode, unsigned int from,
unsigned int to, int new);
int walk_page_buffers( handle_t *handle,
struct buffer_head *head,
unsigned from,
unsigned to,
int *partial,
int (*fn)( handle_t *handle,
struct buffer_head *bh));
struct ocfs2_write_ctxt;
typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
u64 *, unsigned int *, unsigned int *);
ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
size_t count, ocfs2_page_writer *actor,
void *priv);
struct ocfs2_write_ctxt {
size_t w_count;
loff_t w_pos;
u32 w_cpos;
unsigned int w_finished_copy;
/* This is true if page_size > cluster_size */
unsigned int w_large_pages;
/* Filler callback and private data */
ocfs2_page_writer *w_write_data_page;
void *w_private;
/* Only valid for the filler callback */
struct page *w_this_page;
unsigned int w_this_page_new;
};
struct ocfs2_buffered_write_priv {
char *b_src_buf;
const struct iovec *b_cur_iov; /* Current iovec */
size_t b_cur_off; /* Offset in the
* current iovec */
};
int ocfs2_map_and_write_user_data(struct inode *inode,
struct ocfs2_write_ctxt *wc,
u64 *p_blkno,
unsigned int *ret_from,
unsigned int *ret_to);
struct ocfs2_splice_write_priv {
struct splice_desc *s_sd;
struct pipe_buffer *s_buf;
struct pipe_inode_info *s_pipe;
/* Neither offset value is ever larger than one page */
unsigned int s_offset;
unsigned int s_buf_offset;
};
int ocfs2_map_and_write_splice_data(struct inode *inode,
struct ocfs2_write_ctxt *wc,
u64 *p_blkno,
unsigned int *ret_from,
unsigned int *ret_to);
/* all ocfs2_dio_end_io()'s fault */
#define ocfs2_iocb_is_rw_locked(iocb) \
test_bit(0, (unsigned long *)&iocb->private)
#define ocfs2_iocb_set_rw_locked(iocb) \
set_bit(0, (unsigned long *)&iocb->private)
/*
 * Record in iocb->private that the rw lock is held and at which level.
 *
 * Bit 0 is set to mark the iocb as rw locked; bit 1 mirrors @level
 * (set when @level is non-zero, cleared otherwise).
 */
static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
{
	unsigned long *bits = (unsigned long *)&iocb->private;

	set_bit(0, bits);

	if (level)
		set_bit(1, bits);
	else
		clear_bit(1, bits);
}
#define ocfs2_iocb_clear_rw_locked(iocb) \
clear_bit(0, (unsigned long *)&iocb->private)
#define ocfs2_iocb_rw_locked_level(iocb) \
test_bit(1, (unsigned long *)&iocb->private)
#endif /* OCFS2_FILE_H */

View file

@ -46,6 +46,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/reboot.h>
#include "heartbeat.h"
#include "nodemanager.h"
@ -72,7 +73,9 @@ static void o2quo_fence_self(void)
/* panic spins with interrupts enabled. with preempt
* threads can still schedule, etc, etc */
o2hb_stop_all_regions();
panic("ocfs2 is very sorry to be fencing this system by panicing\n");
printk("ocfs2 is very sorry to be fencing this system by restarting\n");
emergency_restart();
}
/* Indicate that a timeout occurred on a heartbeat region write. The

View file

@ -38,6 +38,9 @@
* locking semantics of the file system using the protocol. It should
* be somewhere else, I'm sure, but right now it isn't.
*
* New in version 8:
* - Replace delete inode votes with a cluster lock
*
* New in version 7:
* - DLM join domain includes the live nodemap
*
@ -57,7 +60,7 @@
* - full 64 bit i_size in the metadata lock lvbs
* - introduction of "rw" lock and pushing meta/data locking down
*/
#define O2NET_PROTOCOL_VERSION 7ULL
#define O2NET_PROTOCOL_VERSION 8ULL
struct o2net_handshake {
__be64 protocol_version;
__be64 connector_id;

View file

@ -358,15 +358,17 @@ int ocfs2_do_extend_dir(struct super_block *sb,
{
int status;
int extend;
u64 p_blkno;
u64 p_blkno, v_blkno;
spin_lock(&OCFS2_I(dir)->ip_lock);
extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
spin_unlock(&OCFS2_I(dir)->ip_lock);
if (extend) {
status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
parent_fe_bh, handle,
u32 offset = OCFS2_I(dir)->ip_clusters;
status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
1, parent_fe_bh, handle,
data_ac, meta_ac, NULL);
BUG_ON(status == -EAGAIN);
if (status < 0) {
@ -375,9 +377,8 @@ int ocfs2_do_extend_dir(struct super_block *sb,
}
}
status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
(sb->s_blocksize_bits - 9)),
1, &p_blkno, NULL);
v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -486,7 +487,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
dir_i_size += dir->i_sb->s_blocksize;
i_size_write(dir, dir_i_size);
dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
dir->i_blocks = ocfs2_inode_sector_count(dir);
status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
if (status < 0) {
mlog_errno(status);

View file

@ -430,11 +430,10 @@ redo_bucket:
dlm_lockres_put(res);
cond_resched_lock(&dlm->spinlock);
if (dropped)
goto redo_bucket;
}
cond_resched_lock(&dlm->spinlock);
num += n;
mlog(0, "%s: touched %d lockreses in bucket %d "
"(tot=%d)\n", dlm->name, n, i, num);
@ -1035,7 +1034,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
{
int status = 0, tmpstat, node;
struct domain_join_ctxt *ctxt;
enum dlm_query_join_response response;
enum dlm_query_join_response response = JOIN_DISALLOW;
mlog_entry("%p", dlm);

View file

@ -611,6 +611,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
}
} while (status != 0);
spin_lock(&dlm_reco_state_lock);
switch (ndata->state) {
case DLM_RECO_NODE_DATA_INIT:
case DLM_RECO_NODE_DATA_FINALIZE_SENT:
@ -641,6 +642,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
ndata->node_num, dead_node);
break;
}
spin_unlock(&dlm_reco_state_lock);
}
mlog(0, "done requesting all lock info\n");

View file

@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.flags = 0,
};
/*
 * Lock resource operations for OCFS2_LOCK_TYPE_OPEN lock resources.
 * Only get_osb is provided and no flags are set.
 */
static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
.get_osb = ocfs2_get_inode_osb,
.flags = 0,
};
static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{
return lockres->l_type == OCFS2_LOCK_TYPE_META ||
lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
lockres->l_type == OCFS2_LOCK_TYPE_RW;
lockres->l_type == OCFS2_LOCK_TYPE_RW ||
lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
}
static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
case OCFS2_LOCK_TYPE_DATA:
ops = &ocfs2_inode_data_lops;
break;
case OCFS2_LOCK_TYPE_OPEN:
ops = &ocfs2_inode_open_lops;
break;
default:
mlog_bug_on_msg(1, "type: %d\n", type);
ops = NULL; /* thanks, gcc */
@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
goto bail;
}
ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
if (ret) {
mlog_errno(ret);
goto bail;
}
bail:
mlog_exit(ret);
return ret;
@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
mlog_exit_void();
}
/*
 * ocfs2_open_lock always takes the open lock in PR mode.
 *
 * On a locally mounted file system (ocfs2_mount_local()) no cluster
 * lock is taken and 0 is returned. Otherwise the inode's
 * ip_open_lockres is requested at LKM_PRMODE and the status of
 * ocfs2_cluster_lock() is returned; negative values are logged.
 */
int ocfs2_open_lock(struct inode *inode)
{
	int status = 0;
	struct ocfs2_lock_res *lockres;
	struct ocfs2_super *osb;

	/*
	 * Must come before the first dereference of inode: checking
	 * after reading inode->i_sb would make the assertion useless.
	 */
	BUG_ON(!inode);

	osb = OCFS2_SB(inode->i_sb);

	mlog_entry_void();

	mlog(0, "inode %llu take PRMODE open lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);

	if (ocfs2_mount_local(osb))
		goto out;

	lockres = &OCFS2_I(inode)->ip_open_lockres;

	status = ocfs2_cluster_lock(osb, lockres, LKM_PRMODE, 0, 0);
	if (status < 0)
		mlog_errno(status);

out:
	mlog_exit(status);
	return status;
}
/*
 * Try to take the inode's open lock without queueing behind other
 * nodes.
 *
 * @inode: inode whose ip_open_lockres is requested
 * @write: non-zero requests LKM_EXMODE, zero requests LKM_PRMODE
 *
 * On a locally mounted file system nothing is locked and 0 is
 * returned. Otherwise the status of ocfs2_cluster_lock() is returned
 * unmodified and unlogged; -EAGAIN tells the caller that the inode is
 * still in use.
 */
int ocfs2_try_open_lock(struct inode *inode, int write)
{
	int status = 0, level;
	struct ocfs2_lock_res *lockres;
	struct ocfs2_super *osb;

	/*
	 * Must come before the first dereference of inode: checking
	 * after reading inode->i_sb would make the assertion useless.
	 */
	BUG_ON(!inode);

	osb = OCFS2_SB(inode->i_sb);

	mlog_entry_void();

	mlog(0, "inode %llu try to take %s open lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	if (ocfs2_mount_local(osb))
		goto out;

	lockres = &OCFS2_I(inode)->ip_open_lockres;

	level = write ? LKM_EXMODE : LKM_PRMODE;

	/*
	 * The file system may already be holding a PRMODE/EXMODE open
	 * lock. Since we pass LKM_NOQUEUE, the request won't block
	 * waiting on other nodes and the -EAGAIN will indicate to the
	 * caller that this inode is still in use.
	 */
	status = ocfs2_cluster_lock(osb, lockres, level, LKM_NOQUEUE, 0);

out:
	mlog_exit(status);
	return status;
}
/*
 * ocfs2_open_unlock releases the open lock at every level it is
 * currently held: PR when there are read-only holders, EX when there
 * are exclusive holders. Nothing is released on a locally mounted
 * file system.
 */
void ocfs2_open_unlock(struct inode *inode)
{
	struct ocfs2_lock_res *open_res = &OCFS2_I(inode)->ip_open_lockres;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry_void();

	mlog(0, "inode %llu drop open lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);

	if (!ocfs2_mount_local(osb)) {
		if (open_res->l_ro_holders)
			ocfs2_cluster_unlock(osb, open_res, LKM_PRMODE);

		if (open_res->l_ex_holders)
			ocfs2_cluster_unlock(osb, open_res, LKM_EXMODE);
	}

	mlog_exit_void();
}
int ocfs2_data_lock_full(struct inode *inode,
int write,
int arg_flags)
@ -1387,8 +1495,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
inode->i_blocks = 0;
else
inode->i_blocks =
ocfs2_align_bytes_to_sectors(i_size_read(inode));
inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
inode->i_gid = be32_to_cpu(lvb->lvb_igid);
@ -1479,12 +1586,15 @@ static int ocfs2_meta_lock_update(struct inode *inode,
{
int status = 0;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = NULL;
struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
struct ocfs2_dinode *fe;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
mlog_entry_void();
if (ocfs2_mount_local(osb))
goto bail;
spin_lock(&oi->ip_lock);
if (oi->ip_flags & OCFS2_INODE_DELETED) {
mlog(0, "Orphaned inode %llu was deleted while we "
@ -1496,22 +1606,16 @@ static int ocfs2_meta_lock_update(struct inode *inode,
}
spin_unlock(&oi->ip_lock);
if (!ocfs2_mount_local(osb)) {
lockres = &oi->ip_meta_lockres;
if (!ocfs2_should_refresh_lock_res(lockres))
goto bail;
}
if (!ocfs2_should_refresh_lock_res(lockres))
goto bail;
/* This will discard any caching information we might have had
* for the inode metadata. */
ocfs2_metadata_cache_purge(inode);
/* will do nothing for inode types that don't use the extent
* map (directories, bitmap files, etc) */
ocfs2_extent_map_trunc(inode, 0);
if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) {
if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
mlog(0, "Trusting LVB on inode %llu\n",
(unsigned long long)oi->ip_blkno);
ocfs2_refresh_inode_from_lvb(inode);
@ -1558,8 +1662,7 @@ static int ocfs2_meta_lock_update(struct inode *inode,
status = 0;
bail_refresh:
if (lockres)
ocfs2_complete_lock_res_refresh(lockres, status);
ocfs2_complete_lock_res_refresh(lockres, status);
bail:
mlog_exit(status);
return status;
@ -1630,7 +1733,6 @@ int ocfs2_meta_lock_full(struct inode *inode,
wait_event(osb->recovery_event,
ocfs2_node_map_is_empty(osb, &osb->recovery_map));
acquired = 0;
lockres = &OCFS2_I(inode)->ip_meta_lockres;
level = ex ? LKM_EXMODE : LKM_PRMODE;
dlm_flags = 0;
@ -2458,12 +2560,19 @@ int ocfs2_drop_inode_locks(struct inode *inode)
* ocfs2_clear_inode has done it for us. */
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
&OCFS2_I(inode)->ip_data_lockres);
&OCFS2_I(inode)->ip_open_lockres);
if (err < 0)
mlog_errno(err);
status = err;
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
&OCFS2_I(inode)->ip_data_lockres);
if (err < 0)
mlog_errno(err);
if (err < 0 && !status)
status = err;
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
&OCFS2_I(inode)->ip_meta_lockres);
if (err < 0)

View file

@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode,
int write);
int ocfs2_rw_lock(struct inode *inode, int write);
void ocfs2_rw_unlock(struct inode *inode, int write);
int ocfs2_open_lock(struct inode *inode);
int ocfs2_try_open_lock(struct inode *inode, int write);
void ocfs2_open_unlock(struct inode *inode);
int ocfs2_meta_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
int *level);

File diff suppressed because it is too large Load diff

View file

@ -25,22 +25,29 @@
#ifndef _EXTENT_MAP_H
#define _EXTENT_MAP_H
int init_ocfs2_extent_maps(void);
void exit_ocfs2_extent_maps(void);
struct ocfs2_extent_map_item {
unsigned int ei_cpos;
unsigned int ei_phys;
unsigned int ei_clusters;
unsigned int ei_flags;
/*
* EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
* to be held. The allocation cannot change at all while the map is
* in the process of being updated.
*/
int ocfs2_extent_map_init(struct inode *inode);
int ocfs2_extent_map_append(struct inode *inode,
struct ocfs2_extent_rec *rec,
u32 new_clusters);
int ocfs2_extent_map_get_blocks(struct inode *inode,
u64 v_blkno, int count,
u64 *p_blkno, int *ret_count);
int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
struct list_head ei_list;
};
#define OCFS2_MAX_EXTENT_MAP_ITEMS 3
struct ocfs2_extent_map {
unsigned int em_num_items;
struct list_head em_list;
};
void ocfs2_extent_map_init(struct inode *inode);
void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster);
void ocfs2_extent_map_insert_rec(struct inode *inode,
struct ocfs2_extent_rec *rec);
int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
u32 *num_clusters, unsigned int *extent_flags);
int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
u64 *ret_count, unsigned int *extent_flags);
#endif /* _EXTENT_MAP_H */

View file

@ -33,6 +33,7 @@
#include <linux/sched.h>
#include <linux/pipe_fs_i.h>
#include <linux/mount.h>
#include <linux/writeback.h>
#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
@ -215,7 +216,7 @@ int ocfs2_set_inode_size(handle_t *handle,
mlog_entry_void();
i_size_write(inode, new_i_size);
inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
{
int status;
handle_t *handle;
struct ocfs2_dinode *di;
mlog_entry_void();
@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
goto out;
}
status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
status = ocfs2_journal_access(handle, inode, fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_commit;
}
/*
* Do this before setting i_size.
*/
status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
if (status) {
mlog_errno(status);
goto out_commit;
}
i_size_write(inode, new_i_size);
inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
di = (struct ocfs2_dinode *) fe_bh->b_data;
di->i_size = cpu_to_le64(new_i_size);
di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
status = ocfs2_journal_dirty(handle, fe_bh);
if (status < 0)
mlog_errno(status);
out_commit:
ocfs2_commit_trans(osb, handle);
out:
mlog_exit(status);
return status;
}
@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
mlog_errno(status);
goto bail;
}
ocfs2_data_unlock(inode, 1);
if (le32_to_cpu(fe->i_clusters) ==
ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
fe->i_clusters);
/* No allocation change is required, so lets fast path
* this truncate. */
status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
if (status < 0)
mlog_errno(status);
goto bail;
}
/* alright, we're going to need to do a full blown alloc size
* change. Orphan the inode so that recovery can complete the
@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
if (status < 0) {
mlog_errno(status);
goto bail;
goto bail_unlock_data;
}
status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
if (status < 0) {
mlog_errno(status);
goto bail;
goto bail_unlock_data;
}
status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
if (status < 0) {
mlog_errno(status);
goto bail;
goto bail_unlock_data;
}
/* TODO: orphan dir cleanup here. */
bail_unlock_data:
ocfs2_data_unlock(inode, 1);
bail:
mlog_exit(status);
@ -397,6 +416,7 @@ bail:
*/
int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
struct inode *inode,
u32 *logical_offset,
u32 clusters_to_add,
struct buffer_head *fe_bh,
handle_t *handle,
@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
num_bits, meta_ac);
status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
*logical_offset, block, num_bits,
meta_ac);
if (status < 0) {
mlog_errno(status);
goto leave;
}
le32_add_cpu(&fe->i_clusters, num_bits);
spin_lock(&OCFS2_I(inode)->ip_lock);
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
spin_unlock(&OCFS2_I(inode)->ip_lock);
status = ocfs2_journal_dirty(handle, fe_bh);
if (status < 0) {
mlog_errno(status);
@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
}
clusters_to_add -= num_bits;
*logical_offset += num_bits;
if (clusters_to_add) {
mlog(0, "need to alloc once more, clusters = %u, wanted = "
@ -494,14 +511,87 @@ leave:
return status;
}
/*
 * Work out which allocators a given allocation needs, lock them and
 * reserve the required number of bits.
 *
 * Called from ocfs2_extend_allocation() for file systems which don't
 * support holes, and from ocfs2_write() for file systems which
 * understand sparse inodes.
 *
 * @inode:           inode being extended
 * @di:              its on-disk inode
 * @clusters_to_add: number of clusters the caller wants to allocate
 * @data_ac:         set to the cluster reservation, NULL on failure
 * @meta_ac:         set to the metadata reservation if one was
 *                   needed, NULL otherwise
 *
 * Returns 0 on success or a negative error. On error a metadata
 * reservation that was already taken is freed and *meta_ac is reset.
 */
int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
			  u32 clusters_to_add,
			  struct ocfs2_alloc_context **data_ac,
			  struct ocfs2_alloc_context **meta_ac)
{
	int ret, num_free_extents;
	int need_metadata;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	*meta_ac = NULL;
	*data_ac = NULL;

	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
	     "clusters_to_add = %u\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
	     le32_to_cpu(di->i_clusters), clusters_to_add);

	num_free_extents = ocfs2_num_free_extents(osb, inode, di);
	if (num_free_extents < 0) {
		ret = num_free_extents;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Sparse allocation file systems need to be more conservative
	 * with reserving room for expansion - the actual allocation
	 * happens while we've got a journal handle open so re-taking
	 * a cluster lock (because we ran out of room for another
	 * extent) will violate ordering rules.
	 *
	 * Most of the time we'll only be seeing this 1 cluster at a time
	 * anyway.
	 */
	need_metadata = !num_free_extents ||
		(ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add);
	if (need_metadata) {
		ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
		if (ret < 0) {
			/* Running out of space is not worth a log entry. */
			if (ret != -ENOSPC)
				mlog_errno(ret);
			goto out;
		}
	}

	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
	if (ret < 0 && ret != -ENOSPC)
		mlog_errno(ret);

out:
	/*
	 * Only the metadata reservation can need undoing here: we
	 * cannot have an error and a non null *data_ac.
	 */
	if (ret && *meta_ac) {
		ocfs2_free_alloc_context(*meta_ac);
		*meta_ac = NULL;
	}

	return ret;
}
static int ocfs2_extend_allocation(struct inode *inode,
u32 clusters_to_add)
{
int status = 0;
int restart_func = 0;
int drop_alloc_sem = 0;
int credits, num_free_extents;
u32 prev_clusters;
int credits;
u32 prev_clusters, logical_start;
struct buffer_head *bh = NULL;
struct ocfs2_dinode *fe = NULL;
handle_t *handle = NULL;
@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode,
mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
/*
* This function only exists for file systems which don't
* support holes.
*/
BUG_ON(ocfs2_sparse_alloc(osb));
status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
OCFS2_BH_CACHED, inode);
if (status < 0) {
@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode,
goto leave;
}
logical_start = OCFS2_I(inode)->ip_clusters;
restart_all:
BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
"clusters_to_add = %u\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
fe->i_clusters, clusters_to_add);
num_free_extents = ocfs2_num_free_extents(osb,
inode,
fe);
if (num_free_extents < 0) {
status = num_free_extents;
mlog_errno(status);
goto leave;
}
if (!num_free_extents) {
status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
goto leave;
}
}
status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
goto leave;
}
/* blocks people in read/write from reading our allocation
* until we're done changing it. We depend on i_mutex to block
* other extend/truncate calls while we're here. Ordering wrt
@ -566,6 +634,13 @@ restart_all:
down_write(&OCFS2_I(inode)->ip_alloc_sem);
drop_alloc_sem = 1;
status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
&meta_ac);
if (status) {
mlog_errno(status);
goto leave;
}
credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
@ -590,6 +665,7 @@ restarted_transaction:
status = ocfs2_do_extend_allocation(osb,
inode,
&logical_start,
clusters_to_add,
bh,
handle,
@ -778,7 +854,7 @@ static int ocfs2_extend_file(struct inode *inode,
size_t tail_to_skip)
{
int ret = 0;
u32 clusters_to_add;
u32 clusters_to_add = 0;
BUG_ON(!tail_to_skip && !di_bh);
@ -790,6 +866,11 @@ static int ocfs2_extend_file(struct inode *inode,
goto out;
BUG_ON(new_i_size < i_size_read(inode));
if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
BUG_ON(tail_to_skip != 0);
goto out_update_size;
}
clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
OCFS2_I(inode)->ip_clusters;
@ -825,6 +906,7 @@ static int ocfs2_extend_file(struct inode *inode,
goto out_unlock;
}
out_update_size:
if (!tail_to_skip) {
/* We're being called from ocfs2_setattr() which wants
* us to update i_size */
@ -834,7 +916,8 @@ static int ocfs2_extend_file(struct inode *inode,
}
out_unlock:
ocfs2_data_unlock(inode, 1);
if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
ocfs2_data_unlock(inode, 1);
out:
return ret;
@ -972,7 +1055,8 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
ret = ocfs2_meta_lock(inode, NULL, 0);
if (ret) {
mlog_errno(ret);
if (ret != -ENOENT)
mlog_errno(ret);
goto out;
}
@ -1035,10 +1119,49 @@ out:
return ret;
}
/*
 * Look for holes and unwritten extents in the range starting at pos
 * for count bytes (inclusive).
 *
 * The range is walked one extent at a time via ocfs2_get_clusters().
 *
 * Returns 1 as soon as a cluster with no physical mapping or an extent
 * flagged OCFS2_EXT_UNWRITTEN is seen, a negative error if the lookup
 * fails, and otherwise the (non-negative) status of the last lookup,
 * which is 0 if the range is empty.
 */
static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
				       size_t count)
{
	int ret = 0;
	unsigned int extent_flags;
	u32 cpos, remaining, extent_len, phys_cpos;
	struct super_block *sb = inode->i_sb;

	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
	remaining = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;

	while (remaining) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
					 &extent_flags);
		if (ret < 0) {
			mlog_errno(ret);
			return ret;
		}

		/* A zero physical cluster means nothing is allocated here. */
		if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN))
			return 1;

		/* Don't step past the end of the requested range. */
		if (extent_len > remaining)
			extent_len = remaining;

		cpos += extent_len;
		remaining -= extent_len;
	}

	return ret;
}
static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
loff_t *ppos,
size_t count,
int appending)
int appending,
int *direct_io)
{
int ret = 0, meta_level = appending;
struct inode *inode = dentry->d_inode;
@ -1089,6 +1212,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
} else {
saved_pos = *ppos;
}
if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
loff_t end = saved_pos + count;
/*
* Skip the O_DIRECT checks if we don't need
* them.
*/
if (!direct_io || !(*direct_io))
break;
/*
* Allowing concurrent direct writes means
* i_size changes wouldn't be synchronized, so
* one node could wind up truncating another
* nodes writes.
*/
if (end > i_size_read(inode)) {
*direct_io = 0;
break;
}
/*
* We don't fill holes during direct io, so
* check for them here. If any are found, the
* caller will have to retake some cluster
* locks and initiate the io as buffered.
*/
ret = ocfs2_check_range_for_holes(inode, saved_pos,
count);
if (ret == 1) {
*direct_io = 0;
ret = 0;
} else if (ret < 0)
mlog_errno(ret);
break;
}
/*
* The rest of this loop is concerned with legacy file
* systems which don't support sparse files.
*/
newsize = count + saved_pos;
mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
@ -1141,55 +1307,264 @@ out:
return ret;
}
/*
 * Advance an (iovec, offset) cursor by a number of bytes.
 *
 * @iovp:  in/out, the iovec the cursor currently points into
 * @basep: in/out, byte offset of the cursor within that iovec
 * @bytes: how far to advance
 *
 * Whenever the offset reaches the end of the current iovec the cursor
 * moves to the start of the next one. The caller must make sure that
 * @bytes does not exceed what is left in the iovec array; no bound on
 * the array is checked here.
 */
static inline void
ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
	const struct iovec *iov = *iovp;
	size_t base = *basep;

	do {
		/*
		 * Keep the step in size_t: storing it in an int would
		 * turn steps of 2^31 bytes or more into negative values
		 * and corrupt both counters below.
		 */
		size_t avail = iov->iov_len - base;
		size_t copy = bytes < avail ? bytes : avail;

		bytes -= copy;
		base += copy;
		if (iov->iov_len == base) {
			iov++;
			base = 0;
		}
	} while (bytes);

	*iovp = iov;
	*basep = base;
}
/*
 * Make the source buffer for the next chunk of a buffered write
 * addressable through bp->b_src_buf.
 *
 * @bp:         write private data; b_src_buf is filled in on success
 * @cur_iov:    iovec the data comes from
 * @iov_offset: byte offset of the data within @cur_iov
 *
 * When the address limit is KERNEL_DS the buffer is used directly and
 * NULL is returned. Otherwise the page containing the buffer is pinned
 * with get_user_pages() and kmap()ed; that page is returned, or
 * ERR_PTR(-EFAULT) if it could not be pinned.
 */
static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
					    const struct iovec *cur_iov,
					    size_t iov_offset)
{
	int pinned;
	char *buf;
	struct page *src_page = NULL;

	buf = cur_iov->iov_base + iov_offset;

	if (segment_eq(get_fs(), KERNEL_DS)) {
		bp->b_src_buf = buf;
		return src_page;
	}

	/*
	 * Pull in the user page. We want to do this outside
	 * of the meta data locks in order to preserve locking
	 * order in case of page fault.
	 */
	pinned = get_user_pages(current, current->mm,
				(unsigned long)buf & PAGE_CACHE_MASK, 1,
				0, 0, &src_page, NULL);
	if (pinned != 1)
		return ERR_PTR(-EFAULT);

	bp->b_src_buf = kmap(src_page);
	return src_page;
}
/*
 * Undo ocfs2_get_write_source(): unmap and release the source page.
 * A NULL page means nothing was pinned, so there is nothing to do.
 * @bp is not used.
 */
static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
				   struct page *page)
{
	if (!page)
		return;

	kunmap(page);
	page_cache_release(page);
}
/*
 * Write count bytes from the iovec array through
 * ocfs2_buffered_write_cluster(), one call per loop iteration.
 *
 * @file:             file being written
 * @ppos:             in/out file position, advanced by what was copied
 * @iov:              start of the iovec array
 * @nr_segs:          number of iovecs (not used here)
 * @count:            bytes left to write
 * @o_direct_written: bytes already written by a preceding direct io
 *                    attempt; the iovec cursor starts past them
 *
 * Returns the number of bytes copied if any were, otherwise the error
 * (or 0) from the first failing step.
 */
static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
					 const struct iovec *iov,
					 unsigned long nr_segs,
					 size_t count,
					 ssize_t o_direct_written)
{
	int ret = 0;
	ssize_t copied, total = 0;
	size_t iov_offset = 0;
	const struct iovec *cur_iov = iov;
	struct ocfs2_buffered_write_priv bp;
	struct page *page;

	/*
	 * handle partial DIO write.  Adjust cur_iov if needed.
	 */
	ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);

	for (;;) {
		bp.b_cur_off = iov_offset;
		bp.b_cur_iov = cur_iov;

		page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
		if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			break;
		}

		copied = ocfs2_buffered_write_cluster(file, *ppos, count,
						      ocfs2_map_and_write_user_data,
						      &bp);

		/* The source page is released whether or not the write worked. */
		ocfs2_put_write_source(&bp, page);

		if (copied < 0) {
			mlog_errno(copied);
			ret = copied;
			break;
		}

		total += copied;
		*ppos = *ppos + copied;
		count -= copied;

		ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);

		if (!count)
			break;
	}

	return total ? total : ret;
}
/*
 * Validate an iovec array and add up its length.
 *
 * @iov:     the array to check
 * @counted: out, total length of the usable segments
 * @nr_segs: in/out, number of segments; trimmed to the usable prefix
 *           when a later segment fails access_ok()
 *
 * Returns -EINVAL if a segment length or the running total is negative
 * when viewed as ssize_t, -EFAULT if the very first segment fails
 * access_ok(), and 0 otherwise.
 */
static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted,
			     unsigned long *nr_segs)
{
	size_t total = 0;	/* original count */
	unsigned long seg;

	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		total += iv->iov_len;
		if (unlikely((ssize_t)(total|iv->iov_len) < 0))
			return -EINVAL;

		if (!access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) {
			if (seg == 0)
				return -EFAULT;

			/* Keep only the segments before this one. */
			*nr_segs = seg;
			total -= iv->iov_len;	/* This segment is no good */
			break;
		}
	}

	*counted = total;
	return 0;
}
static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
const struct iovec *iov,
unsigned long nr_segs,
loff_t pos)
{
int ret, rw_level, have_alloc_sem = 0;
struct file *filp = iocb->ki_filp;
struct inode *inode = filp->f_path.dentry->d_inode;
int appending = filp->f_flags & O_APPEND ? 1 : 0;
int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
int can_do_direct, sync = 0;
ssize_t written = 0;
size_t ocount; /* original count */
size_t count; /* after file limit checks */
loff_t *ppos = &iocb->ki_pos;
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_path.dentry->d_inode;
mlog_entry("(0x%p, %u, '%.*s')\n", filp,
mlog_entry("(0x%p, %u, '%.*s')\n", file,
(unsigned int)nr_segs,
filp->f_path.dentry->d_name.len,
filp->f_path.dentry->d_name.name);
file->f_path.dentry->d_name.len,
file->f_path.dentry->d_name.name);
/* happy write of zero bytes */
if (iocb->ki_left == 0)
return 0;
ret = ocfs2_check_iovec(iov, &ocount, &nr_segs);
if (ret)
return ret;
count = ocount;
vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
appending = file->f_flags & O_APPEND ? 1 : 0;
direct_io = file->f_flags & O_DIRECT ? 1 : 0;
mutex_lock(&inode->i_mutex);
relock:
/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
if (filp->f_flags & O_DIRECT) {
have_alloc_sem = 1;
if (direct_io) {
down_read(&inode->i_alloc_sem);
have_alloc_sem = 1;
}
/* concurrent O_DIRECT writes are allowed */
rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
rw_level = !direct_io;
ret = ocfs2_rw_lock(inode, rw_level);
if (ret < 0) {
rw_level = -1;
mlog_errno(ret);
goto out;
goto out_sems;
}
ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos,
iocb->ki_left, appending);
can_do_direct = direct_io;
ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
iocb->ki_left, appending,
&can_do_direct);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
/*
* We can't complete the direct I/O as requested, fall back to
* buffered I/O.
*/
if (direct_io && !can_do_direct) {
ocfs2_rw_unlock(inode, rw_level);
up_read(&inode->i_alloc_sem);
have_alloc_sem = 0;
rw_level = -1;
direct_io = 0;
sync = 1;
goto relock;
}
if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
sync = 1;
/*
* XXX: Is it ok to execute these checks a second time?
*/
ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
if (ret)
goto out;
/*
* Set pos so that sync_page_range_nolock() below understands
* where to start from. We might've moved it around via the
* calls above. The range we want to actually sync starts from
* *ppos here.
*
*/
pos = *ppos;
/* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_rw_locked(iocb);
ocfs2_iocb_set_rw_locked(iocb, rw_level);
ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos);
/*
 * Hand the data to either the generic direct I/O path or the ocfs2
 * buffered path.  A negative result is stored in ret and we bail out.
 */
if (direct_io) {
	written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
					    ppos, count, ocount);
	if (written < 0) {
		ret = written;
		goto out_dio;
	}
} else {
	written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
					    count, written);
	if (written < 0) {
		ret = written;
		/*
		 * Do not log -EFAULT or -ENOSPC; every other error is
		 * reported.  This test has to use &&: the previous
		 * "!= -EFAULT || != -ENOSPC" form was true for every
		 * value of ret, so nothing was ever suppressed.
		 */
		if (ret != -EFAULT && ret != -ENOSPC)
			mlog_errno(ret);
		goto out;
	}
}
out_dio:
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
/*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@ -1207,13 +1582,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
}
out:
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
out_sems:
if (have_alloc_sem)
up_read(&inode->i_alloc_sem);
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
if (written > 0 && sync) {
ssize_t err;
err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
if (err < 0)
written = err;
}
mutex_unlock(&inode->i_mutex);
mlog_exit(ret);
return written ? written : ret;
}
/*
 * Splice actor: copy the contents of one pipe buffer into the file
 * described by @sd.
 *
 * The amount handled per call is capped so that it does not run past the
 * end of the page containing sd->pos.  Returns the number of bytes copied
 * if any were, otherwise 0 or the negative error from pinning the buffer
 * or from ocfs2_buffered_write_cluster().
 */
static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf,
				    struct splice_desc *sd)
{
	int ret, remaining, done = 0;
	ssize_t copied = 0;
	struct ocfs2_splice_write_priv sp;

	ret = buf->ops->pin(pipe, buf);
	if (ret)
		return ret;

	/* State shared with ocfs2_map_and_write_splice_data(). */
	sp.s_sd = sd;
	sp.s_buf = buf;
	sp.s_pipe = pipe;
	sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
	sp.s_buf_offset = buf->offset;

	/* Never cross the end of the current page. */
	remaining = sd->len;
	if (remaining + sp.s_offset > PAGE_CACHE_SIZE)
		remaining = PAGE_CACHE_SIZE - sp.s_offset;

	/*
	 * splice wants us to copy up to one page at a time.  For
	 * pagesize > cluster size, this means we might enter
	 * ocfs2_buffered_write_cluster() more than once, so keep
	 * track of our progress here.
	 *
	 * ret is 0 on entry to the loop (the pin succeeded) and is
	 * only changed if a cluster write fails.
	 */
	do {
		copied = ocfs2_buffered_write_cluster(sd->file,
					(loff_t)sd->pos + done,
					remaining,
					ocfs2_map_and_write_splice_data,
					&sp);
		if (copied < 0) {
			mlog_errno(copied);
			ret = copied;
			break;
		}

		remaining -= copied;
		sp.s_offset += copied;
		sp.s_buf_offset += copied;
		done += copied;
	} while (remaining);

	/* Partial progress takes precedence over an error code. */
	return done ? done : ret;
}
/*
 * Splice data from @pipe into @out using ocfs2_splice_write_actor().
 *
 * On a positive result *ppos is advanced by the number of bytes written.
 * If the file was opened O_SYNC or the inode is IS_SYNC(), data and
 * metadata are then synced via generic_osync_inode(); a sync failure
 * replaces the byte count as the return value.
 */
static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
					 struct file *out,
					 loff_t *ppos,
					 size_t len,
					 unsigned int flags)
{
	int ret, err;
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;

	ret = __splice_from_pipe(pipe, out, ppos, len, flags,
				 ocfs2_splice_write_actor);
	if (ret <= 0)
		return ret;

	*ppos += ret;

	if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
		err = generic_osync_inode(inode, mapping,
					  OSYNC_METADATA|OSYNC_DATA);
		if (err)
			ret = err;
	}

	return ret;
}
@ -1239,14 +1703,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
goto out;
}
ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0);
ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
NULL);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock;
}
/* ok, we're done with i_size and alloc work */
ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags);
out_unlock:
ocfs2_rw_unlock(inode, 1);
@ -1323,7 +1788,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
}
rw_level = 0;
/* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_rw_locked(iocb);
ocfs2_iocb_set_rw_locked(iocb, rw_level);
}
/*

View file

@ -39,12 +39,17 @@ enum ocfs2_alloc_restarted {
};
int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
struct inode *inode,
u32 *cluster_start,
u32 clusters_to_add,
struct buffer_head *fe_bh,
handle_t *handle,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason);
int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
u32 clusters_to_add,
struct ocfs2_alloc_context **data_ac,
struct ocfs2_alloc_context **meta_ac);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);

View file

@ -89,24 +89,6 @@ void ocfs2_set_inode_flags(struct inode *inode)
inode->i_flags |= S_DIRSYNC;
}
/*
 * Look up the inode for @blkno via ilookup5(), matching with
 * ocfs2_find_actor().  The lookup is always flagged
 * OCFS2_FI_FLAG_NOWAIT, plus OCFS2_FI_FLAG_DELETE when @delete_vote
 * is non-zero.
 *
 * Must *only* be called from the vote thread; anything else is a BUG.
 */
struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
				     u64 blkno,
				     int delete_vote)
{
	struct ocfs2_find_inode_args args;

	BUG_ON(current != osb->vote_task);

	args.fi_blkno = blkno;
	args.fi_flags = OCFS2_FI_FLAG_NOWAIT;
	args.fi_ino = ino_from_blkno(osb->sb, blkno);

	if (delete_vote)
		args.fi_flags |= OCFS2_FI_FLAG_DELETE;

	return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
}
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
{
struct inode *inode = NULL;
@ -182,28 +164,6 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
if (oi->ip_blkno != args->fi_blkno)
goto bail;
/* OCFS2_FI_FLAG_NOWAIT is *only* set from
* ocfs2_ilookup_for_vote which won't create an inode for one
* that isn't found. The vote thread which doesn't want to get
* an inode which is in the process of going away - otherwise
* the call to __wait_on_freeing_inode in find_inode_fast will
* cause it to deadlock on an inode which may be waiting on a
* vote (or lock release) in delete_inode */
if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
(inode->i_state & (I_FREEING|I_CLEAR))) {
/* As stated above, we're not going to return an
* inode. In the case of a delete vote, the voting
* code is going to signal the other node to go
* ahead. Mark that state here, so this freeing inode
* has the state when it gets to delete_inode. */
if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
spin_lock(&oi->ip_lock);
ocfs2_mark_inode_remotely_deleted(inode);
spin_unlock(&oi->ip_lock);
}
goto bail;
}
ret = 1;
bail:
mlog_exit(ret);
@ -261,6 +221,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
goto bail;
}
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
inode->i_version = 1;
inode->i_generation = le32_to_cpu(fe->i_generation);
inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
@ -272,8 +235,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
inode->i_blocks = 0;
else
inode->i_blocks =
ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_mapping->a_ops = &ocfs2_aops;
inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
@ -288,10 +250,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)fe->i_blkno);
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
inode->i_nlink = le16_to_cpu(fe->i_links_count);
if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
@ -347,6 +305,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
OCFS2_LOCK_TYPE_META, 0, inode);
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
OCFS2_LOCK_TYPE_OPEN, 0, inode);
}
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
@ -421,7 +382,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
* cluster lock before trusting anything anyway.
*/
can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
&& !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK)
&& !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
&& !ocfs2_mount_local(osb);
/*
@ -438,7 +399,17 @@ static int ocfs2_read_locked_inode(struct inode *inode,
OCFS2_LOCK_TYPE_META,
generation, inode);
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
OCFS2_LOCK_TYPE_OPEN,
0, inode);
if (can_lock) {
status = ocfs2_open_lock(inode);
if (status) {
make_bad_inode(inode);
mlog_errno(status);
return status;
}
status = ocfs2_meta_lock(inode, NULL, 0);
if (status) {
make_bad_inode(inode);
@ -447,6 +418,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
}
if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
status = ocfs2_try_open_lock(inode, 0);
if (status) {
make_bad_inode(inode);
return status;
}
}
status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
can_lock ? inode : NULL);
if (status < 0) {
@ -507,50 +486,56 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
struct buffer_head *fe_bh)
{
int status = 0;
handle_t *handle = NULL;
struct ocfs2_truncate_context *tc = NULL;
struct ocfs2_dinode *fe;
handle_t *handle = NULL;
mlog_entry_void();
fe = (struct ocfs2_dinode *) fe_bh->b_data;
/* zero allocation, zero truncate :) */
if (!fe->i_clusters)
goto bail;
if (fe->i_clusters) {
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
goto out;
}
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
status = ocfs2_journal_access(handle, inode, fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out;
}
i_size_write(inode, 0);
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
mlog_errno(status);
goto out;
}
ocfs2_commit_trans(osb, handle);
handle = NULL;
mlog_errno(status);
goto bail;
status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
if (status < 0) {
mlog_errno(status);
goto out;
}
status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
if (status < 0) {
mlog_errno(status);
goto out;
}
}
status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL);
if (status < 0) {
mlog_errno(status);
goto bail;
}
ocfs2_commit_trans(osb, handle);
handle = NULL;
status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
if (status < 0) {
mlog_errno(status);
goto bail;
}
status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
if (status < 0) {
mlog_errno(status);
goto bail;
}
bail:
out:
if (handle)
ocfs2_commit_trans(osb, handle);
mlog_exit(status);
return status;
}
@ -678,10 +663,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
struct inode *orphan_dir_inode = NULL;
struct buffer_head *orphan_dir_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di;
/* We've already voted on this so it should be readonly - no
* spinlock needed. */
orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
di = (struct ocfs2_dinode *) di_bh->b_data;
orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
if (status)
@ -839,11 +824,20 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
goto bail;
}
status = ocfs2_request_delete_vote(inode);
/* -EBUSY means that other nodes are still using the
* inode. We're done here though, so avoid doing anything on
* disk and let them worry about deleting it. */
if (status == -EBUSY) {
/*
* This is how ocfs2 determines whether an inode is still live
* within the cluster. Every node takes a shared read lock on
* the inode open lock in ocfs2_read_locked_inode(). When we
* get to ->delete_inode(), each node tries to convert its
* lock to an exclusive. Trylocks are serialized by the inode
* meta data lock. If the upconvert succeeds, we know the inode
* is no longer live and can be deleted.
*
* Though we call this with the meta data lock held, the
* trylock keeps us from ABBA deadlock.
*/
status = ocfs2_try_open_lock(inode, 1);
if (status == -EAGAIN) {
status = 0;
mlog(0, "Skipping delete of %llu because it is in use on"
"other nodes\n", (unsigned long long)oi->ip_blkno);
@ -854,21 +848,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
goto bail;
}
spin_lock(&oi->ip_lock);
if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
/* Nobody knew which slot this inode was orphaned
* into. This may happen during node death and
* recovery knows how to clean it up so we can safely
* ignore this inode for now on. */
mlog(0, "Nobody knew where inode %llu was orphaned!\n",
(unsigned long long)oi->ip_blkno);
} else {
*wipe = 1;
mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n",
(unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot);
}
spin_unlock(&oi->ip_lock);
*wipe = 1;
mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
(unsigned long long)oi->ip_blkno,
le16_to_cpu(di->i_orphaned_slot));
bail:
return status;
@ -1001,11 +984,16 @@ void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
"Inode=%lu\n", inode->i_ino);
/* For remove delete_inode vote, we hold open lock before,
* now it is time to unlock PR and EX open locks. */
ocfs2_open_unlock(inode);
/* Do these before all the other work so that we don't bounce
* the vote thread while waiting to destroy the locks. */
ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
/* We very well may get a clear_inode before all of an inode's
* metadata has hit disk. Of course, we can't drop any cluster
@ -1020,8 +1008,7 @@ void ocfs2_clear_inode(struct inode *inode)
"Clear inode of %llu, inode has io markers\n",
(unsigned long long)oi->ip_blkno);
ocfs2_extent_map_drop(inode, 0);
ocfs2_extent_map_init(inode);
ocfs2_extent_map_trunc(inode, 0);
status = ocfs2_drop_inode_locks(inode);
if (status < 0)
@ -1030,6 +1017,7 @@ void ocfs2_clear_inode(struct inode *inode)
ocfs2_lock_res_free(&oi->ip_rw_lockres);
ocfs2_lock_res_free(&oi->ip_meta_lockres);
ocfs2_lock_res_free(&oi->ip_data_lockres);
ocfs2_lock_res_free(&oi->ip_open_lockres);
ocfs2_metadata_cache_purge(inode);
@ -1086,9 +1074,6 @@ void ocfs2_drop_inode(struct inode *inode)
mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
(unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
/* Testing ip_orphaned_slot here wouldn't work because we may
* not have gotten a delete_inode vote from any other nodes
* yet. */
if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
generic_delete_inode(inode);
else
@ -1121,8 +1106,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
return NULL;
}
tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
&p_blkno, NULL);
tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
NULL);
if (tmperr < 0) {
mlog_errno(tmperr);
goto fail;
@ -1259,7 +1244,7 @@ void ocfs2_refresh_inode(struct inode *inode,
if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
inode->i_blocks = 0;
else
inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode));
inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);

View file

@ -26,6 +26,8 @@
#ifndef OCFS2_INODE_H
#define OCFS2_INODE_H
#include "extent_map.h"
/* OCFS2 Inode Private Data */
struct ocfs2_inode_info
{
@ -34,6 +36,7 @@ struct ocfs2_inode_info
struct ocfs2_lock_res ip_rw_lockres;
struct ocfs2_lock_res ip_meta_lockres;
struct ocfs2_lock_res ip_data_lockres;
struct ocfs2_lock_res ip_open_lockres;
/* protects allocation changes on this inode. */
struct rw_semaphore ip_alloc_sem;
@ -42,9 +45,7 @@ struct ocfs2_inode_info
spinlock_t ip_lock;
u32 ip_open_count;
u32 ip_clusters;
struct ocfs2_extent_map ip_map;
struct list_head ip_io_markers;
int ip_orphaned_slot;
struct mutex ip_io_mutex;
@ -64,6 +65,8 @@ struct ocfs2_inode_info
struct ocfs2_caching_info ip_metadata_cache;
struct ocfs2_extent_map ip_extent_map;
struct inode vfs_inode;
};
@ -117,14 +120,9 @@ void ocfs2_delete_inode(struct inode *inode);
void ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_NOWAIT 0x1
#define OCFS2_FI_FLAG_DELETE 0x2
#define OCFS2_FI_FLAG_SYSFILE 0x4
#define OCFS2_FI_FLAG_NOLOCK 0x8
#define OCFS2_FI_FLAG_SYSFILE 0x4
#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
u64 blkno,
int delete_vote);
int ocfs2_inode_init_private(struct inode *inode);
int ocfs2_inode_revalidate(struct dentry *dentry);
int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
@ -144,4 +142,11 @@ int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
void ocfs2_set_inode_flags(struct inode *inode);
static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
{
int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
}
#endif /* OCFS2_INODE_H */

View file

@ -649,29 +649,20 @@ bail:
static int ocfs2_force_read_journal(struct inode *inode)
{
int status = 0;
int i, p_blocks;
u64 v_blkno, p_blkno;
#define CONCURRENT_JOURNAL_FILL 32
int i;
u64 v_blkno, p_blkno, p_blocks, num_blocks;
#define CONCURRENT_JOURNAL_FILL 32ULL
struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
mlog_entry_void();
BUG_ON(inode->i_blocks !=
ocfs2_align_bytes_to_sectors(i_size_read(inode)));
memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
mlog(0, "Force reading %llu blocks\n",
(unsigned long long)(inode->i_blocks >>
(inode->i_sb->s_blocksize_bits - 9)));
num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
v_blkno = 0;
while (v_blkno <
(inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
while (v_blkno < num_blocks) {
status = ocfs2_extent_map_get_blocks(inode, v_blkno,
1, &p_blkno,
&p_blocks);
&p_blkno, &p_blocks, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -1306,7 +1297,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
continue;
iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
OCFS2_FI_FLAG_NOLOCK);
OCFS2_FI_FLAG_ORPHAN_RECOVERY);
if (IS_ERR(iter))
continue;
@ -1418,7 +1409,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
/* Set the proper information to get us going into
* ocfs2_delete_inode. */
oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
oi->ip_orphaned_slot = slot;
spin_unlock(&oi->ip_lock);
iput(inode);

View file

@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
/* We may be deleting metadata blocks, so metadata alloc dinode +
one desc. block for each possible delete. */
if (tree_depth && next_free == 1 &&
le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
credits += 1 + tree_depth;
/* update to the truncate log. */

View file

@ -85,8 +85,11 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
int ret = 0, lock_level = 0;
struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb);
/* We don't want to support shared writable mappings yet. */
if (!ocfs2_mount_local(osb) &&
/*
* Only support shared writeable mmap for local mounts which
* don't know about holes.
*/
if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) &&
((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) &&
((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);

View file

@ -175,8 +175,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
if (IS_ERR(inode)) {
mlog(ML_ERROR, "Unable to create inode %llu\n",
(unsigned long long)blkno);
ret = ERR_PTR(-EACCES);
goto bail_unlock;
}
@ -189,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
* unlink. */
spin_lock(&oi->ip_lock);
oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
spin_unlock(&oi->ip_lock);
bail_add:
@ -288,7 +285,7 @@ static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
i_size_write(inode, inode->i_sb->s_blocksize);
inode->i_nlink = 2;
inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
inode->i_blocks = ocfs2_inode_sector_count(inode);
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
mlog_errno(status);
@ -1486,8 +1483,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
struct buffer_head **bhs = NULL;
const char *c;
struct super_block *sb = osb->sb;
u64 p_blkno;
int p_blocks;
u64 p_blkno, p_blocks;
int virtual, blocks, status, i, bytes_left;
bytes_left = i_size_read(inode) + 1;
@ -1514,8 +1510,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
goto bail;
}
status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno,
&p_blocks);
status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks,
NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -1674,8 +1670,11 @@ static int ocfs2_symlink(struct inode *dir,
inode->i_rdev = 0;
newsize = l - 1;
if (l > ocfs2_fast_symlink_chars(sb)) {
u32 offset = 0;
inode->i_op = &ocfs2_symlink_inode_operations;
status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
status = ocfs2_do_extend_allocation(osb, inode, &offset, 1,
new_fe_bh,
handle, data_ac, NULL,
NULL);
if (status < 0) {
@ -1689,7 +1688,7 @@ static int ocfs2_symlink(struct inode *dir,
goto bail;
}
i_size_write(inode, newsize);
inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
inode->i_blocks = ocfs2_inode_sector_count(inode);
} else {
inode->i_op = &ocfs2_fast_symlink_inode_operations;
memcpy((char *) fe->id2.i_symlink, symname, l);
@ -2222,9 +2221,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
/* Record which orphan dir our inode now resides
* in. delete_inode will use this to determine which orphan
* dir to lock. */
spin_lock(&OCFS2_I(inode)->ip_lock);
OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
spin_unlock(&OCFS2_I(inode)->ip_lock);
fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
mlog(0, "Inode %llu orphaned in slot %d\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);

View file

@ -46,11 +46,6 @@
#include "endian.h"
#include "ocfs2_lockid.h"
/*
 * Extent map state, embedded in struct ocfs2_inode_info.
 * NOTE(review): field meanings below are inferred from names and types
 * only - confirm against extent_map.c.
 */
struct ocfs2_extent_map {
/* presumably the number of clusters the map currently describes */
u32 em_clusters;
/* root of a red-black tree; presumably of mapped extent entries */
struct rb_root em_extents;
};
/* Most user visible OCFS2 inodes will have very few pieces of
* metadata, but larger files (including bitmaps, etc) must be taken
* into account when designing an access scheme. We allow a small
@ -303,6 +298,13 @@ static inline int ocfs2_should_order_data(struct inode *inode)
return 1;
}
/*
 * Returns 1 if the OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC bit is set in
 * the superblock's incompat feature mask, 0 otherwise.
 */
static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
{
	return (osb->s_feature_incompat &
		OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) ? 1 : 0;
}
/* set / clear functions because cluster events can make these happen
* in parallel so we want the transitions to be atomic. this also
* means that any future flags osb_flags must be protected by spinlock
@ -461,6 +463,49 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
return (unsigned long)((bytes + 511) >> 9);
}
static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
unsigned long pg_index)
{
u32 clusters = pg_index;
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
if (unlikely(PAGE_CACHE_SHIFT > cbits))
clusters = pg_index << (PAGE_CACHE_SHIFT - cbits);
else if (PAGE_CACHE_SHIFT < cbits)
clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT);
return clusters;
}
/*
 * Find the 1st page index which covers the given clusters.
 *
 * The cluster count is widened to unsigned long before any shifting so
 * that, when clusters are larger than pages, the left shift is not
 * performed (and wrapped) in 32 bits.
 */
static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb,
							       u32 clusters)
{
	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
	unsigned long index = clusters;

	if (PAGE_CACHE_SHIFT > cbits) {
		/* Several clusters per page. */
		index >>= (PAGE_CACHE_SHIFT - cbits);
	} else if (PAGE_CACHE_SHIFT < cbits) {
		/* Several pages per cluster. */
		index <<= (cbits - PAGE_CACHE_SHIFT);
	}

	return index;
}
/*
 * Number of page cache pages needed to hold one cluster: 1 when a
 * cluster is no larger than a page, otherwise
 * 2^(clustersize_bits - PAGE_CACHE_SHIFT).
 */
static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
{
	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;

	return (PAGE_CACHE_SHIFT < cbits) ?
		1 << (cbits - PAGE_CACHE_SHIFT) : 1;
}
#define ocfs2_set_bit ext2_set_bit
#define ocfs2_clear_bit ext2_clear_bit
#define ocfs2_test_bit ext2_test_bit

View file

@ -86,7 +86,8 @@
OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
#define OCFS2_FEATURE_INCOMPAT_SUPP OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT
#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
| OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
#define OCFS2_FEATURE_RO_COMPAT_SUPP 0
/*
@ -154,6 +155,12 @@
#define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */
#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */
/*
* Extent record flags (e_node.leaf.flags)
*/
#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but
* unwritten */
/*
* ioctl commands
*/
@ -282,10 +289,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
/*
* On disk extent record for OCFS2
* It describes a range of clusters on disk.
*
* Length fields are divided into interior and leaf node versions.
* This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes.
*/
struct ocfs2_extent_rec {
/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */
__le32 e_clusters; /* Clusters covered by this extent */
union {
__le32 e_int_clusters; /* Clusters covered by all children */
struct {
__le16 e_leaf_clusters; /* Clusters covered by this
extent */
__u8 e_reserved1;
__u8 e_flags; /* Extent flags */
};
};
__le64 e_blkno; /* Physical disk offset, in blocks */
/*10*/
};
@ -311,7 +329,10 @@ struct ocfs2_extent_list {
/*00*/ __le16 l_tree_depth; /* Extent tree depth from this
point. 0 means data extents
hang directly off this
header (a leaf) */
header (a leaf)
NOTE: The high 8 bits cannot be
used - tree_depth is never that big.
*/
__le16 l_count; /* Number of extent records */
__le16 l_next_free_rec; /* Next unused extent slot */
__le16 l_reserved1;
@ -446,7 +467,9 @@ struct ocfs2_dinode {
__le32 i_ctime_nsec;
__le32 i_mtime_nsec;
__le32 i_attr;
__le32 i_reserved1;
__le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
was set in i_flags */
__le16 i_reserved1;
/*70*/ __le64 i_reserved2[8];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this

View file

@ -44,6 +44,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_RENAME,
OCFS2_LOCK_TYPE_RW,
OCFS2_LOCK_TYPE_DENTRY,
OCFS2_LOCK_TYPE_OPEN,
OCFS2_NUM_LOCK_TYPES
};
@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_DENTRY:
c = 'N';
break;
case OCFS2_LOCK_TYPE_OPEN:
c = 'O';
break;
default:
c = '\0';
}
@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = {
* important job it does, anyway. */
[OCFS2_LOCK_TYPE_RW] = "Write/Read",
[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
[OCFS2_LOCK_TYPE_OPEN] = "Open",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)

View file

@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
goto bail;
}
status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;

View file

@ -381,8 +381,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
le32_to_cpu(fe->i_clusters)));
spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
alloc_inode->i_blocks =
ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
status = 0;
bail:

View file

@ -806,9 +806,6 @@ static int __init ocfs2_init(void)
ocfs2_print_version();
if (init_ocfs2_extent_maps())
return -ENOMEM;
status = init_ocfs2_uptodate_cache();
if (status < 0) {
mlog_errno(status);
@ -837,7 +834,6 @@ leave:
if (status < 0) {
ocfs2_free_mem_caches();
exit_ocfs2_uptodate_cache();
exit_ocfs2_extent_maps();
}
mlog_exit(status);
@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void)
unregister_filesystem(&ocfs2_fs_type);
exit_ocfs2_extent_maps();
exit_ocfs2_uptodate_cache();
mlog_exit_void();
@ -963,6 +957,7 @@ static void ocfs2_inode_init_once(void *data,
ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
ocfs2_lock_res_init_once(&oi->ip_data_lockres);
ocfs2_lock_res_init_once(&oi->ip_open_lockres);
ocfs2_metadata_cache_init(&oi->vfs_inode);

View file

@ -63,17 +63,10 @@ struct ocfs2_msg_hdr
__be32 h_node_num; /* node sending this particular message. */
};
/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
* for the network. */
#define OCFS2_VOTE_FILENAME_LEN 256
struct ocfs2_vote_msg
{
struct ocfs2_msg_hdr v_hdr;
union {
__be32 v_generic1;
__be32 v_orphaned_slot; /* Used during delete votes */
__be32 v_nlink; /* Used during unlink votes */
} md1; /* Message type dependant 1 */
__be32 v_reserved1;
};
/* Responses are given these values to maintain backwards
@ -86,7 +79,6 @@ struct ocfs2_response_msg
{
struct ocfs2_msg_hdr r_hdr;
__be32 r_response;
__be32 r_orphaned_slot;
};
struct ocfs2_vote_work {
@ -96,7 +88,6 @@ struct ocfs2_vote_work {
enum ocfs2_vote_request {
OCFS2_VOTE_REQ_INVALID = 0,
OCFS2_VOTE_REQ_DELETE,
OCFS2_VOTE_REQ_MOUNT,
OCFS2_VOTE_REQ_UMOUNT,
OCFS2_VOTE_REQ_LAST
@ -151,135 +142,23 @@ static void ocfs2_process_umount_request(struct ocfs2_super *osb,
ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
}
/*
 * Flag @inode as deleted by another node.  The caller must hold
 * ip_lock (asserted below).
 */
void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
{
	struct ocfs2_inode_info *info = OCFS2_I(inode);

	assert_spin_locked(&info->ip_lock);

	/*
	 * OCFS2_INODE_SKIP_DELETE is set alongside OCFS2_INODE_DELETED
	 * so that we don't try to delete the inode in delete_inode
	 * ourselves, thus avoiding unnecessary lock pinging.  If the
	 * other node failed to wipe the inode as a result of a crash,
	 * then recovery will pick up the slack.
	 */
	info->ip_flags |= OCFS2_INODE_DELETED;
	info->ip_flags |= OCFS2_INODE_SKIP_DELETE;
}
/*
 * Work out this node's answer to a DELETE vote on @inode.
 *
 * @orphaned_slot is in/out: if it holds a valid slot on entry, that slot is
 * recorded in the inode; otherwise it is overwritten with the slot the
 * inode already has recorded.
 *
 * Returns OCFS2_RESPONSE_OK only if the inode has no open count (checked
 * twice, before and after writeback), is not a directory with extra
 * references, and its data could be written out. In that case the page
 * cache and extent map are truncated and the inode is marked remotely
 * deleted. Every other path returns OCFS2_RESPONSE_BUSY.
 */
static int ocfs2_process_delete_request(struct inode *inode,
int *orphaned_slot)
{
/* Pessimistic default; only flipped to OK at the very end. */
int response = OCFS2_RESPONSE_BUSY;
mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
inode->i_ino, inode->i_nlink, *orphaned_slot);
spin_lock(&OCFS2_I(inode)->ip_lock);
/* Whatever our vote response is, we want to make sure that
* the orphaned slot is recorded properly on this node *and*
* on the requesting node. Technically, if the requesting node
* did not know which slot the inode is orphaned in but we
* respond with BUSY he doesn't actually need the orphaned
* slot, but it doesn't hurt to do it here anyway. */
if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
OCFS2_INVALID_SLOT &&
OCFS2_I(inode)->ip_orphaned_slot !=
(*orphaned_slot),
"Inode %llu: This node thinks it's "
"orphaned in slot %d, messaged it's in %d\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
OCFS2_I(inode)->ip_orphaned_slot,
*orphaned_slot);
mlog(0, "Setting orphaned slot for inode %llu to %d\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
*orphaned_slot);
OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
} else {
mlog(0, "Sending back orphaned slot %d for inode %llu\n",
OCFS2_I(inode)->ip_orphaned_slot,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
*orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
}
/* vote no if the file is still open. */
if (OCFS2_I(inode)->ip_open_count) {
mlog(0, "open count = %u\n",
OCFS2_I(inode)->ip_open_count);
spin_unlock(&OCFS2_I(inode)->ip_lock);
goto done;
}
/* The lock is dropped here, before the writeback and truncate calls
* below, and re-taken for the second open-count check. */
spin_unlock(&OCFS2_I(inode)->ip_lock);
/* directories are a bit ugly... What if someone is sitting in
* it? We want to make sure the inode is removed completely as
* a result of the iput in process_vote. */
if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
goto done;
}
if (filemap_fdatawrite(inode->i_mapping)) {
mlog(ML_ERROR, "Could not sync inode %llu for delete!\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
goto done;
}
sync_mapping_buffers(inode->i_mapping);
truncate_inode_pages(inode->i_mapping, 0);
ocfs2_extent_map_trunc(inode, 0);
spin_lock(&OCFS2_I(inode)->ip_lock);
/* double check open count - someone might have raced this
* thread into ocfs2_file_open while we were writing out
* data. If we're to allow a wipe of this inode now, we *must*
* hold the spinlock until we've marked it. */
if (OCFS2_I(inode)->ip_open_count) {
mlog(0, "Raced to wipe! open count = %u\n",
OCFS2_I(inode)->ip_open_count);
spin_unlock(&OCFS2_I(inode)->ip_lock);
goto done;
}
/* Mark the inode as being wiped from disk. */
ocfs2_mark_inode_remotely_deleted(inode);
spin_unlock(&OCFS2_I(inode)->ip_lock);
/* Not sure this is necessary anymore. */
d_prune_aliases(inode);
/* If we get here, then we're voting 'yes', so commit the
* delete on our side. */
response = OCFS2_RESPONSE_OK;
done:
return response;
}
static void ocfs2_process_vote(struct ocfs2_super *osb,
struct ocfs2_vote_msg *msg)
{
int net_status, vote_response;
int orphaned_slot = 0;
unsigned int node_num, generation;
unsigned int node_num;
u64 blkno;
enum ocfs2_vote_request request;
struct inode *inode = NULL;
struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
struct ocfs2_response_msg response;
/* decode the network mumbo jumbo into local variables. */
request = be32_to_cpu(hdr->h_request);
blkno = be64_to_cpu(hdr->h_blkno);
generation = be32_to_cpu(hdr->h_generation);
node_num = be32_to_cpu(hdr->h_node_num);
if (request == OCFS2_VOTE_REQ_DELETE)
orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);
mlog(0, "processing vote: request = %u, blkno = %llu, "
"generation = %u, node_num = %u, priv1 = %u\n", request,
(unsigned long long)blkno, generation, node_num,
be32_to_cpu(msg->md1.v_generic1));
mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
request, (unsigned long long)blkno, node_num);
if (!ocfs2_is_valid_vote_request(request)) {
mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
@ -302,52 +181,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb,
break;
}
/* We cannot process the remaining message types before we're
* fully mounted. It's perfectly safe however to send a 'yes'
* response as we can't possibly have any of the state they're
* asking us to modify yet. */
if (atomic_read(&osb->vol_state) == VOLUME_INIT)
goto respond;
/* If we get here, then the request is against an inode. */
inode = ocfs2_ilookup_for_vote(osb, blkno,
request == OCFS2_VOTE_REQ_DELETE);
/* Not finding the inode is perfectly valid - it means we're
* not interested in what the other node is about to do to it
* so in those cases we automatically respond with an
* affirmative. Cluster locking ensures that we won't race
* interest in the inode with this vote request. */
if (!inode)
goto respond;
/* Check generation values. It's possible for us to get a
* request against a stale inode. If so then we proceed as if
* we had not found an inode in the first place. */
if (inode->i_generation != generation) {
mlog(0, "generation passed %u != inode generation = %u, "
"ip_flags = %x, ip_blkno = %llu, msg %llu, i_count = %u, "
"message type = %u\n", generation, inode->i_generation,
OCFS2_I(inode)->ip_flags,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)blkno, atomic_read(&inode->i_count),
request);
iput(inode);
inode = NULL;
goto respond;
}
switch (request) {
case OCFS2_VOTE_REQ_DELETE:
vote_response = ocfs2_process_delete_request(inode,
&orphaned_slot);
break;
default:
mlog(ML_ERROR, "node %u, invalid request: %u\n",
node_num, request);
vote_response = OCFS2_RESPONSE_BAD_MSG;
}
respond:
/* Response structure is small so we just put it on the stack
* and stuff it inline. */
@ -357,7 +190,6 @@ respond:
response.r_hdr.h_generation = hdr->h_generation;
response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
response.r_response = cpu_to_be32(vote_response);
response.r_orphaned_slot = cpu_to_be32(orphaned_slot);
net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
osb->net_key,
@ -373,9 +205,6 @@ respond:
&& net_status != -ENOTCONN)
mlog(ML_ERROR, "message to node %u fails with error %d!\n",
node_num, net_status);
if (inode)
iput(inode);
}
static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
@ -634,8 +463,7 @@ bail:
static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
u64 blkno,
unsigned int generation,
enum ocfs2_vote_request type,
u32 priv)
enum ocfs2_vote_request type)
{
struct ocfs2_vote_msg *request;
struct ocfs2_msg_hdr *hdr;
@ -651,8 +479,6 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
hdr->h_request = cpu_to_be32(type);
hdr->h_blkno = cpu_to_be64(blkno);
hdr->h_generation = cpu_to_be32(generation);
request->md1.v_generic1 = cpu_to_be32(priv);
}
return request;
@ -664,7 +490,7 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb,
struct ocfs2_vote_msg *request,
struct ocfs2_net_response_cb *callback)
{
int status, response;
int status, response = -EBUSY;
unsigned int response_id;
struct ocfs2_msg_hdr *hdr;
@ -686,109 +512,12 @@ bail:
return status;
}
/*
 * Send @request to the other nodes while holding the super lock, retrying
 * for as long as the vote attempt returns -EAGAIN.
 *
 * Returns 0 straight away for an inode that ocfs2_inode_is_new() reports
 * as new, -ERESTARTSYS if a signal is pending and the volume was not
 * mounted with OCFS2_MOUNT_NOINTR, a negative error from
 * ocfs2_super_lock(), or otherwise the result of the vote (0 when no vote
 * was needed).
 */
static int ocfs2_request_vote(struct inode *inode,
			      struct ocfs2_vote_msg *request,
			      struct ocfs2_net_response_cb *callback)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	int ret;

	if (ocfs2_inode_is_new(inode))
		return 0;

	do {
		int interruptible = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);

		if (interruptible && signal_pending(current))
			return -ERESTARTSYS;

		ret = ocfs2_super_lock(osb, 0);
		if (ret < 0) {
			mlog_errno(ret);
			break;
		}

		/* No vote is sent when ocfs2_node_map_is_only() says this
		 * node is the only one in the mounted map. */
		if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
					   osb->node_num))
			ret = 0;
		else
			ret = ocfs2_do_request_vote(osb, request, callback);

		ocfs2_super_unlock(osb, 0);
	} while (ret == -EAGAIN);

	return ret;
}
/*
 * Response callback for delete votes. @priv is the inode the vote was
 * about; @resp is the reply from one node.
 *
 * If the responder reports a valid orphan slot, it is checked against the
 * slot this node has recorded (if any) and then stored in the inode under
 * ip_lock. A reply carrying OCFS2_INVALID_SLOT is only logged.
 */
static void ocfs2_delete_response_cb(void *priv,
				     struct ocfs2_response_msg *resp)
{
	struct inode *inode = priv;
	int orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
	int node = be32_to_cpu(resp->r_hdr.h_node_num);
	unsigned long long blkno =
		(unsigned long long)OCFS2_I(inode)->ip_blkno;

	mlog(0, "node %d tells us that inode %llu is orphaned in slot %d\n",
	     node, blkno, orphaned_slot);

	/* Only a valid slot is recorded; the responder may not know which
	 * slot the inode is orphaned in. */
	if (orphaned_slot != OCFS2_INVALID_SLOT) {
		spin_lock(&OCFS2_I(inode)->ip_lock);

		/* NOTE(review): the condition below is kept token-for-token
		 * in case mlog_bug_on_msg() stringifies it into its output -
		 * confirm before simplifying. */
		mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
				OCFS2_I(inode)->ip_orphaned_slot
				!= OCFS2_INVALID_SLOT, "Inode %llu: Node %d says it's "
				"orphaned in slot %d, we think it's in %d\n",
				blkno,
				be32_to_cpu(resp->r_hdr.h_node_num),
				orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);

		OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
		spin_unlock(&OCFS2_I(inode)->ip_lock);
	}
}
int ocfs2_request_delete_vote(struct inode *inode)
{
int orphaned_slot, status;
struct ocfs2_net_response_cb delete_cb;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_vote_msg *request;
spin_lock(&OCFS2_I(inode)->ip_lock);
orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
spin_unlock(&OCFS2_I(inode)->ip_lock);
delete_cb.rc_cb = ocfs2_delete_response_cb;
delete_cb.rc_priv = inode;
mlog(0, "Inode %llu, we start thinking orphaned slot is %d\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno, orphaned_slot);
status = -ENOMEM;
request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
inode->i_generation,
OCFS2_VOTE_REQ_DELETE, orphaned_slot);
if (request) {
status = ocfs2_request_vote(inode, request, &delete_cb);
kfree(request);
}
return status;
}
int ocfs2_request_mount_vote(struct ocfs2_super *osb)
{
int status;
struct ocfs2_vote_msg *request = NULL;
request = ocfs2_new_vote_request(osb, 0ULL, 0,
OCFS2_VOTE_REQ_MOUNT, 0);
request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
if (!request) {
status = -ENOMEM;
goto bail;
@ -821,8 +550,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb)
int status;
struct ocfs2_vote_msg *request = NULL;
request = ocfs2_new_vote_request(osb, 0ULL, 0,
OCFS2_VOTE_REQ_UMOUNT, 0);
request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
if (!request) {
status = -ENOMEM;
goto bail;
@ -969,7 +697,6 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg,
be32_to_cpu(work->w_msg.v_hdr.h_generation));
mlog(0, "h_node_num = %u\n",
be32_to_cpu(work->w_msg.v_hdr.h_node_num));
mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));
spin_lock(&osb->vote_task_lock);
list_add_tail(&work->w_list, &osb->vote_list);

View file

@ -38,14 +38,11 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
wake_up(&osb->vote_event);
}
int ocfs2_request_delete_vote(struct inode *inode);
int ocfs2_request_mount_vote(struct ocfs2_super *osb);
int ocfs2_request_umount_vote(struct ocfs2_super *osb);
int ocfs2_register_net_handlers(struct ocfs2_super *osb);
void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
int node_num);
#endif

View file

@ -239,13 +239,11 @@ out:
/*
* `endbyte' is inclusive
*/
int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
unsigned int flags)
int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
loff_t endbyte, unsigned int flags)
{
int ret;
struct address_space *mapping;
mapping = file->f_mapping;
if (!mapping) {
ret = -EINVAL;
goto out;
@ -275,4 +273,4 @@ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
out:
return ret;
}
EXPORT_SYMBOL_GPL(do_sync_file_range);
EXPORT_SYMBOL_GPL(do_sync_mapping_range);

View file

@ -843,8 +843,13 @@ extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
extern int fcntl_getlease(struct file *filp);
/* fs/sync.c */
extern int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
unsigned int flags);
extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
loff_t endbyte, unsigned int flags);
/*
 * File-based convenience wrapper: forwards to do_sync_mapping_range() using
 * the file's f_mapping, passing @offset, @endbyte and @flags through
 * unchanged.
 */
static inline int do_sync_file_range(struct file *file, loff_t offset,
				     loff_t endbyte, unsigned int flags)
{
	struct address_space *mapping = file->f_mapping;

	return do_sync_mapping_range(mapping, offset, endbyte, flags);
}
/* fs/locks.c */
extern void locks_init_lock(struct file_lock *);