ext4: Add multi block allocator for ext4

Signed-off-by: Alex Tomas <alex@clusterfs.com>
Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
This commit is contained in:
Alex Tomas 2008-01-29 00:19:52 -05:00 committed by Theodore Ts'o
parent 1988b51e47
commit c9de560ded
13 changed files with 4900 additions and 38 deletions

View file

@ -86,9 +86,11 @@ Alex is working on a new set of patches right now.
When mounting an ext4 filesystem, the following option are accepted: When mounting an ext4 filesystem, the following option are accepted:
(*) == default (*) == default
extents ext4 will use extents to address file data. The extents (*) ext4 will use extents to address file data. The
file system will no longer be mountable by ext3. file system will no longer be mountable by ext3.
noextents ext4 will not use extents for newly created files
journal_checksum Enable checksumming of the journal transactions. journal_checksum Enable checksumming of the journal transactions.
This will allow the recovery code in e2fsck and the This will allow the recovery code in e2fsck and the
kernel to detect corruption in the kernel. It is a kernel to detect corruption in the kernel. It is a
@ -206,6 +208,12 @@ nobh (a) cache disk block mapping information
"nobh" option tries to avoid associating buffer "nobh" option tries to avoid associating buffer
heads (supported only for "writeback" mode). heads (supported only for "writeback" mode).
mballoc (*) Use the multiple block allocator for block allocation
nomballoc disabled multiple block allocator for block allocation.
stripe=n Number of filesystem blocks that mballoc will try
to use for allocation size and alignment. For RAID5/6
systems this should be the number of data
disks * RAID chunk size in file system blocks.
Data Mode Data Mode
--------- ---------

View file

@ -857,6 +857,45 @@ CPUs.
The "procs_blocked" line gives the number of processes currently blocked, The "procs_blocked" line gives the number of processes currently blocked,
waiting for I/O to complete. waiting for I/O to complete.
1.9 Ext4 file system parameters
------------------------------
Ext4 file system have one directory per partition under /proc/fs/ext4/
# ls /proc/fs/ext4/hdc/
group_prealloc max_to_scan mb_groups mb_history min_to_scan order2_req
stats stream_req
mb_groups:
This file gives the details of mutiblock allocator buddy cache of free blocks
mb_history:
Multiblock allocation history.
stats:
This file indicate whether the multiblock allocator should start collecting
statistics. The statistics are shown during unmount
group_prealloc:
The multiblock allocator normalize the block allocation request to
group_prealloc filesystem blocks if we don't have strip value set.
The stripe value can be specified at mount time or during mke2fs.
max_to_scan:
How long multiblock allocator can look for a best extent (in found extents)
min_to_scan:
How long multiblock allocator must look for a best extent
order2_req:
Multiblock allocator use 2^N search using buddies only for requests greater
than or equal to order2_req. The request size is specfied in file system
blocks. A value of 2 indicate only if the requests are greater than or equal
to 4 blocks.
stream_req:
Files smaller than stream_req are served by the stream allocator, whose
purpose is to pack requests as close each to other as possible to
produce smooth I/O traffic. Avalue of 16 indicate that file smaller than 16
filesystem block size will use group based preallocation.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
Summary Summary

View file

@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
ext4_jbd2.o migrate.o ext4_jbd2.o migrate.o mballoc.o
ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o

View file

@ -577,6 +577,8 @@ void ext4_discard_reservation(struct inode *inode)
struct ext4_reserve_window_node *rsv; struct ext4_reserve_window_node *rsv;
spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock; spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
ext4_mb_discard_inode_preallocations(inode);
if (!block_i) if (!block_i)
return; return;
@ -785,19 +787,29 @@ error_return:
* @inode: inode * @inode: inode
* @block: start physical block to free * @block: start physical block to free
* @count: number of blocks to count * @count: number of blocks to count
* @metadata: Are these metadata blocks
*/ */
void ext4_free_blocks(handle_t *handle, struct inode *inode, void ext4_free_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count) ext4_fsblk_t block, unsigned long count,
int metadata)
{ {
struct super_block * sb; struct super_block * sb;
unsigned long dquot_freed_blocks; unsigned long dquot_freed_blocks;
/* this isn't the right place to decide whether block is metadata
* inode.c/extents.c knows better, but for safety ... */
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
ext4_should_journal_data(inode))
metadata = 1;
sb = inode->i_sb; sb = inode->i_sb;
if (!sb) {
printk ("ext4_free_blocks: nonexistent device"); if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
return; ext4_free_blocks_sb(handle, sb, block, count,
} &dquot_freed_blocks);
ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); else
ext4_mb_free_blocks(handle, inode, block, count,
metadata, &dquot_freed_blocks);
if (dquot_freed_blocks) if (dquot_freed_blocks)
DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
return; return;
@ -1576,7 +1588,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
} }
/** /**
* ext4_new_blocks() -- core block(s) allocation function * ext4_new_blocks_old() -- core block(s) allocation function
* @handle: handle to this transaction * @handle: handle to this transaction
* @inode: file inode * @inode: file inode
* @goal: given target block(filesystem wide) * @goal: given target block(filesystem wide)
@ -1589,7 +1601,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
* any specific goal block. * any specific goal block.
* *
*/ */
ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp) ext4_fsblk_t goal, unsigned long *count, int *errp)
{ {
struct buffer_head *bitmap_bh = NULL; struct buffer_head *bitmap_bh = NULL;
@ -1849,13 +1861,46 @@ out:
} }
ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode, ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, int *errp) ext4_fsblk_t goal, int *errp)
{ {
unsigned long count = 1; struct ext4_allocation_request ar;
ext4_fsblk_t ret;
return ext4_new_blocks(handle, inode, goal, &count, errp); if (!test_opt(inode->i_sb, MBALLOC)) {
unsigned long count = 1;
ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
return ret;
}
memset(&ar, 0, sizeof(ar));
ar.inode = inode;
ar.goal = goal;
ar.len = 1;
ret = ext4_mb_new_blocks(handle, &ar, errp);
return ret;
} }
ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
if (!test_opt(inode->i_sb, MBALLOC)) {
ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
return ret;
}
memset(&ar, 0, sizeof(ar));
ar.inode = inode;
ar.goal = goal;
ar.len = *count;
ret = ext4_mb_new_blocks(handle, &ar, errp);
*count = ar.len;
return ret;
}
/** /**
* ext4_count_free_blocks() -- count filesystem free blocks * ext4_count_free_blocks() -- count filesystem free blocks
* @sb: superblock * @sb: superblock

View file

@ -853,7 +853,7 @@ cleanup:
for (i = 0; i < depth; i++) { for (i = 0; i < depth; i++) {
if (!ablocks[i]) if (!ablocks[i])
continue; continue;
ext4_free_blocks(handle, inode, ablocks[i], 1); ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
} }
} }
kfree(ablocks); kfree(ablocks);
@ -1698,7 +1698,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
ext_debug("index is empty, remove it, free block %llu\n", leaf); ext_debug("index is empty, remove it, free block %llu\n", leaf);
bh = sb_find_get_block(inode->i_sb, leaf); bh = sb_find_get_block(inode->i_sb, leaf);
ext4_forget(handle, 1, inode, bh, leaf); ext4_forget(handle, 1, inode, bh, leaf);
ext4_free_blocks(handle, inode, leaf, 1); ext4_free_blocks(handle, inode, leaf, 1, 1);
return err; return err;
} }
@ -1759,8 +1759,10 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
{ {
struct buffer_head *bh; struct buffer_head *bh;
unsigned short ee_len = ext4_ext_get_actual_len(ex); unsigned short ee_len = ext4_ext_get_actual_len(ex);
int i; int i, metadata = 0;
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
metadata = 1;
#ifdef EXTENTS_STATS #ifdef EXTENTS_STATS
{ {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@ -1789,7 +1791,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
bh = sb_find_get_block(inode->i_sb, start + i); bh = sb_find_get_block(inode->i_sb, start + i);
ext4_forget(handle, 0, inode, bh, start + i); ext4_forget(handle, 0, inode, bh, start + i);
} }
ext4_free_blocks(handle, inode, start, num); ext4_free_blocks(handle, inode, start, num, metadata);
} else if (from == le32_to_cpu(ex->ee_block) } else if (from == le32_to_cpu(ex->ee_block)
&& to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@ -2287,6 +2289,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, newblock; ext4_fsblk_t goal, newblock;
int err = 0, depth, ret; int err = 0, depth, ret;
unsigned long allocated = 0; unsigned long allocated = 0;
struct ext4_allocation_request ar;
__clear_bit(BH_New, &bh_result->b_state); __clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%lu requested for inode %u\n", ext_debug("blocks %u/%lu requested for inode %u\n",
@ -2397,8 +2400,15 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info)) if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
ext4_init_block_alloc_info(inode); ext4_init_block_alloc_info(inode);
/* allocate new block */ /* find neighbour allocated blocks */
goal = ext4_ext_find_goal(inode, path, iblock); ar.lleft = iblock;
err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
if (err)
goto out2;
ar.lright = iblock;
err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
if (err)
goto out2;
/* /*
* See if request is beyond maximum number of blocks we can have in * See if request is beyond maximum number of blocks we can have in
@ -2421,7 +2431,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
allocated = le16_to_cpu(newex.ee_len); allocated = le16_to_cpu(newex.ee_len);
else else
allocated = max_blocks; allocated = max_blocks;
newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err);
/* allocate new block */
ar.inode = inode;
ar.goal = ext4_ext_find_goal(inode, path, iblock);
ar.logical = iblock;
ar.len = allocated;
if (S_ISREG(inode->i_mode))
ar.flags = EXT4_MB_HINT_DATA;
else
/* disable in-core preallocation for non-regular files */
ar.flags = 0;
newblock = ext4_mb_new_blocks(handle, &ar, &err);
if (!newblock) if (!newblock)
goto out2; goto out2;
ext_debug("allocate new block: goal %llu, found %llu/%lu\n", ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
@ -2429,14 +2450,17 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* try to insert new extent into found leaf and return */ /* try to insert new extent into found leaf and return */
ext4_ext_store_pblock(&newex, newblock); ext4_ext_store_pblock(&newex, newblock);
newex.ee_len = cpu_to_le16(allocated); newex.ee_len = cpu_to_le16(ar.len);
if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */
ext4_ext_mark_uninitialized(&newex); ext4_ext_mark_uninitialized(&newex);
err = ext4_ext_insert_extent(handle, inode, path, &newex); err = ext4_ext_insert_extent(handle, inode, path, &newex);
if (err) { if (err) {
/* free data blocks we just allocated */ /* free data blocks we just allocated */
/* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free() */
ext4_mb_discard_inode_preallocations(inode);
ext4_free_blocks(handle, inode, ext_pblock(&newex), ext4_free_blocks(handle, inode, ext_pblock(&newex),
le16_to_cpu(newex.ee_len)); le16_to_cpu(newex.ee_len), 0);
goto out2; goto out2;
} }
@ -2445,6 +2469,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* previous routine could use block we allocated */ /* previous routine could use block we allocated */
newblock = ext_pblock(&newex); newblock = ext_pblock(&newex);
allocated = le16_to_cpu(newex.ee_len);
outnew: outnew:
__set_bit(BH_New, &bh_result->b_state); __set_bit(BH_New, &bh_result->b_state);
@ -2496,6 +2521,8 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
down_write(&EXT4_I(inode)->i_data_sem); down_write(&EXT4_I(inode)->i_data_sem);
ext4_ext_invalidate_cache(inode); ext4_ext_invalidate_cache(inode);
ext4_mb_discard_inode_preallocations(inode);
/* /*
* TODO: optimization is possible here. * TODO: optimization is possible here.
* Probably we need not scan at all, * Probably we need not scan at all,

View file

@ -551,7 +551,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
return ret; return ret;
failed_out: failed_out:
for (i = 0; i <index; i++) for (i = 0; i <index; i++)
ext4_free_blocks(handle, inode, new_blocks[i], 1); ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
return ret; return ret;
} }
@ -650,9 +650,9 @@ failed:
ext4_journal_forget(handle, branch[i].bh); ext4_journal_forget(handle, branch[i].bh);
} }
for (i = 0; i <indirect_blks; i++) for (i = 0; i <indirect_blks; i++)
ext4_free_blocks(handle, inode, new_blocks[i], 1); ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
ext4_free_blocks(handle, inode, new_blocks[i], num); ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
return err; return err;
} }
@ -749,9 +749,10 @@ err_out:
for (i = 1; i <= num; i++) { for (i = 1; i <= num; i++) {
BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
ext4_journal_forget(handle, where[i].bh); ext4_journal_forget(handle, where[i].bh);
ext4_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); ext4_free_blocks(handle, inode,
le32_to_cpu(where[i-1].key), 1, 0);
} }
ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
return err; return err;
} }
@ -2052,7 +2053,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
} }
} }
ext4_free_blocks(handle, inode, block_to_free, count); ext4_free_blocks(handle, inode, block_to_free, count, 0);
} }
/** /**
@ -2225,7 +2226,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
ext4_journal_test_restart(handle, inode); ext4_journal_test_restart(handle, inode);
} }
ext4_free_blocks(handle, inode, nr, 1); ext4_free_blocks(handle, inode, nr, 1, 1);
if (parent_bh) { if (parent_bh) {
/* /*

4552
fs/ext4/mballoc.c Normal file

File diff suppressed because it is too large Load diff

View file

@ -236,10 +236,10 @@ static int free_dind_blocks(handle_t *handle,
for (i = 0; i < max_entries; i++) { for (i = 0; i < max_entries; i++) {
if (tmp_idata[i]) if (tmp_idata[i])
ext4_free_blocks(handle, inode, ext4_free_blocks(handle, inode,
le32_to_cpu(tmp_idata[i]), 1); le32_to_cpu(tmp_idata[i]), 1, 1);
} }
put_bh(bh); put_bh(bh);
ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1); ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
return 0; return 0;
} }
@ -267,7 +267,7 @@ static int free_tind_blocks(handle_t *handle,
} }
} }
put_bh(bh); put_bh(bh);
ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1); ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
return 0; return 0;
} }
@ -278,7 +278,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode)
if (ei->i_data[EXT4_IND_BLOCK]) if (ei->i_data[EXT4_IND_BLOCK])
ext4_free_blocks(handle, inode, ext4_free_blocks(handle, inode,
le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1); le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1, 1);
if (ei->i_data[EXT4_DIND_BLOCK]) { if (ei->i_data[EXT4_DIND_BLOCK]) {
retval = free_dind_blocks(handle, inode, retval = free_dind_blocks(handle, inode,
@ -365,7 +365,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
} }
} }
put_bh(bh); put_bh(bh);
ext4_free_blocks(handle, inode, block, 1); ext4_free_blocks(handle, inode, block, 1, 1);
return retval; return retval;
} }

View file

@ -503,6 +503,7 @@ static void ext4_put_super (struct super_block * sb)
struct ext4_super_block *es = sbi->s_es; struct ext4_super_block *es = sbi->s_es;
int i; int i;
ext4_mb_release(sb);
ext4_ext_release(sb); ext4_ext_release(sb);
ext4_xattr_put_super(sb); ext4_xattr_put_super(sb);
jbd2_journal_destroy(sbi->s_journal); jbd2_journal_destroy(sbi->s_journal);
@ -569,6 +570,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_block_alloc_info = NULL; ei->i_block_alloc_info = NULL;
ei->vfs_inode.i_version = 1; ei->vfs_inode.i_version = 1;
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
return &ei->vfs_inode; return &ei->vfs_inode;
} }
@ -881,6 +884,7 @@ enum {
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
Opt_mballoc, Opt_nomballoc, Opt_stripe,
}; };
static match_table_t tokens = { static match_table_t tokens = {
@ -935,6 +939,9 @@ static match_table_t tokens = {
{Opt_extents, "extents"}, {Opt_extents, "extents"},
{Opt_noextents, "noextents"}, {Opt_noextents, "noextents"},
{Opt_i_version, "i_version"}, {Opt_i_version, "i_version"},
{Opt_mballoc, "mballoc"},
{Opt_nomballoc, "nomballoc"},
{Opt_stripe, "stripe=%u"},
{Opt_err, NULL}, {Opt_err, NULL},
{Opt_resize, "resize"}, {Opt_resize, "resize"},
}; };
@ -1284,6 +1291,19 @@ clear_qf_name:
set_opt(sbi->s_mount_opt, I_VERSION); set_opt(sbi->s_mount_opt, I_VERSION);
sb->s_flags |= MS_I_VERSION; sb->s_flags |= MS_I_VERSION;
break; break;
case Opt_mballoc:
set_opt(sbi->s_mount_opt, MBALLOC);
break;
case Opt_nomballoc:
clear_opt(sbi->s_mount_opt, MBALLOC);
break;
case Opt_stripe:
if (match_int(&args[0], &option))
return 0;
if (option < 0)
return 0;
sbi->s_stripe = option;
break;
default: default:
printk (KERN_ERR printk (KERN_ERR
"EXT4-fs: Unrecognized mount option \"%s\" " "EXT4-fs: Unrecognized mount option \"%s\" "
@ -1742,6 +1762,34 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
return (has_super + ext4_group_first_block_no(sb, bg)); return (has_super + ext4_group_first_block_no(sb, bg));
} }
/**
* ext4_get_stripe_size: Get the stripe size.
* @sbi: In memory super block info
*
* If we have specified it via mount option, then
* use the mount option value. If the value specified at mount time is
* greater than the blocks per group use the super block value.
* If the super block value is greater than blocks per group return 0.
* Allocator needs it be less than blocks per group.
*
*/
static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
{
unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
unsigned long stripe_width =
le32_to_cpu(sbi->s_es->s_raid_stripe_width);
if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
return sbi->s_stripe;
if (stripe_width <= sbi->s_blocks_per_group)
return stripe_width;
if (stride <= sbi->s_blocks_per_group)
return stride;
return 0;
}
static int ext4_fill_super (struct super_block *sb, void *data, int silent) static int ext4_fill_super (struct super_block *sb, void *data, int silent)
__releases(kernel_sem) __releases(kernel_sem)
@ -2091,6 +2139,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_rsv_window_head.rsv_goal_size = 0; sbi->s_rsv_window_head.rsv_goal_size = 0;
ext4_rsv_window_add(sb, &sbi->s_rsv_window_head); ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
sbi->s_stripe = ext4_get_stripe_size(sbi);
/* /*
* set up enough so that it can read an inode * set up enough so that it can read an inode
*/ */
@ -2250,6 +2300,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
"writeback"); "writeback");
ext4_ext_init(sb); ext4_ext_init(sb);
ext4_mb_init(sb, needs_recovery);
lock_kernel(); lock_kernel();
return 0; return 0;
@ -3232,9 +3283,15 @@ static struct file_system_type ext4dev_fs_type = {
static int __init init_ext4_fs(void) static int __init init_ext4_fs(void)
{ {
int err = init_ext4_xattr(); int err;
err = init_ext4_mballoc();
if (err) if (err)
return err; return err;
err = init_ext4_xattr();
if (err)
goto out2;
err = init_inodecache(); err = init_inodecache();
if (err) if (err)
goto out1; goto out1;
@ -3246,6 +3303,8 @@ out:
destroy_inodecache(); destroy_inodecache();
out1: out1:
exit_ext4_xattr(); exit_ext4_xattr();
out2:
exit_ext4_mballoc();
return err; return err;
} }
@ -3254,6 +3313,7 @@ static void __exit exit_ext4_fs(void)
unregister_filesystem(&ext4dev_fs_type); unregister_filesystem(&ext4dev_fs_type);
destroy_inodecache(); destroy_inodecache();
exit_ext4_xattr(); exit_ext4_xattr();
exit_ext4_mballoc();
} }
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");

View file

@ -480,7 +480,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
ea_bdebug(bh, "refcount now=0; freeing"); ea_bdebug(bh, "refcount now=0; freeing");
if (ce) if (ce)
mb_cache_entry_free(ce); mb_cache_entry_free(ce);
ext4_free_blocks(handle, inode, bh->b_blocknr, 1); ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
get_bh(bh); get_bh(bh);
ext4_forget(handle, 1, inode, bh, bh->b_blocknr); ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
} else { } else {
@ -821,7 +821,7 @@ inserted:
new_bh = sb_getblk(sb, block); new_bh = sb_getblk(sb, block);
if (!new_bh) { if (!new_bh) {
getblk_failed: getblk_failed:
ext4_free_blocks(handle, inode, block, 1); ext4_free_blocks(handle, inode, block, 1, 1);
error = -EIO; error = -EIO;
goto cleanup; goto cleanup;
} }

View file

@ -20,6 +20,8 @@
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/magic.h> #include <linux/magic.h>
#include <linux/ext4_fs_i.h>
/* /*
* The second extended filesystem constants/structures * The second extended filesystem constants/structures
*/ */
@ -51,6 +53,50 @@
#define ext4_debug(f, a...) do {} while (0) #define ext4_debug(f, a...) do {} while (0)
#endif #endif
#define EXT4_MULTIBLOCK_ALLOCATOR 1
/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE 1
/* blocks already reserved */
#define EXT4_MB_HINT_RESERVED 2
/* metadata is being allocated */
#define EXT4_MB_HINT_METADATA 4
/* first blocks in the file */
#define EXT4_MB_HINT_FIRST 8
/* search for the best chunk */
#define EXT4_MB_HINT_BEST 16
/* data is being allocated */
#define EXT4_MB_HINT_DATA 32
/* don't preallocate (for tails) */
#define EXT4_MB_HINT_NOPREALLOC 64
/* allocate for locality group */
#define EXT4_MB_HINT_GROUP_ALLOC 128
/* allocate goal blocks or none */
#define EXT4_MB_HINT_GOAL_ONLY 256
/* goal is meaningful */
#define EXT4_MB_HINT_TRY_GOAL 512
struct ext4_allocation_request {
/* target inode for block we're allocating */
struct inode *inode;
/* logical block in target inode */
ext4_lblk_t logical;
/* phys. target (a hint) */
ext4_fsblk_t goal;
/* the closest logical allocated block to the left */
ext4_lblk_t lleft;
/* phys. block for ^^^ */
ext4_fsblk_t pleft;
/* the closest logical allocated block to the right */
ext4_lblk_t lright;
/* phys. block for ^^^ */
ext4_fsblk_t pright;
/* how many blocks we want to allocate */
unsigned long len;
/* flags. see above EXT4_MB_HINT_* */
unsigned long flags;
};
/* /*
* Special inodes numbers * Special inodes numbers
*/ */
@ -474,6 +520,7 @@ do { \
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H #ifndef _LINUX_EXT2_FS_H
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
@ -912,7 +959,7 @@ extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp); ext4_fsblk_t goal, unsigned long *count, int *errp);
extern void ext4_free_blocks (handle_t *handle, struct inode *inode, extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count); ext4_fsblk_t block, unsigned long count, int metadata);
extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count, ext4_fsblk_t block, unsigned long count,
unsigned long *pdquot_freed_blocks); unsigned long *pdquot_freed_blocks);
@ -950,6 +997,20 @@ extern unsigned long ext4_count_dirs (struct super_block *);
extern void ext4_check_inodes_bitmap (struct super_block *); extern void ext4_check_inodes_bitmap (struct super_block *);
extern unsigned long ext4_count_free (struct buffer_head *, unsigned); extern unsigned long ext4_count_free (struct buffer_head *, unsigned);
/* mballoc.c */
extern long ext4_mb_stats;
extern long ext4_mb_max_to_scan;
extern int ext4_mb_init(struct super_block *, int);
extern int ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
struct ext4_allocation_request *, int *);
extern int ext4_mb_reserve_blocks(struct super_block *, int);
extern void ext4_mb_discard_inode_preallocations(struct inode *);
extern int __init init_ext4_mballoc(void);
extern void exit_ext4_mballoc(void);
extern void ext4_mb_free_blocks(handle_t *, struct inode *,
unsigned long, unsigned long, int, unsigned long *);
/* inode.c */ /* inode.c */
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
@ -1080,6 +1141,19 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
raw_inode->i_size_high = cpu_to_le32(i_size >> 32); raw_inode->i_size_high = cpu_to_le32(i_size >> 32);
} }
static inline
struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
ext4_group_t group)
{
struct ext4_group_info ***grp_info;
long indexv, indexh;
grp_info = EXT4_SB(sb)->s_group_info;
indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
return grp_info[indexv][indexh];
}
#define ext4_std_error(sb, errno) \ #define ext4_std_error(sb, errno) \
do { \ do { \
if ((errno)) \ if ((errno)) \

View file

@ -158,6 +158,10 @@ struct ext4_inode_info {
* struct timespec i_{a,c,m}time in the generic inode. * struct timespec i_{a,c,m}time in the generic inode.
*/ */
struct timespec i_crtime; struct timespec i_crtime;
/* mballoc */
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
}; };
#endif /* _LINUX_EXT4_FS_I */ #endif /* _LINUX_EXT4_FS_I */

View file

@ -91,6 +91,58 @@ struct ext4_sb_info {
unsigned long s_ext_blocks; unsigned long s_ext_blocks;
unsigned long s_ext_extents; unsigned long s_ext_extents;
#endif #endif
/* for buddy allocator */
struct ext4_group_info ***s_group_info;
struct inode *s_buddy_cache;
long s_blocks_reserved;
spinlock_t s_reserve_lock;
struct list_head s_active_transaction;
struct list_head s_closed_transaction;
struct list_head s_committed_transaction;
spinlock_t s_md_lock;
tid_t s_last_transaction;
unsigned short *s_mb_offsets, *s_mb_maxs;
/* tunables */
unsigned long s_stripe;
unsigned long s_mb_stream_request;
unsigned long s_mb_max_to_scan;
unsigned long s_mb_min_to_scan;
unsigned long s_mb_stats;
unsigned long s_mb_order2_reqs;
unsigned long s_mb_group_prealloc;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
/* history to debug policy */
struct ext4_mb_history *s_mb_history;
int s_mb_history_cur;
int s_mb_history_max;
int s_mb_history_num;
struct proc_dir_entry *s_mb_proc;
spinlock_t s_mb_history_lock;
int s_mb_history_filter;
/* stats for buddy allocator */
spinlock_t s_mb_pa_lock;
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
atomic_t s_bal_success; /* we found long enough chunks */
atomic_t s_bal_allocated; /* in blocks */
atomic_t s_bal_ex_scanned; /* total extents scanned */
atomic_t s_bal_goals; /* goal hits */
atomic_t s_bal_breaks; /* too long searches */
atomic_t s_bal_2orders; /* 2^order hits */
spinlock_t s_bal_lock;
unsigned long s_mb_buddies_generated;
unsigned long long s_mb_generation_time;
atomic_t s_mb_lost_chunks;
atomic_t s_mb_preallocated;
atomic_t s_mb_discarded;
/* locality groups */
struct ext4_locality_group *s_locality_groups;
}; };
#endif /* _LINUX_EXT4_FS_SB */ #endif /* _LINUX_EXT4_FS_SB */