From 74d553aad7926ed05e05d9d5cff516a7b31375fc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 3 Apr 2013 12:39:17 -0400 Subject: [PATCH 01/50] ext4: collapse handling of data=ordered and data=writeback codepaths The only difference between how we handle data=ordered and data=writeback is a single call to ext4_jbd2_file_inode(). Eliminate code duplication by factoring out redundant the code paths. Signed-off-by: "Theodore Ts'o" Reviewed-by: Lukas Czerner --- fs/ext4/ext4.h | 1 + fs/ext4/inode.c | 125 +++++++++--------------------------- include/trace/events/ext4.h | 10 +-- 3 files changed, 31 insertions(+), 105 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3b83cd604796..f91e11bd9753 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1374,6 +1374,7 @@ enum { EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ + EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b3a5213bc73e..4ee69270a48a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1145,77 +1145,36 @@ static int ext4_generic_write_end(struct file *file, * ext4 never places buffers on inode->i_mapping->private_list. metadata * buffers are managed internally. */ -static int ext4_ordered_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +static int ext4_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; int ret = 0, ret2; - trace_ext4_ordered_write_end(inode, pos, len, copied); - ret = ext4_jbd2_file_inode(handle, inode); - - if (ret == 0) { - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; - if (pos + len > inode->i_size && ext4_can_truncate(inode)) - /* if we have allocated more blocks and copied - * less. We will have blocks allocated outside - * inode->i_size. So truncate them - */ - ext4_orphan_add(handle, inode); - if (ret2 < 0) - ret = ret2; - } else { - unlock_page(page); - page_cache_release(page); + trace_ext4_write_end(inode, pos, len, copied); + if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { + ret = ext4_jbd2_file_inode(handle, inode); + if (ret) { + unlock_page(page); + page_cache_release(page); + goto errout; + } } - ret2 = ext4_journal_stop(handle); - if (!ret) - ret = ret2; - - if (pos + len > inode->i_size) { - ext4_truncate_failed_write(inode); - /* - * If truncate failed early the inode might still be - * on the orphan list; we need to make sure the inode - * is removed from the orphan list in that case. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } - - - return ret ? ret : copied; -} - -static int ext4_writeback_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = mapping->host; - int ret = 0, ret2; - - trace_ext4_writeback_write_end(inode, pos, len, copied); - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; + copied = ext4_generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + if (copied < 0) + ret = copied; if (pos + len > inode->i_size && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); - - if (ret2 < 0) - ret = ret2; - +errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -2818,18 +2777,9 @@ static int ext4_da_write_end(struct file *file, unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; - if (write_mode == FALL_BACK_TO_NONDELALLOC) { - switch (ext4_inode_journal_mode(inode)) { - case EXT4_INODE_ORDERED_DATA_MODE: - return ext4_ordered_write_end(file, mapping, pos, - len, copied, page, fsdata); - case EXT4_INODE_WRITEBACK_DATA_MODE: - return ext4_writeback_write_end(file, mapping, pos, - len, copied, page, fsdata); - default: - BUG(); - } - } + if (write_mode == FALL_BACK_TO_NONDELALLOC) + return ext4_write_end(file, mapping, pos, + len, copied, page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); start = pos & (PAGE_CACHE_SIZE - 1); @@ -3334,27 +3284,12 @@ static int ext4_journalled_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } -static const struct address_space_operations ext4_ordered_aops = { +static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, .writepage = ext4_writepage, .write_begin = ext4_write_begin, - .write_end = ext4_ordered_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, -}; - -static const struct address_space_operations ext4_writeback_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_writepage, - .write_begin = ext4_write_begin, - .write_end = ext4_writeback_write_end, + .write_end = ext4_write_end, .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, @@ -3399,23 +3334,21 @@ void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { case EXT4_INODE_ORDERED_DATA_MODE: - if (test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else - inode->i_mapping->a_ops = &ext4_ordered_aops; + ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE); break; case EXT4_INODE_WRITEBACK_DATA_MODE: - if (test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else - inode->i_mapping->a_ops = &ext4_writeback_aops; + ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE); break; case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; - break; + return; default: BUG(); } + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_aops; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 4ee471003859..58459b785565 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -257,15 +257,7 @@ DECLARE_EVENT_CLASS(ext4__write_end, __entry->pos, __entry->len, __entry->copied) ); -DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end, - - TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, - unsigned int copied), - - TP_ARGS(inode, pos, len, copied) -); - -DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end, +DEFINE_EVENT(ext4__write_end, ext4_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), From eed4333f087b45e325d5b9ab29665d82ae9cab88 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Wed, 3 Apr 2013 12:41:17 -0400 Subject: [PATCH 02/50] ext4: fold ext4_generic_write_end() into ext4_write_end() After collapsing the handling of data ordered and data writeback codepath, ext4_generic_write_end() has only one caller, ext4_write_end(). So we fold it into ext4_write_end(). Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Reviewed-by: Lukas Czerner --- fs/ext4/inode.c | 67 ++++++++++++++++++++----------------------------- 1 file changed, 27 insertions(+), 40 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4ee69270a48a..a4ffb470fbf3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1087,14 +1087,32 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) return ext4_handle_dirty_metadata(handle, NULL, bh); } -static int ext4_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from page_symlink(). + * + * ext4 never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. + */ +static int ext4_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { - int i_size_changed = 0; - struct inode *inode = mapping->host; handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + int i_size_changed = 0; + + trace_ext4_write_end(inode, pos, len, copied); + if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { + ret = ext4_jbd2_file_inode(handle, inode); + if (ret) { + unlock_page(page); + page_cache_release(page); + goto errout; + } + } if (ext4_has_inline_data(inode)) copied = ext4_write_inline_data_end(inode, pos, len, @@ -1105,7 +1123,7 @@ static int ext4_generic_write_end(struct file *file, /* * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. + * cannot change under us because we hole i_mutex. * * But it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1115,10 +1133,10 @@ static int ext4_generic_write_end(struct file *file, i_size_changed = 1; } - if (pos + copied > EXT4_I(inode)->i_disksize) { + if (pos + copied > EXT4_I(inode)->i_disksize) { /* We need to mark inode dirty even if * new_i_size is less that inode->i_size - * bu greater than i_disksize.(hint delalloc) + * but greater than i_disksize. (hint delalloc) */ ext4_update_i_disksize(inode, (pos + copied)); i_size_changed = 1; @@ -1135,37 +1153,6 @@ static int ext4_generic_write_end(struct file *file, if (i_size_changed) ext4_mark_inode_dirty(handle, inode); - return copied; -} - -/* - * We need to pick up the new inode size which generic_commit_write gave us - * `file' can be NULL - eg, when called from page_symlink(). - * - * ext4 never places buffers on inode->i_mapping->private_list. metadata - * buffers are managed internally. - */ -static int ext4_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = mapping->host; - int ret = 0, ret2; - - trace_ext4_write_end(inode, pos, len, copied); - if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { - ret = ext4_jbd2_file_inode(handle, inode); - if (ret) { - unlock_page(page); - page_cache_release(page); - goto errout; - } - } - - copied = ext4_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); if (copied < 0) ret = copied; if (pos + len > inode->i_size && ext4_can_truncate(inode)) From 781f143ea0fd7981ebe2e8cd96114997c8cf6c07 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 3 Apr 2013 12:43:17 -0400 Subject: [PATCH 03/50] ext4: fold ext4_alloc_blocks() in ext4_alloc_branch() The older code was far more complicated than it needed to be because of how we spliced in the ext4's new multiblock allocator into ext3's indirect block code. By folding ext4_alloc_blocks() into ext4_alloc_branch(), we make the code far more understable, shave off over 130 lines of code and half a kilobyte of compiled object code. Signed-off-by: "Theodore Ts'o" --- fs/ext4/indirect.c | 227 +++++++++------------------------------------ 1 file changed, 46 insertions(+), 181 deletions(-) diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index a04183127ef0..c0f9e4699f0b 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -291,131 +291,6 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, return count; } -/** - * ext4_alloc_blocks: multiple allocate blocks needed for a branch - * @handle: handle for this transaction - * @inode: inode which needs allocated blocks - * @iblock: the logical block to start allocated at - * @goal: preferred physical block of allocation - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * @blks: number of desired blocks - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, - * @err: on return it will store the error code - * - * This function will return the number of blocks allocated as - * requested by the passed-in parameters. - */ -static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) -{ - struct ext4_allocation_request ar; - int target, i; - unsigned long count = 0, blk_allocated = 0; - int index = 0; - ext4_fsblk_t current_block = 0; - int ret = 0; - - /* - * Here we try to allocate the requested multiple blocks at once, - * on a best-effort basis. - * To build a branch, we should allocate blocks for - * the indirect blocks(if not allocated yet), and at least - * the first direct block of this branch. That's the - * minimum number of blocks need to allocate(required) - */ - /* first we try to allocate the indirect blocks */ - target = indirect_blks; - while (target > 0) { - count = target; - /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_meta_blocks(handle, inode, goal, - 0, &count, err); - if (*err) - goto failed_out; - - if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + count %lu > %d!", - current_block, count, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - target -= count; - /* allocate blocks for indirect blocks */ - while (index < indirect_blks && count) { - new_blocks[index++] = current_block++; - count--; - } - if (count > 0) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - WARN(1, KERN_INFO "%s returned more blocks than " - "requested\n", __func__); - break; - } - } - - target = blks - count ; - blk_allocated = count; - if (!target) - goto allocated; - /* Now allocate data blocks */ - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.goal = goal; - ar.len = target; - ar.logical = iblock; - if (S_ISREG(inode->i_mode)) - /* enable in-core preallocation only for regular files */ - ar.flags = EXT4_MB_HINT_DATA; - - current_block = ext4_mb_new_blocks(handle, &ar, err); - if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + ar.len %d > %d!", - current_block, ar.len, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - if (*err && (target == blks)) { - /* - * if the allocation failed and we didn't allocate - * any blocks before - */ - goto failed_out; - } - if (!*err) { - if (target == blks) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - } - blk_allocated += ar.len; - } -allocated: - /* total number of blocks allocated for direct blocks */ - ret = blk_allocated; - *err = 0; - return ret; -failed_out: - for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - return ret; -} - /** * ext4_alloc_branch - allocate and set up a chain of blocks. * @handle: handle for this transaction @@ -448,60 +323,59 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, int *blks, ext4_fsblk_t goal, ext4_lblk_t *offsets, Indirect *branch) { - int blocksize = inode->i_sb->s_blocksize; - int i, n = 0; - int err = 0; - struct buffer_head *bh; - int num; - ext4_fsblk_t new_blocks[4]; - ext4_fsblk_t current_block; + struct ext4_allocation_request ar; + struct buffer_head * bh; + ext4_fsblk_t b, new_blocks[4]; + __le32 *p; + int i, j, err, len = 1; - num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, - *blks, new_blocks, &err); - if (err) - return err; - - branch[0].key = cpu_to_le32(new_blocks[0]); /* - * metadata blocks and data blocks are allocated. + * Set up for the direct block allocation */ - for (n = 1; n <= indirect_blks; n++) { - /* - * Get buffer_head for parent block, zero it out - * and set the pointer to new one, then send - * parent to disk. - */ - bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.len = *blks; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + + for (i = 0; i <= indirect_blks; i++) { + if (i == indirect_blks) { + ar.goal = goal; + new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err); + } else + goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, + goal, 0, NULL, &err); + if (err) { + i--; + goto failed; + } + branch[i].key = cpu_to_le32(new_blocks[i]); + if (i == 0) + continue; + + bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); if (unlikely(!bh)) { err = -ENOMEM; goto failed; } - - branch[n].bh = bh; lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, bh); if (err) { - /* Don't brelse(bh) here; it's done in - * ext4_journal_forget() below */ unlock_buffer(bh); goto failed; } - memset(bh->b_data, 0, blocksize); - branch[n].p = (__le32 *) bh->b_data + offsets[n]; - branch[n].key = cpu_to_le32(new_blocks[n]); - *branch[n].p = branch[n].key; - if (n == indirect_blks) { - current_block = new_blocks[n]; - /* - * End of chain, update the last new metablock of - * the chain to point to the new allocated - * data blocks numbers - */ - for (i = 1; i < num; i++) - *(branch[n].p + i) = cpu_to_le32(++current_block); - } + memset(bh->b_data, 0, bh->b_size); + p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; + b = new_blocks[i]; + + if (i == indirect_blks) + len = ar.len; + for (j = 0; j < len; j++) + *p++ = cpu_to_le32(b++); + BUFFER_TRACE(bh, "marking uptodate"); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -511,25 +385,16 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, if (err) goto failed; } - *blks = num; - return err; + *blks = ar.len; + return 0; failed: - /* Allocation failed, free what we already allocated */ - ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); - for (i = 1; i <= n ; i++) { - /* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. - */ - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, - EXT4_FREE_BLOCKS_FORGET); + for (; i >= 0; i--) { + if (i != indirect_blks && branch[i].bh) + ext4_forget(handle, 1, inode, branch[i].bh, + branch[i].bh->b_blocknr); + ext4_free_blocks(handle, inode, NULL, new_blocks[i], + (i == indirect_blks) ? ar.len : 1, 0); } - for (i = n+1; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - - ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); - return err; } From 26a4c0c6ccecf6814cf44f951c97222bd795bc1a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 3 Apr 2013 12:45:17 -0400 Subject: [PATCH 04/50] ext4: refactor punch hole code Move common code in ext4_ind_punch_hole() and ext4_ext_punch_hole() into ext4_punch_hole(). This saves over 150 lines of code. This also fixes a potential bug when the punch_hole() code is racing against indirect-to-extents or extents-to-indirect migation. We are currently using i_mutex to protect against changes to the inode flag; specifically, the append-only, immutable, and extents inode flags. So we need to take i_mutex before deciding whether to use the extents-specific or indirect-specific punch_hole code. Also, there was a missing call to ext4_inode_block_unlocked_dio() in the indirect punch codepath. This was added in commit 02d262dffcf4c to block DIO readers racing against the punch operation in the codepath for extent-mapped inodes, but it was missing for indirect-block mapped inodes. One of the advantages of refactoring the code is that it makes such oversights much less likely. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 7 +- fs/ext4/extents.c | 185 +-------------------------------------------- fs/ext4/indirect.c | 158 +------------------------------------- fs/ext4/inode.c | 180 +++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 183 insertions(+), 347 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f91e11bd9753..0649253804c4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2110,7 +2110,8 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); extern void ext4_ind_truncate(struct inode *inode); -extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length); +extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t first, ext4_lblk_t stop); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); @@ -2575,8 +2576,8 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern void ext4_ext_truncate(struct inode *); -extern int ext4_ext_punch_hole(struct file *file, loff_t offset, - loff_t length); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct file *file, int mode, loff_t offset, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9c6d06dcef8b..d58365e40df7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2599,8 +2599,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t end) +int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); @@ -4623,187 +4623,6 @@ static int ext4_xattr_fiemap(struct inode *inode, return (error < 0 ? error : 0); } -/* - * ext4_ext_punch_hole - * - * Punches a hole of "length" bytes in a file starting - * at byte "offset" - * - * @inode: The inode of the file to punch a hole in - * @offset: The starting byte offset of the hole - * @length: The length of the hole - * - * Returns the number of blocks removed or negative on err - */ -int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) -{ - struct inode *inode = file_inode(file); - struct super_block *sb = inode->i_sb; - ext4_lblk_t first_block, stop_block; - struct address_space *mapping = inode->i_mapping; - handle_t *handle; - loff_t first_page, last_page, page_len; - loff_t first_page_offset, last_page_offset; - int credits, err = 0; - - /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - err = filemap_write_and_wait_range(mapping, - offset, offset + length - 1); - - if (err) - return err; - } - - mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - err = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { - err = -ETXTBSY; - goto out_mutex; - } - - /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) - goto out_mutex; - - /* - * If the hole extends beyond i_size, set the hole - * to end after the page that contains i_size - */ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - - offset; - } - - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - last_page = (offset + length) >> PAGE_CACHE_SHIFT; - - first_page_offset = first_page << PAGE_CACHE_SHIFT; - last_page_offset = last_page << PAGE_CACHE_SHIFT; - - /* Now release the pages */ - if (last_page_offset > first_page_offset) { - truncate_pagecache_range(inode, first_page_offset, - last_page_offset - 1); - } - - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - err = ext4_flush_unwritten_io(inode); - if (err) - goto out_dio; - inode_dio_wait(inode); - - credits = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto out_dio; - } - - - /* - * Now we need to zero out the non-page-aligned data in the - * pages at the start and tail of the hole, and unmap the buffer - * heads for the block aligned regions of the page that were - * completely zeroed. - */ - if (first_page > last_page) { - /* - * If the file space being truncated is contained within a page - * just zero out and unmap the middle of that page - */ - err = ext4_discard_partial_page_buffers(handle, - mapping, offset, length, 0); - - if (err) - goto out; - } else { - /* - * zero out and unmap the partial page that contains - * the start of the hole - */ - page_len = first_page_offset - offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - offset, page_len, 0); - if (err) - goto out; - } - - /* - * zero out and unmap the partial page that contains - * the end of the hole - */ - page_len = offset + length - last_page_offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - last_page_offset, page_len, 0); - if (err) - goto out; - } - } - - /* - * If i_size is contained in the last page, we need to - * unmap and zero the partial page after i_size - */ - if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && - inode->i_size % PAGE_CACHE_SIZE != 0) { - - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out; - } - } - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - - /* If there are no blocks to remove, return now */ - if (first_block >= stop_block) - goto out; - - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); - - err = ext4_es_remove_extent(inode, first_block, - stop_block - first_block); - err = ext4_ext_remove_space(inode, first_block, stop_block - 1); - - ext4_discard_preallocations(inode); - - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - up_write(&EXT4_I(inode)->i_data_sem); - -out: - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); -out_dio: - ext4_inode_resume_unlocked_dio(inode); -out_mutex: - mutex_unlock(&inode->i_mutex); - return err; -} - int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index c0f9e4699f0b..d8846779f4ea 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1434,8 +1434,8 @@ err: return ret; } -static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t first, ext4_lblk_t stop) +int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t first, ext4_lblk_t stop) { int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); int level, ret = 0; @@ -1469,157 +1469,3 @@ err: return ret; } -int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length) -{ - struct inode *inode = file_inode(file); - struct super_block *sb = inode->i_sb; - ext4_lblk_t first_block, stop_block; - struct address_space *mapping = inode->i_mapping; - handle_t *handle = NULL; - loff_t first_page, last_page, page_len; - loff_t first_page_offset, last_page_offset; - int err = 0; - - /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - err = filemap_write_and_wait_range(mapping, - offset, offset + length - 1); - if (err) - return err; - } - - mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - err = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { - err = -ETXTBSY; - goto out_mutex; - } - - /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) - goto out_mutex; - - /* - * If the hole extents beyond i_size, set the hole - * to end after the page that contains i_size - */ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - - offset; - } - - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - last_page = (offset + length) >> PAGE_CACHE_SHIFT; - - first_page_offset = first_page << PAGE_CACHE_SHIFT; - last_page_offset = last_page << PAGE_CACHE_SHIFT; - - /* Now release the pages */ - if (last_page_offset > first_page_offset) { - truncate_pagecache_range(inode, first_page_offset, - last_page_offset - 1); - } - - /* Wait all existing dio works, newcomers will block on i_mutex */ - inode_dio_wait(inode); - - handle = start_transaction(inode); - if (IS_ERR(handle)) - goto out_mutex; - - /* - * Now we need to zero out the non-page-aligned data in the - * pages at the start and tail of the hole, and unmap the buffer - * heads for the block aligned regions of the page that were - * completely zerod. - */ - if (first_page > last_page) { - /* - * If the file space being truncated is contained within a page - * just zero out and unmap the middle of that page - */ - err = ext4_discard_partial_page_buffers(handle, - mapping, offset, length, 0); - if (err) - goto out; - } else { - /* - * Zero out and unmap the paritial page that contains - * the start of the hole - */ - page_len = first_page_offset - offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - offset, page_len, 0); - if (err) - goto out; - } - - /* - * Zero out and unmap the partial page that contains - * the end of the hole - */ - page_len = offset + length - last_page_offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - last_page_offset, page_len, 0); - if (err) - goto out; - } - } - - /* - * If i_size contained in the last page, we need to - * unmap and zero the paritial page after i_size - */ - if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && - inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - if (err) - goto out; - } - } - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - - if (first_block >= stop_block) - goto out; - - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); - - err = ext4_es_remove_extent(inode, first_block, - stop_block - first_block); - err = ext4_free_hole_blocks(handle, inode, first_block, stop_block); - - ext4_discard_preallocations(inode); - - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - up_write(&EXT4_I(inode)->i_data_sem); - -out: - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); - -out_mutex: - mutex_unlock(&inode->i_mutex); - - return err; -} diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a4ffb470fbf3..9bda50aa34e2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3566,20 +3566,190 @@ int ext4_can_truncate(struct inode *inode) int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + ext4_lblk_t first_block, stop_block; + struct address_space *mapping = inode->i_mapping; + loff_t first_page, last_page, page_len; + loff_t first_page_offset, last_page_offset; + handle_t *handle; + unsigned int credits; + int ret = 0; + if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - return ext4_ind_punch_hole(file, offset, length); - - if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { + if (EXT4_SB(sb)->s_cluster_ratio > 1) { /* TODO: Add support for bigalloc file systems */ return -EOPNOTSUPP; } trace_ext4_punch_hole(inode, offset, length); - return ext4_ext_punch_hole(file, offset, length); + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + ret = filemap_write_and_wait_range(mapping, offset, + offset + length - 1); + if (ret) + return ret; + } + + mutex_lock(&inode->i_mutex); + /* It's not possible punch hole on append only file */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + ret = -EPERM; + goto out_mutex; + } + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + goto out_mutex; + } + + /* No need to punch hole beyond i_size */ + if (offset >= inode->i_size) + goto out_mutex; + + /* + * If the hole extends beyond i_size, set the hole + * to end after the page that contains i_size + */ + if (offset + length > inode->i_size) { + length = inode->i_size + + PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - + offset; + } + + first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + last_page = (offset + length) >> PAGE_CACHE_SHIFT; + + first_page_offset = first_page << PAGE_CACHE_SHIFT; + last_page_offset = last_page << PAGE_CACHE_SHIFT; + + /* Now release the pages */ + if (last_page_offset > first_page_offset) { + truncate_pagecache_range(inode, first_page_offset, + last_page_offset - 1); + } + + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + ret = ext4_flush_unwritten_io(inode); + if (ret) + goto out_dio; + inode_dio_wait(inode); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + credits = ext4_writepage_trans_blocks(inode); + else + credits = ext4_blocks_for_truncate(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(sb, ret); + goto out_dio; + } + + /* + * Now we need to zero out the non-page-aligned data in the + * pages at the start and tail of the hole, and unmap the + * buffer heads for the block aligned regions of the page that + * were completely zeroed. + */ + if (first_page > last_page) { + /* + * If the file space being truncated is contained + * within a page just zero out and unmap the middle of + * that page + */ + ret = ext4_discard_partial_page_buffers(handle, + mapping, offset, length, 0); + + if (ret) + goto out_stop; + } else { + /* + * zero out and unmap the partial page that contains + * the start of the hole + */ + page_len = first_page_offset - offset; + if (page_len > 0) { + ret = ext4_discard_partial_page_buffers(handle, mapping, + offset, page_len, 0); + if (ret) + goto out_stop; + } + + /* + * zero out and unmap the partial page that contains + * the end of the hole + */ + page_len = offset + length - last_page_offset; + if (page_len > 0) { + ret = ext4_discard_partial_page_buffers(handle, mapping, + last_page_offset, page_len, 0); + if (ret) + goto out_stop; + } + } + + /* + * If i_size is contained in the last page, we need to + * unmap and zero the partial page after i_size + */ + if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && + inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + if (page_len > 0) { + ret = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (ret) + goto out_stop; + } + } + + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + + /* If there are no blocks to remove, return now */ + if (first_block >= stop_block) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ret = ext4_es_remove_extent(inode, first_block, + stop_block - first_block); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_remove_space(inode, first_block, + stop_block - 1); + else + ret = ext4_free_hole_blocks(handle, inode, first_block, + stop_block); + + ext4_discard_preallocations(inode); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + up_write(&EXT4_I(inode)->i_data_sem); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; } /* From 819c4920b7e60ecfd6f0f61d890af4cdf3873d18 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 3 Apr 2013 12:47:17 -0400 Subject: [PATCH 05/50] ext4: refactor truncate code Move common code in ext4_ind_truncate() and ext4_ext_truncate() into ext4_truncate(). This saves over 60 lines of code. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 +-- fs/ext4/extents.c | 60 +------------------------------ fs/ext4/indirect.c | 88 +++------------------------------------------- fs/ext4/inode.c | 74 ++++++++++++++++++++++++++++++++++++-- 4 files changed, 78 insertions(+), 148 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0649253804c4..d05ba3886f33 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2109,7 +2109,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, unsigned long nr_segs); extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); -extern void ext4_ind_truncate(struct inode *inode); +extern void ext4_ind_truncate(handle_t *, struct inode *inode); extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t first, ext4_lblk_t stop); @@ -2575,7 +2575,7 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern void ext4_ext_truncate(struct inode *); +extern void ext4_ext_truncate(handle_t *, struct inode *); extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); extern void ext4_ext_init(struct super_block *); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d58365e40df7..cbbe8a4deac6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4257,47 +4257,12 @@ out3: return err ? err : allocated; } -void ext4_ext_truncate(struct inode *inode) +void ext4_ext_truncate(handle_t *handle, struct inode *inode) { - struct address_space *mapping = inode->i_mapping; struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; - handle_t *handle; - loff_t page_len; int err = 0; - /* - * finish any pending end_io work so we won't run the risk of - * converting any truncated blocks to initialized later - */ - ext4_flush_unwritten_io(inode); - - /* - * probably first extent we're gonna free will be last in block - */ - err = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err); - if (IS_ERR(handle)) - return; - - if (inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out_stop; - } - - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - down_write(&EXT4_I(inode)->i_data_sem); - - ext4_discard_preallocations(inode); - /* * TODO: optimization is possible here. * Probably we need not scan at all, @@ -4313,29 +4278,6 @@ void ext4_ext_truncate(struct inode *inode) err = ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); - - /* In a multi-transaction truncate, we only make the final - * transaction synchronous. - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - up_write(&EXT4_I(inode)->i_data_sem); - -out_stop: - /* - * If this was a simple ftruncate() and the file will remain alive, - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. - */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); } static void ext4_falloc_update_inode(struct inode *inode, diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index d8846779f4ea..98be6f697463 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -806,26 +806,9 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) * be able to restart the transaction at a conventient checkpoint to make * sure we don't overflow the journal. * - * start_transaction gets us a new handle for a truncate transaction, - * and extend_transaction tries to extend the existing one a bit. If + * Try to extend this transaction for the purposes of truncation. If * extend fails, we need to propagate the failure up and restart the * transaction in the top-level truncate loop. --sct - */ -static handle_t *start_transaction(struct inode *inode) -{ - handle_t *result; - - result = ext4_journal_start(inode, EXT4_HT_TRUNCATE, - ext4_blocks_for_truncate(inode)); - if (!IS_ERR(result)) - return result; - - ext4_std_error(inode->i_sb, PTR_ERR(result)); - return result; -} - -/* - * Try to extend this transaction for the purposes of truncation. * * Returns 0 if we managed to create more room. If we can't create more * room, and the transaction must be restarted we return 1. @@ -1218,68 +1201,30 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, } } -void ext4_ind_truncate(struct inode *inode) +void ext4_ind_truncate(handle_t *handle, struct inode *inode) { - handle_t *handle; struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; __le32 nr = 0; int n = 0; ext4_lblk_t last_block, max_block; - loff_t page_len; unsigned blocksize = inode->i_sb->s_blocksize; - int err; - - handle = start_transaction(inode); - if (IS_ERR(handle)) - return; /* AKPM: return what? */ last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - if (inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out_stop; - } - if (last_block != max_block) { n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) - goto out_stop; /* error */ + return; } - /* - * OK. This truncate is going to happen. We add the inode to the - * orphan list, so that if this truncate spans multiple transactions, - * and we crash, we will resume the truncate when the filesystem - * recovers. It also marks the inode dirty, to catch the new size. - * - * Implication: the file must always be in a sane, consistent - * truncatable state while each transaction commits. - */ - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. - */ - down_write(&ei->i_data_sem); - - ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); /* @@ -1296,7 +1241,7 @@ void ext4_ind_truncate(struct inode *inode) * It is unnecessary to free any data blocks if last_block is * equal to the indirect block limit. */ - goto out_unlock; + return; } else if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); @@ -1356,31 +1301,6 @@ do_indirects: case EXT4_TIND_BLOCK: ; } - -out_unlock: - up_write(&ei->i_data_sem); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - - /* - * In a multi-transaction truncate, we only make the final transaction - * synchronous - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); -out_stop: - /* - * If this was a simple ftruncate(), and the file will remain alive - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. - */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - - ext4_journal_stop(handle); - trace_ext4_truncate_exit(inode); } static int free_hole_blocks(handle_t *handle, struct inode *inode, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9bda50aa34e2..49c80e4ac5ac 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3738,9 +3738,9 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) stop_block); ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); - up_write(&EXT4_I(inode)->i_data_sem); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: @@ -3782,6 +3782,12 @@ out_mutex: */ void ext4_truncate(struct inode *inode) { + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int credits; + handle_t *handle; + struct address_space *mapping = inode->i_mapping; + loff_t page_len; + trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) @@ -3800,10 +3806,72 @@ void ext4_truncate(struct inode *inode) return; } + /* + * finish any pending end_io work so we won't run the risk of + * converting any truncated blocks to initialized later + */ + ext4_flush_unwritten_io(inode); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ext4_ext_truncate(inode); + credits = ext4_writepage_trans_blocks(inode); else - ext4_ind_truncate(inode); + credits = ext4_blocks_for_truncate(inode); + + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); + return; + } + + if (inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + if (ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0)) + goto out_stop; + } + + /* + * We add the inode to the orphan list, so that if this + * truncate spans multiple transactions, and we crash, we will + * resume the truncate when the filesystem recovers. It also + * marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + + ext4_discard_preallocations(inode); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_ext_truncate(handle, inode); + else + ext4_ind_truncate(handle, inode); + + up_write(&ei->i_data_sem); + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + +out_stop: + /* + * If this was a simple ftruncate() and the file will remain alive, + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); trace_ext4_truncate_exit(inode); } From 19b5ef615787062a87c4ea15fcdb0e256b62ed19 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 3 Apr 2013 21:58:52 -0400 Subject: [PATCH 06/50] ext4: add mutex_is_locked() assertion to ext4_truncate() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Added fixup from Lukáš Czerner which only checks the assertion when the inode is not new and is not being freed. ] Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 49c80e4ac5ac..56ebd662033b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3788,6 +3788,13 @@ void ext4_truncate(struct inode *inode) struct address_space *mapping = inode->i_mapping; loff_t page_len; + /* + * There is a possibility that we're either freeing the inode + * or it completely new indode. In those cases we might not + * have i_mutex locked because it's not necessary. + */ + if (!(inode->i_state & (I_NEW|I_FREEING))) + WARN_ON(!mutex_is_locked(&inode->i_mutex)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) From b10a44c369d7b8a28825f1fd24f13dc31c2e3a25 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 3 Apr 2013 22:00:52 -0400 Subject: [PATCH 07/50] ext4: add might_sleep() annotations Signed-off-by: "Theodore Ts'o" Reviewed-by: Lukas Czerner --- fs/ext4/ext4_jbd2.c | 6 ++++++ fs/ext4/mballoc.c | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 7058975e3a55..0e1dc9e70ce5 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -43,6 +43,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, { journal_t *journal; + might_sleep(); + trace_ext4_journal_start(sb, nblocks, _RET_IP_); if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); @@ -113,6 +115,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, { int err = 0; + might_sleep(); + if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) @@ -209,6 +213,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, { int err = 0; + might_sleep(); + if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); if (err) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ee6614bdb639..36c82a39d03f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1011,6 +1011,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) struct page *page; int ret = 0; + might_sleep(); mb_debug(1, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); /* @@ -1082,6 +1083,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; + might_sleep(); mb_debug(1, "load group %u\n", group); blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; @@ -4217,6 +4219,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, unsigned int inquota = 0; unsigned int reserv_clstrs = 0; + might_sleep(); sb = ar->inode->i_sb; sbi = EXT4_SB(sb); @@ -4470,6 +4473,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, int err = 0; int ret; + might_sleep(); if (bh) { if (block) BUG_ON(block != bh->b_blocknr); From d76a3a77113db020d9bb1e894822869410450bd9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 3 Apr 2013 22:02:52 -0400 Subject: [PATCH 08/50] ext4/jbd2: don't wait (forever) for stale tid caused by wraparound In the case where an inode has a very stale transaction id (tid) in i_datasync_tid or i_sync_tid, it's possible that after a very large (2**31) number of transactions, that the tid number space might wrap, causing tid_geq()'s calculations to fail. Commit deeeaf13 "jbd2: fix fsync() tid wraparound bug", later modified by commit e7b04ac0 "jbd2: don't wake kjournald unnecessarily", attempted to fix this problem, but it only avoided kjournald spinning forever by fixing the logic in jbd2_log_start_commit(). Unfortunately, in the codepaths in fs/ext4/fsync.c and fs/ext4/inode.c that might call jbd2_log_start_commit() with a stale tid, those functions will subsequently call jbd2_log_wait_commit() with the same stale tid, and then wait for a very long time. To fix this, we replace the calls to jbd2_log_start_commit() and jbd2_log_wait_commit() with a call to a new function, jbd2_complete_transaction(), which will correctly handle stale tid's. As a bonus, jbd2_complete_transaction() will avoid locking j_state_lock for writing unless a commit needs to be started. This should have a small (but probably not measurable) improvement for ext4's scalability. Signed-off-by: "Theodore Ts'o" Reported-by: Ben Hutchings Reported-by: George Barnett Cc: stable@vger.kernel.org --- fs/ext4/fsync.c | 3 +-- fs/ext4/inode.c | 3 +-- fs/jbd2/journal.c | 31 +++++++++++++++++++++++++++++++ include/linux/jbd2.h | 1 + 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 3278e64e57b6..e0ba8a408def 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -166,8 +166,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (journal->j_flags & JBD2_BARRIER && !jbd2_trans_will_send_data_barrier(journal, commit_tid)) needs_barrier = true; - jbd2_log_start_commit(journal, commit_tid); - ret = jbd2_log_wait_commit(journal, commit_tid); + ret = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); if (!ret) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 56ebd662033b..addba9e0a1a4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -210,8 +210,7 @@ void ext4_evict_inode(struct inode *inode) journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; - jbd2_log_start_commit(journal, commit_tid); - jbd2_log_wait_commit(journal, commit_tid); + jbd2_complete_transaction(journal, commit_tid); filemap_write_and_wait(&inode->i_data); } truncate_inode_pages(&inode->i_data, 0); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ed10991ab006..886ec2faa9b4 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -709,6 +709,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) return err; } +/* + * When this function returns the transaction corresponding to tid + * will be completed. If the transaction has currently running, start + * committing that transaction before waiting for it to complete. If + * the transaction id is stale, it is by definition already completed, + * so just return SUCCESS. + */ +int jbd2_complete_transaction(journal_t *journal, tid_t tid) +{ + int need_to_wait = 1; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) { + if (journal->j_commit_request != tid) { + /* transaction not yet started, so request it */ + read_unlock(&journal->j_state_lock); + jbd2_log_start_commit(journal, tid); + goto wait_commit; + } + } else if (!(journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid)) + need_to_wait = 0; + read_unlock(&journal->j_state_lock); + if (!need_to_wait) + return 0; +wait_commit: + return jbd2_log_wait_commit(journal, tid); +} +EXPORT_SYMBOL(jbd2_complete_transaction); + /* * Log buffer allocation routines: */ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 50e5a5e6a712..f0289754b464 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1200,6 +1200,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_journal_force_commit_nested(journal_t *journal); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); +int jbd2_complete_transaction(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); From 996bb9fddd5b68d1dfb5e27d30ca2c7a72448596 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 3 Apr 2013 22:04:52 -0400 Subject: [PATCH 09/50] ext4: support simple conversion of extent-mapped inodes to use i_blocks In order to make it simpler to test the code which support i_blocks/indirect-mapped inodes, support the conversion of inodes which are less than 12 blocks and which are contained in no more than a single extent. The primary intended use of this code is to converting freshly created zero-length files and empty directories. Note that the version of chattr in e2fsprogs 1.42.7 and earlier has a check that prevents the clearing of the extent flag. A simple patch which allows "chattr -e " to work will be checked into the e2fsprogs git repository. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 ++- fs/ext4/extents.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/ioctl.c | 20 +++++++--------- 3 files changed, 69 insertions(+), 13 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d05ba3886f33..73f3e60f7078 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -403,7 +403,7 @@ struct flex_groups { #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ @@ -2608,6 +2608,7 @@ extern int ext4_find_delalloc_range(struct inode *inode, extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); +extern int ext4_ind_migrate(struct inode *inode); /* move_extent.c */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index cbbe8a4deac6..235246719074 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4610,3 +4610,62 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return error; } + +/* + * Migrate a simple extent-based inode to use the i_blocks[] array + */ +int ext4_ind_migrate(struct inode *inode) +{ + struct ext4_extent_header *eh; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_extent *ex; + unsigned int i, len; + ext4_fsblk_t blk; + handle_t *handle; + int ret; + + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || + (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EINVAL; + + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_ext_check_inode(inode); + if (ret) + goto errout; + + eh = ext_inode_hdr(inode); + ex = EXT_FIRST_EXTENT(eh); + if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || + eh->eh_depth != 0 || eh->eh_entries > 1) { + ret = -EOPNOTSUPP; + goto errout; + } + if (eh->eh_entries == 0) + blk = len = 0; + else { + len = le16_to_cpu(ex->ee_len); + blk = ext4_ext_pblock(ex); + if (len > EXT4_NDIR_BLOCKS) { + ret = -EOPNOTSUPP; + goto errout; + } + } + + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto errout; + } + + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + memset(ei->i_data, 0, sizeof(ei->i_data)); + for (i=0; i < len; i++) + ei->i_data[i] = cpu_to_le32(blk++); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); +errout: + up_write(&EXT4_I(inode)->i_data_sem); + return ret; +} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 721f4d33e148..a07b7bc0856a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -83,17 +83,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } - if (oldflags & EXT4_EXTENTS_FL) { - /* We don't support clearning extent flags */ - if (!(flags & EXT4_EXTENTS_FL)) { - err = -EOPNOTSUPP; - goto flags_out; - } - } else if (flags & EXT4_EXTENTS_FL) { - /* migrate the file */ + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) migrate = 1; - flags &= ~EXT4_EXTENTS_FL; - } if (flags & EXT4_EOFBLOCKS_FL) { /* we don't support adding EOFBLOCKS flag */ @@ -137,8 +128,13 @@ flags_err: err = ext4_change_inode_journal_flag(inode, jflag); if (err) goto flags_out; - if (migrate) - err = ext4_ext_migrate(inode); + if (migrate) { + if (flags & EXT4_EXTENTS_FL) + err = ext4_ext_migrate(inode); + else + err = ext4_ind_migrate(inode); + } + flags_out: mutex_unlock(&inode->i_mutex); mnt_drop_write_file(filp); From 794446c6946513c684d448205fbd76fa35f38b72 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 3 Apr 2013 22:06:52 -0400 Subject: [PATCH 10/50] jbd2: fix race between jbd2_journal_remove_checkpoint and ->j_commit_callback The following race is possible: [kjournald2] other_task jbd2_journal_commit_transaction() j_state = T_FINISHED; spin_unlock(&journal->j_list_lock); ->jbd2_journal_remove_checkpoint() ->jbd2_journal_free_transaction(); ->kmem_cache_free(transaction) ->j_commit_callback(journal, transaction); -> USE_AFTER_FREE WARNING: at lib/list_debug.c:62 __list_del_entry+0x1c0/0x250() Hardware name: list_del corruption. prev->next should be ffff88019a4ec198, but was 6b6b6b6b6b6b6b6b Modules linked in: cpufreq_ondemand acpi_cpufreq freq_table mperf coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode sg xhci_hcd button sd_mod crc_t10dif aesni_intel ablk_helper cryptd lrw aes_x86_64 xts gf128mul ahci libahci pata_acpi ata_generic dm_mirror dm_region_hash dm_log dm_mod Pid: 16400, comm: jbd2/dm-1-8 Tainted: G W 3.8.0-rc3+ #107 Call Trace: [] warn_slowpath_common+0xad/0xf0 [] warn_slowpath_fmt+0x46/0x50 [] ? ext4_journal_commit_callback+0x99/0xc0 [] __list_del_entry+0x1c0/0x250 [] ext4_journal_commit_callback+0x6f/0xc0 [] jbd2_journal_commit_transaction+0x23a6/0x2570 [] ? try_to_del_timer_sync+0x82/0xa0 [] ? del_timer_sync+0x91/0x1e0 [] kjournald2+0x19f/0x6a0 [] ? wake_up_bit+0x40/0x40 [] ? bit_spin_lock+0x80/0x80 [] kthread+0x10e/0x120 [] ? __init_kthread_worker+0x70/0x70 [] ret_from_fork+0x7c/0xb0 [] ? __init_kthread_worker+0x70/0x70 In order to demonstrace this issue one should mount ext4 with mount -o discard option on SSD disk. This makes callback longer and race window becomes wider. In order to fix this we should mark transaction as finished only after callbacks have completed Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/jbd2/commit.c | 50 +++++++++++++++++++++++++------------------- include/linux/jbd2.h | 1 + 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 750c70148eff..0f53946f13c1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -382,7 +382,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) int space_left = 0; int first_tag = 0; int tag_flag; - int i, to_free = 0; + int i; int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; @@ -1134,7 +1134,7 @@ restart_loop: journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; spin_unlock(&journal->j_history_lock); - commit_transaction->t_state = T_FINISHED; + commit_transaction->t_state = T_COMMIT_CALLBACK; J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; @@ -1149,38 +1149,44 @@ restart_loop: journal->j_average_commit_time*3) / 4; else journal->j_average_commit_time = commit_time; + write_unlock(&journal->j_state_lock); - if (commit_transaction->t_checkpoint_list == NULL && - commit_transaction->t_checkpoint_io_list == NULL) { - __jbd2_journal_drop_transaction(journal, commit_transaction); - to_free = 1; + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; } else { - if (journal->j_checkpoint_transactions == NULL) { - journal->j_checkpoint_transactions = commit_transaction; - commit_transaction->t_cpnext = commit_transaction; - commit_transaction->t_cpprev = commit_transaction; - } else { - commit_transaction->t_cpnext = - journal->j_checkpoint_transactions; - commit_transaction->t_cpprev = - commit_transaction->t_cpnext->t_cpprev; - commit_transaction->t_cpnext->t_cpprev = + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = commit_transaction; - commit_transaction->t_cpprev->t_cpnext = - commit_transaction; - } } spin_unlock(&journal->j_list_lock); - + /* Drop all spin_locks because commit_callback may be block. + * __journal_remove_checkpoint() can not destroy transaction + * under us because it is not marked as T_FINISHED yet */ if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); trace_jbd2_end_commit(journal, commit_transaction); jbd_debug(1, "JBD2: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); - if (to_free) - jbd2_journal_free_transaction(commit_transaction); + write_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + commit_transaction->t_state = T_FINISHED; + /* Recheck checkpoint lists after j_list_lock was dropped */ + if (commit_transaction->t_checkpoint_list == NULL && + commit_transaction->t_checkpoint_io_list == NULL) { + __jbd2_journal_drop_transaction(journal, commit_transaction); + jbd2_journal_free_transaction(commit_transaction); + } + spin_unlock(&journal->j_list_lock); + write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_done_commit); } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f0289754b464..f9fe88957b7a 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -480,6 +480,7 @@ struct transaction_s T_COMMIT, T_COMMIT_DFLUSH, T_COMMIT_JFLUSH, + T_COMMIT_CALLBACK, T_FINISHED } t_state; From 5d3ee20855e28169d711b394857ee608a5023094 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 3 Apr 2013 22:08:52 -0400 Subject: [PATCH 11/50] ext4: fix journal callback list traversal It is incorrect to use list_for_each_entry_safe() for journal callback traversial because ->next may be removed by other task: ->ext4_mb_free_metadata() ->ext4_mb_free_metadata() ->ext4_journal_callback_del() This results in the following issue: WARNING: at lib/list_debug.c:62 __list_del_entry+0x1c0/0x250() Hardware name: list_del corruption. prev->next should be ffff88019a4ec198, but was 6b6b6b6b6b6b6b6b Modules linked in: cpufreq_ondemand acpi_cpufreq freq_table mperf coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode sg xhci_hcd button sd_mod crc_t10dif aesni_intel ablk_helper cryptd lrw aes_x86_64 xts gf128mul ahci libahci pata_acpi ata_generic dm_mirror dm_region_hash dm_log dm_mod Pid: 16400, comm: jbd2/dm-1-8 Tainted: G W 3.8.0-rc3+ #107 Call Trace: [] warn_slowpath_common+0xad/0xf0 [] warn_slowpath_fmt+0x46/0x50 [] ? ext4_journal_commit_callback+0x99/0xc0 [] __list_del_entry+0x1c0/0x250 [] ext4_journal_commit_callback+0x6f/0xc0 [] jbd2_journal_commit_transaction+0x23a6/0x2570 [] ? try_to_del_timer_sync+0x82/0xa0 [] ? del_timer_sync+0x91/0x1e0 [] kjournald2+0x19f/0x6a0 [] ? wake_up_bit+0x40/0x40 [] ? bit_spin_lock+0x80/0x80 [] kthread+0x10e/0x120 [] ? __init_kthread_worker+0x70/0x70 [] ret_from_fork+0x7c/0xb0 [] ? __init_kthread_worker+0x70/0x70 This patch fix the issue as follows: - ext4_journal_commit_callback() make list truly traversial safe simply by always starting from list_head - fix race between two ext4_journal_callback_del() and ext4_journal_callback_try_del() Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara Cc: stable@vger.kernel.com --- fs/ext4/ext4_jbd2.h | 6 +++++- fs/ext4/mballoc.c | 8 ++++---- fs/ext4/super.c | 7 +++++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 4c216b1bf20c..aeed0bac693b 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -194,16 +194,20 @@ static inline void ext4_journal_callback_add(handle_t *handle, * ext4_journal_callback_del: delete a registered callback * @handle: active journal transaction handle on which callback was registered * @jce: registered journal callback entry to unregister + * Return true if object was sucessfully removed */ -static inline void ext4_journal_callback_del(handle_t *handle, +static inline bool ext4_journal_callback_try_del(handle_t *handle, struct ext4_journal_cb_entry *jce) { + bool deleted; struct ext4_sb_info *sbi = EXT4_SB(handle->h_transaction->t_journal->j_private); spin_lock(&sbi->s_md_lock); + deleted = !list_empty(&jce->jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); + return deleted; } int diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 36c82a39d03f..580aada3d1bb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4423,11 +4423,11 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, node = rb_prev(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(entry, new_entry)) { + if (can_merge(entry, new_entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { new_entry->efd_start_cluster = entry->efd_start_cluster; new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - ext4_journal_callback_del(handle, &entry->efd_jce); kmem_cache_free(ext4_free_data_cachep, entry); } } @@ -4435,10 +4435,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, node = rb_next(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(new_entry, entry)) { + if (can_merge(new_entry, entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - ext4_journal_callback_del(handle, &entry->efd_jce); kmem_cache_free(ext4_free_data_cachep, entry); } } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5d6d53578124..67ae9e332d42 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -353,10 +353,13 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int error = is_journal_aborted(journal); - struct ext4_journal_cb_entry *jce, *tmp; + struct ext4_journal_cb_entry *jce; + BUG_ON(txn->t_state == T_FINISHED); spin_lock(&sbi->s_md_lock); - list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { + while (!list_empty(&txn->t_private_list)) { + jce = list_entry(txn->t_private_list.next, + struct ext4_journal_cb_entry, jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); jce->jce_func(sb, jce, error); From a75ae78f087f933ab3432e98bb4dbbf2196cf6d5 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 3 Apr 2013 22:10:52 -0400 Subject: [PATCH 12/50] ext4: unregister es_shrinker if mount failed Otherwise destroyed ext_sb_info will be part of global shinker list and result in the following OOPS: JBD2: corrupted journal superblock JBD2: recovery failed EXT4-fs (dm-2): error loading journal general protection fault: 0000 [#1] SMP Modules linked in: fuse acpi_cpufreq freq_table mperf coretemp kvm_intel kvm crc32c_intel microcode sg button sd_mod crc_t10dif ahci libahci pata_acpi ata_generic dm_mirror dm_region_hash dm_log dm_\ mod CPU 1 Pid: 2758, comm: mount Not tainted 3.8.0-rc3+ #136 /DH55TC RIP: 0010:[] [] unregister_shrinker+0xad/0xe0 RSP: 0000:ffff88011d5cbcd8 EFLAGS: 00010207 RAX: 6b6b6b6b6b6b6b6b RBX: 6b6b6b6b6b6b6b53 RCX: 0000000000000006 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000246 RBP: ffff88011d5cbce8 R08: 0000000000000002 R09: 0000000000000001 R10: 0000000000000001 R11: 0000000000000000 R12: ffff88011cd3f848 R13: ffff88011cd3f830 R14: ffff88011cd3f000 R15: 0000000000000000 FS: 00007f7b721dd7e0(0000) GS:ffff880121a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007fffa6f75038 CR3: 000000011bc1c000 CR4: 00000000000007e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process mount (pid: 2758, threadinfo ffff88011d5ca000, task ffff880116aacb80) Stack: ffff88011cd3f000 ffffffff8209b6c0 ffff88011d5cbd18 ffffffff812482f1 00000000000003f3 00000000ffffffea ffff880115f4c200 0000000000000000 ffff88011d5cbda8 ffffffff81249381 ffff8801219d8bf8 ffffffff00000000 Call Trace: [] deactivate_locked_super+0x91/0xb0 [] mount_bdev+0x331/0x340 [] ? ext4_alloc_flex_bg_array+0x180/0x180 [] ext4_mount+0x15/0x20 [] mount_fs+0x9a/0x2e0 [] vfs_kern_mount+0xc5/0x170 [] do_new_mount+0x172/0x2e0 [] do_mount+0x376/0x380 [] sys_mount+0x138/0x150 [] system_call_fastpath+0x16/0x1b Code: 8b 05 88 04 eb 00 48 3d 90 ff 06 82 48 8d 58 e8 75 19 4c 89 e7 e8 e4 d7 2c 00 48 c7 c7 00 ff 06 82 e8 58 5f ef ff 5b 41 5c c9 c3 <48> 8b 4b 18 48 8b 73 20 48 89 da 31 c0 48 c7 c7 c5 a0 e4 81 e\ 8 RIP [] unregister_shrinker+0xad/0xe0 RSP Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/super.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 67ae9e332d42..febbe0e18024 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3701,6 +3701,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.function = print_daily_error_info; sbi->s_err_report.data = (unsigned long) sb; + /* Register extent status tree shrinker */ + ext4_es_register_shrinker(sb); + err = percpu_counter_init(&sbi->s_freeclusters_counter, ext4_count_free_clusters(sb)); if (!err) { @@ -3726,9 +3729,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_max_writeback_mb_bump = 128; sbi->s_extent_max_zeroout_kb = 32; - /* Register extent status tree shrinker */ - ext4_es_register_shrinker(sb); - /* * set up enough so that it can read an inode */ @@ -4013,6 +4013,7 @@ failed_mount_wq: sbi->s_journal = NULL; } failed_mount3: + ext4_es_unregister_shrinker(sb); del_timer(&sbi->s_err_report); if (sbi->s_flex_groups) ext4_kvfree(sbi->s_flex_groups); From 689110098c7df10fab8800d3bf8e727c21f426fb Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 3 Apr 2013 22:12:52 -0400 Subject: [PATCH 13/50] ext4: make ext4_block_in_group() much more efficient Currently in when getting the block group number for a particular block in ext4_block_in_group() we're using ext4_get_group_no_and_offset() which uses do_div() to get the block group and the remainer which is offset within the group. We don't need all of that in ext4_block_in_group() as we only need to figure out the group number. This commit changes ext4_block_in_group() to calculate group number directly. This shows as a big improvement with regards to cpu utilization. Measuring fallocate -l 15T on fresh file system with perf showed that 23% of cpu time was spend in the ext4_get_group_no_and_offset(). With this change it completely disappears from the list only bumping the occurrence of ext4_init_block_bitmap() which is the biggest user of ext4_block_in_group() by 4%. As the result of this change on my system the fallocate call was approx. 10% faster. However since there is '-g' option in mkfs which allow us setting different groups size (mostly for developers) I've introduced new per file system flag whether we have a standard block group size or not. The flag is used to determine whether we can use the bit shift optimization or not. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 22 ++++++++++++++++------ fs/ext4/ext4.h | 10 +++++++++- fs/ext4/super.c | 4 ++++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 92e68b33fffd..d6babf94907e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -49,14 +49,24 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, } -static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, - ext4_group_t block_group) +/* + * Check whether the 'block' lives within the 'block_group'. Returns 1 if so + * and 0 otherwise. + */ +static inline int ext4_block_in_group(struct super_block *sb, + ext4_fsblk_t block, + ext4_group_t block_group) { ext4_group_t actual_group; - ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); - if (actual_group == block_group) - return 1; - return 0; + + if (test_opt2(sb, STD_GROUP_SIZE)) + actual_group = + (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + + block) >> + (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); + else + ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); + return (actual_group == block_group) ? 1 : 0; } /* Return the number of clusters used for file system metadata; this diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 73f3e60f7078..cc58aa8e9869 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -949,7 +949,7 @@ struct ext4_inode_info { #define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ /* - * Mount flags + * Mount flags set via mount options or defaults */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ @@ -981,8 +981,16 @@ struct ext4_inode_info { #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ +/* + * Mount flags set either automatically (could not be set by mount option) + * based on per file system feature or property or in special cases such as + * distinguishing between explicit mount option definition and default. + */ #define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly specified delalloc */ +#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group + size of blocksize * 8 + blocks */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt diff --git a/fs/ext4/super.c b/fs/ext4/super.c index febbe0e18024..525beb6e3e1e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3529,6 +3529,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); + /* Do we have standard group size of blocksize * 8 blocks ? */ + if (sbi->s_blocks_per_group == blocksize << 3) + set_opt2(sb, STD_GROUP_SIZE); + for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; From bd86298e60b84b5e6d2da3e75c4ce2f6b70bdeed Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 3 Apr 2013 23:32:34 -0400 Subject: [PATCH 14/50] ext4: introduce ext4_get_group_number() Currently on many places in ext4 we're using ext4_get_group_no_and_offset() even though we're only interested in knowing the block group of the particular block, not the offset within the block group so we can use more efficient way to compute block group. This patch introduces ext4_get_group_number() which computes block group for a given block much more efficiently. Use this function instead of ext4_get_group_no_and_offset() everywhere where we're only interested in knowing the block group. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 25 ++++++++++++++++++------- fs/ext4/ext4.h | 10 +++++++--- fs/ext4/mballoc.c | 6 +++--- fs/ext4/resize.c | 10 +++++----- 4 files changed, 33 insertions(+), 18 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index d6babf94907e..9e8d8ffb063f 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -29,6 +29,23 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, * balloc.c contains the blocks allocation and deallocation routines */ +/* + * Calculate block group number for a given block number + */ +ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block) +{ + ext4_group_t group; + + if (test_opt2(sb, STD_GROUP_SIZE)) + group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + + block) >> + (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); + else + ext4_get_group_no_and_offset(sb, block, &group, NULL); + return group; +} + /* * Calculate the block group number and offset into the block/cluster * allocation bitmap, given a block number @@ -59,13 +76,7 @@ static inline int ext4_block_in_group(struct super_block *sb, { ext4_group_t actual_group; - if (test_opt2(sb, STD_GROUP_SIZE)) - actual_group = - (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + - block) >> - (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); - else - ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); + actual_group = ext4_get_group_number(sb, block); return (actual_group == block_group) ? 1 : 0; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cc58aa8e9869..a0637e5057ae 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1793,9 +1793,6 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) */ #define ERR_BAD_DX_DIR -75000 -void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, - ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); - /* * Timeout and state flag for lazy initialization inode thread. */ @@ -1917,6 +1914,13 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, struct buffer_head *bh); /* balloc.c */ +extern void ext4_get_group_no_and_offset(struct super_block *sb, + ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, + ext4_grpblk_t *offsetp); +extern ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block); + extern void ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, unsigned int block_group, diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 580aada3d1bb..8c8d05218021 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3344,7 +3344,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, if (pa->pa_type == MB_GROUP_PA) grp_blk--; - ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); + grp = ext4_get_group_number(sb, grp_blk); /* * possible race: @@ -3809,7 +3809,7 @@ repeat: list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { BUG_ON(pa->pa_type != MB_INODE_PA); - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { @@ -4071,7 +4071,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + group = ext4_get_group_number(sb, pa->pa_pstart); if (ext4_mb_load_buddy(sb, group, &e4b)) { ext4_error(sb, "Error loading buddy information for %u", group); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c169477a62c9..e3498534a2c1 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -272,7 +272,7 @@ next_group: if (start_blk >= last_blk) goto next_group; group_data[bb_index].block_bitmap = start_blk++; - ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count--; if (flexbg_size > 1) @@ -284,7 +284,7 @@ next_group: if (start_blk >= last_blk) goto next_group; group_data[ib_index].inode_bitmap = start_blk++; - ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count--; if (flexbg_size > 1) @@ -296,7 +296,7 @@ next_group: if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) goto next_group; group_data[it_index].inode_table = start_blk; - ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); + group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count -= EXT4_SB(sb)->s_itb_per_group; @@ -392,7 +392,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, ext4_group_t group; int err; - ext4_get_group_no_and_offset(sb, block, &group, NULL); + group = ext4_get_group_number(sb, block); start = ext4_group_first_block_no(sb, group); group -= flex_gd->groups[0].group; @@ -1879,7 +1879,7 @@ retry: /* Nothing need to do */ return 0; - ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); + n_group = ext4_get_group_number(sb, n_blocks_count - 1); ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); n_desc_blocks = num_desc_blocks(sb, n_group + 1); From bc2d9db48c95ec6c9c5ecc97ddc61343d751f219 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 3 Apr 2013 23:33:27 -0400 Subject: [PATCH 15/50] ext4: Transfer initialized block to right neighbor if possible Currently when converting extent to initialized we attempt to transfer initialized block to the left neighbour if possible when certain criteria are met. However we do not attempt to do the same for the right neighbor. This commit adds the possibility to transfer initialized block to the right neighbour if: 1. We're not converting the whole extent 2. Both extents are stored in the same extent tree node 3. Right neighbor is initialized 4. Right neighbor is logically abutting the current one 5. Right neighbor is physically abutting the current one 6. Right neighbor would not overflow the length limit This is basically the same logic as with transferring to the left. This will gain us some performance benefits since it is faster than inserting extent and then merging it. It would also prevent some situation in delalloc patch when we might run out of metadata reservation. This is due to the fact that we would attempt to split the extent first (possibly allocating new metadata block) even though we did not counted for that because it can (and will) be merged again. This commit fix that scenario, because we no longer need to split the extent in such case. Signed-off-by: Lukas Czerner --- fs/ext4/extents.c | 139 ++++++++++++++++++++++++++++++---------------- 1 file changed, 91 insertions(+), 48 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 235246719074..892662334be9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3153,29 +3153,28 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex; - struct ext4_extent *ex; + struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; - unsigned int ee_len, depth; - int allocated, max_zeroout = 0; + unsigned int ee_len, depth, map_len = map->m_len; + int allocated = 0, max_zeroout = 0; int err = 0; int split_flag = 0; ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map->m_len); + (unsigned long long)map->m_lblk, map_len); sbi = EXT4_SB(inode->i_sb); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; - if (eof_block < map->m_lblk + map->m_len) - eof_block = map->m_lblk + map->m_len; + if (eof_block < map->m_lblk + map_len) + eof_block = map->m_lblk + map_len; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - allocated = ee_len - (map->m_lblk - ee_block); zero_ex.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3186,77 +3185,121 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, /* * Attempt to transfer newly initialized blocks from the currently - * uninitialized extent to its left neighbor. This is much cheaper + * uninitialized extent to its neighbor. This is much cheaper * than an insertion followed by a merge as those involve costly - * memmove() calls. This is the common case in steady state for - * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append - * writes. + * memmove() calls. Transferring to the left is the common case in + * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) + * followed by append writes. * * Limitations of the current logic: - * - L1: we only deal with writes at the start of the extent. - * The approach could be extended to writes at the end - * of the extent but this scenario was deemed less common. - * - L2: we do not deal with writes covering the whole extent. + * - L1: we do not deal with writes covering the whole extent. * This would require removing the extent if the transfer * is possible. - * - L3: we only attempt to merge with an extent stored in the + * - L2: we only attempt to merge with an extent stored in the * same extent tree node. */ - if ((map->m_lblk == ee_block) && /*L1*/ - (map->m_len < ee_len) && /*L2*/ - (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/ - struct ext4_extent *prev_ex; + if ((map->m_lblk == ee_block) && + /* See if we can merge left */ + (map_len < ee_len) && /*L1*/ + (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ ext4_lblk_t prev_lblk; ext4_fsblk_t prev_pblk, ee_pblk; - unsigned int prev_len, write_len; + unsigned int prev_len; - prev_ex = ex - 1; - prev_lblk = le32_to_cpu(prev_ex->ee_block); - prev_len = ext4_ext_get_actual_len(prev_ex); - prev_pblk = ext4_ext_pblock(prev_ex); + abut_ex = ex - 1; + prev_lblk = le32_to_cpu(abut_ex->ee_block); + prev_len = ext4_ext_get_actual_len(abut_ex); + prev_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); - write_len = map->m_len; /* - * A transfer of blocks from 'ex' to 'prev_ex' is allowed + * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: - * - C1: prev_ex is initialized, - * - C2: prev_ex is logically abutting ex, - * - C3: prev_ex is physically abutting ex, - * - C4: prev_ex can receive the additional blocks without + * - C1: abut_ex is initialized, + * - C2: abut_ex is logically abutting ex, + * - C3: abut_ex is physically abutting ex, + * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ - if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/ + if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ - (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/ + (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; trace_ext4_ext_convert_to_initialized_fastpath(inode, - map, ex, prev_ex); + map, ex, abut_ex); - /* Shift the start of ex by 'write_len' blocks */ - ex->ee_block = cpu_to_le32(ee_block + write_len); - ext4_ext_store_pblock(ex, ee_pblk + write_len); - ex->ee_len = cpu_to_le16(ee_len - write_len); + /* Shift the start of ex by 'map_len' blocks */ + ex->ee_block = cpu_to_le32(ee_block + map_len); + ext4_ext_store_pblock(ex, ee_pblk + map_len); + ex->ee_len = cpu_to_le16(ee_len - map_len); ext4_ext_mark_uninitialized(ex); /* Restore the flag */ - /* Extend prev_ex by 'write_len' blocks */ - prev_ex->ee_len = cpu_to_le16(prev_len + write_len); - - /* Mark the block containing both extents as dirty */ - ext4_ext_dirty(handle, inode, path + depth); - - /* Update path to point to the right extent */ - path[depth].p_ext = prev_ex; + /* Extend abut_ex by 'map_len' blocks */ + abut_ex->ee_len = cpu_to_le16(prev_len + map_len); /* Result: number of initialized blocks past m_lblk */ - allocated = write_len; - goto out; + allocated = map_len; + } + } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && + (map_len < ee_len) && /*L1*/ + ex < EXT_LAST_EXTENT(eh)) { /*L2*/ + /* See if we can merge right */ + ext4_lblk_t next_lblk; + ext4_fsblk_t next_pblk, ee_pblk; + unsigned int next_len; + + abut_ex = ex + 1; + next_lblk = le32_to_cpu(abut_ex->ee_block); + next_len = ext4_ext_get_actual_len(abut_ex); + next_pblk = ext4_ext_pblock(abut_ex); + ee_pblk = ext4_ext_pblock(ex); + + /* + * A transfer of blocks from 'ex' to 'abut_ex' is allowed + * upon those conditions: + * - C1: abut_ex is initialized, + * - C2: abut_ex is logically abutting ex, + * - C3: abut_ex is physically abutting ex, + * - C4: abut_ex can receive the additional blocks without + * overflowing the (initialized) length limit. + */ + if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ + ((map->m_lblk + map_len) == next_lblk) && /*C2*/ + ((ee_pblk + ee_len) == next_pblk) && /*C3*/ + (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + trace_ext4_ext_convert_to_initialized_fastpath(inode, + map, ex, abut_ex); + + /* Shift the start of abut_ex by 'map_len' blocks */ + abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); + ext4_ext_store_pblock(abut_ex, next_pblk - map_len); + ex->ee_len = cpu_to_le16(ee_len - map_len); + ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + + /* Extend abut_ex by 'map_len' blocks */ + abut_ex->ee_len = cpu_to_le16(next_len + map_len); + + /* Result: number of initialized blocks past m_lblk */ + allocated = map_len; } } + if (allocated) { + /* Mark the block containing both extents as dirty */ + ext4_ext_dirty(handle, inode, path + depth); + + /* Update path to point to the right extent */ + path[depth].p_ext = abut_ex; + goto out; + } else + allocated = ee_len - (map->m_lblk - ee_block); WARN_ON(map->m_lblk < ee_block); /* From be8981be6b820b637c88978b2241738487a98ebf Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 3 Apr 2013 23:33:28 -0400 Subject: [PATCH 16/50] ext4: try to prepend extent to the existing one Currently when inserting extent in ext4_ext_insert_extent() we would only try to to see if we can append new extent to the found extent. If we can not, then we proceed with adding new extent into the extent tree, but then possibly merging it back again. We can avoid this situation by trying to append and prepend new extent to the existing ones. However since the new extent can be on either sides of the existing extent, we have to pick the right extent to try to append/prepend to. This patch adds the conditions to pick the right extent to append/prepend to and adds the actual prepending condition as well. This will also eliminate the need to use "reserved" block for possibly growing extent tree. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 108 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 85 insertions(+), 23 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 892662334be9..6c5a70a7b573 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1813,39 +1813,101 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, } depth = ext_depth(inode); ex = path[depth].p_ext; + eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); return -EIO; } /* try to insert block into found extent and return */ - if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) - && ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n", - ext4_ext_is_uninitialized(newext), - ext4_ext_get_actual_len(newext), - le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), - ext4_ext_get_actual_len(ex), - ext4_ext_pblock(ex)); - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - return err; + if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) { /* - * ext4_can_extents_be_merged should have checked that either - * both extents are uninitialized, or both aren't. Thus we - * need to check only one of them here. + * Try to see whether we should rather test the extent on + * right from ex, or from the left of ex. This is because + * ext4_ext_find_extent() can return either extent on the + * left, or on the right from the searched position. This + * will make merging more effective. */ - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; - ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + if (ex < EXT_LAST_EXTENT(eh) && + (le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex) < + le32_to_cpu(newext->ee_block))) { + ex += 1; + goto prepend; + } else if ((ex > EXT_FIRST_EXTENT(eh)) && + (le32_to_cpu(newext->ee_block) + + ext4_ext_get_actual_len(newext) < + le32_to_cpu(ex->ee_block))) + ex -= 1; + + /* Try to append newex to the ex */ + if (ext4_can_extents_be_merged(inode, ex, newext)) { + ext_debug("append [%d]%d block to %u:[%d]%d" + "(from %llu)\n", + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + + /* + * ext4_can_extents_be_merged should have checked + * that either both extents are uninitialized, or + * both aren't. Thus we need to check only one of + * them here. + */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); - if (uninitialized) - ext4_ext_mark_uninitialized(ex); - eh = path[depth].p_hdr; - nearex = ex; - goto merge; + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } + +prepend: + /* Try to prepend newex to the ex */ + if (ext4_can_extents_be_merged(inode, newext, ex)) { + ext_debug("prepend %u[%d]%d block to %u:[%d]%d" + "(from %llu)\n", + le32_to_cpu(newext->ee_block), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + + /* + * ext4_can_extents_be_merged should have checked + * that either both extents are uninitialized, or + * both aren't. Thus we need to check only one of + * them here. + */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_block = newext->ee_block; + ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } } depth = ext_depth(inode); From f78ee70db40040e6f38a5134527d4760254d6683 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 3 Apr 2013 23:33:30 -0400 Subject: [PATCH 17/50] ext4: print more info in ext4_print_free_blocks() Additionally print i_allocated_meta_blocks information as well. Signed-off-by: Lukas Czerner --- fs/ext4/inode.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index addba9e0a1a4..769c656ea3b1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1615,22 +1615,25 @@ static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), - ext4_count_free_clusters(inode->i_sb))); + ext4_count_free_clusters(sb))); ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", - (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", - (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); ext4_msg(sb, KERN_CRIT, "Block reservation details"); ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", - EXT4_I(inode)->i_reserved_data_blocks); + ei->i_reserved_data_blocks); ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", - EXT4_I(inode)->i_reserved_meta_blocks); + ei->i_reserved_meta_blocks); + ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u", + ei->i_allocated_meta_blocks); return; } From 393d1d1d76933886d5e1ce603214c9987589c6d5 Mon Sep 17 00:00:00 2001 From: "Dr. Tilmann Bubeck" Date: Mon, 8 Apr 2013 12:54:05 -0400 Subject: [PATCH 18/50] ext4: implementation of a new ioctl called EXT4_IOC_SWAP_BOOT Add a new ioctl, EXT4_IOC_SWAP_BOOT which swaps i_blocks and associated attributes (like i_blocks, i_size, i_flags, ...) from the specified inode with inode EXT4_BOOT_LOADER_INO (#5). This is typically used to store a boot loader in a secure part of the filesystem, where it can't be changed by a normal user by accident. The data blocks of the previous boot loader will be associated with the given inode. This usercode program is a simple example of the usage: int main(int argc, char *argv[]) { int fd; int err; if ( argc != 2 ) { printf("usage: ext4-swap-boot-inode FILE-TO-SWAP\n"); exit(1); } fd = open(argv[1], O_WRONLY); if ( fd < 0 ) { perror("open"); exit(1); } err = ioctl(fd, EXT4_IOC_SWAP_BOOT); if ( err < 0 ) { perror("ioctl"); exit(1); } close(fd); exit(0); } [ Modified by Theodore Ts'o to fix a number of bugs in the original code.] Signed-off-by: Dr. Tilmann Bubeck Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 10 ++ fs/ext4/ext4.h | 8 ++ fs/ext4/inode.c | 11 +- fs/ext4/ioctl.c | 197 +++++++++++++++++++++++++++++ fs/ext4/move_extent.c | 48 +++---- 5 files changed, 248 insertions(+), 26 deletions(-) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 34ea4f1fa6ea..5dd957d8b25b 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -587,6 +587,16 @@ Table of Ext4 specific ioctls bitmaps and inode table, the userspace tool thus just passes the new number of blocks. +EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes + (like i_blocks, i_size, i_flags, ...) from + the specified inode with inode + EXT4_BOOT_LOADER_INO (#5). This is typically + used to store a boot loader in a secure part of + the filesystem, where it can't be changed by a + normal user by accident. + The data blocks of the previous boot loader + will be associated with the given inode. + .............................................................................. References diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a0637e5057ae..d91871570982 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -616,6 +616,7 @@ enum { #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) +#define EXT4_IOC_SWAP_BOOT _IO('f', 17) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -1341,6 +1342,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) return ino == EXT4_ROOT_INO || ino == EXT4_USR_QUOTA_INO || ino == EXT4_GRP_QUOTA_INO || + ino == EXT4_BOOT_LOADER_INO || ino == EXT4_JOURNAL_INO || ino == EXT4_RESIZE_INO || (ino >= EXT4_FIRST_INO(sb) && @@ -2624,6 +2626,12 @@ extern int ext4_ind_migrate(struct inode *inode); /* move_extent.c */ +extern void ext4_double_down_write_data_sem(struct inode *first, + struct inode *second); +extern void ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode); +void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2); +void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2); extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 769c656ea3b1..a29bfc2142ef 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4191,8 +4191,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { - if (inode->i_mode == 0 || - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { + if ((inode->i_mode == 0 || + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && + ino != EXT4_BOOT_LOADER_INO) { /* this inode is deleted */ ret = -ESTALE; goto bad_inode; @@ -4200,7 +4201,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) /* The only unlinked inodes we let through here have * valid i_mode and are being read by the orphan * recovery code: that's fine, we're about to complete - * the process of deleting those. */ + * the process of deleting those. + * OR it is the EXT4_BOOT_LOADER_INO which is + * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); @@ -4320,6 +4323,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } else if (ino == EXT4_BOOT_LOADER_INO) { + make_bad_inode(inode); } else { ret = -EIO; EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a07b7bc0856a..cbc3acea6bcf 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -17,9 +17,201 @@ #include #include "ext4_jbd2.h" #include "ext4.h" +#include "ext4_extents.h" #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) +/** + * Swap memory between @a and @b for @len bytes. + * + * @a: pointer to first memory area + * @b: pointer to second memory area + * @len: number of bytes to swap + * + */ +static void memswap(void *a, void *b, size_t len) +{ + unsigned char *ap, *bp; + unsigned char tmp; + + ap = (unsigned char *)a; + bp = (unsigned char *)b; + while (len-- > 0) { + tmp = *ap; + *ap = *bp; + *bp = tmp; + ap++; + bp++; + } +} + +/** + * Swap i_data and associated attributes between @inode1 and @inode2. + * This function is used for the primary swap between inode1 and inode2 + * and also to revert this primary swap in case of errors. + * + * Therefore you have to make sure, that calling this method twice + * will revert all changes. + * + * @inode1: pointer to first inode + * @inode2: pointer to second inode + */ +static void swap_inode_data(struct inode *inode1, struct inode *inode2) +{ + loff_t isize; + struct ext4_inode_info *ei1; + struct ext4_inode_info *ei2; + + ei1 = EXT4_I(inode1); + ei2 = EXT4_I(inode2); + + memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags)); + memswap(&inode1->i_version, &inode2->i_version, + sizeof(inode1->i_version)); + memswap(&inode1->i_blocks, &inode2->i_blocks, + sizeof(inode1->i_blocks)); + memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes)); + memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime)); + memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime)); + + memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); + memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); + memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); + memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree)); + memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr)); + + isize = i_size_read(inode1); + i_size_write(inode1, i_size_read(inode2)); + i_size_write(inode2, isize); +} + +/** + * Swap the information from the given @inode and the inode + * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other + * important fields of the inodes. + * + * @sb: the super block of the filesystem + * @inode: the inode to swap with EXT4_BOOT_LOADER_INO + * + */ +static long swap_inode_boot_loader(struct super_block *sb, + struct inode *inode) +{ + handle_t *handle; + int err; + struct inode *inode_bl; + struct ext4_inode_info *ei; + struct ext4_inode_info *ei_bl; + struct ext4_sb_info *sbi; + + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { + err = -EINVAL; + goto swap_boot_out; + } + + if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto swap_boot_out; + } + + sbi = EXT4_SB(sb); + ei = EXT4_I(inode); + + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); + if (IS_ERR(inode_bl)) { + err = PTR_ERR(inode_bl); + goto swap_boot_out; + } + ei_bl = EXT4_I(inode_bl); + + filemap_flush(inode->i_mapping); + filemap_flush(inode_bl->i_mapping); + + /* Protect orig inodes against a truncate and make sure, + * that only 1 swap_inode_boot_loader is running. */ + ext4_inode_double_lock(inode, inode_bl); + + truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(&inode_bl->i_data, 0); + + /* Wait for all existing dio workers */ + ext4_inode_block_unlocked_dio(inode); + ext4_inode_block_unlocked_dio(inode_bl); + inode_dio_wait(inode); + inode_dio_wait(inode_bl); + + handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); + if (IS_ERR(handle)) { + err = -EINVAL; + goto swap_boot_out; + } + + /* Protect extent tree against block allocations via delalloc */ + ext4_double_down_write_data_sem(inode, inode_bl); + + if (inode_bl->i_nlink == 0) { + /* this inode has never been used as a BOOT_LOADER */ + set_nlink(inode_bl, 1); + i_uid_write(inode_bl, 0); + i_gid_write(inode_bl, 0); + inode_bl->i_flags = 0; + ei_bl->i_flags = 0; + inode_bl->i_version = 1; + i_size_write(inode_bl, 0); + inode_bl->i_mode = S_IFREG; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode_bl); + } else + memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); + } + + swap_inode_data(inode, inode_bl); + + inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode); + + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + inode_bl->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + ext4_discard_preallocations(inode); + + err = ext4_mark_inode_dirty(handle, inode); + if (err < 0) { + ext4_warning(inode->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode->i_ino, err); + /* Revert all changes: */ + swap_inode_data(inode, inode_bl); + } else { + err = ext4_mark_inode_dirty(handle, inode_bl); + if (err < 0) { + ext4_warning(inode_bl->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode_bl->i_ino, err); + /* Revert all changes: */ + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + } + } + + ext4_journal_stop(handle); + + ext4_double_up_write_data_sem(inode, inode_bl); + + ext4_inode_resume_unlocked_dio(inode); + ext4_inode_resume_unlocked_dio(inode_bl); + + ext4_inode_double_unlock(inode, inode_bl); + + iput(inode_bl); + +swap_boot_out: + return err; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -353,6 +545,11 @@ group_add_out: return err; } + case EXT4_IOC_SWAP_BOOT: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + return swap_inode_boot_loader(sb, inode); + case EXT4_IOC_RESIZE_FS: { ext4_fsblk_t n_blocks_count; struct super_block *sb = inode->i_sb; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 33e1c086858b..a2e696e16331 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -144,12 +144,13 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, } /** - * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem + * ext4_double_down_write_data_sem - Acquire two inodes' write lock + * of i_data_sem * * Acquire write lock of i_data_sem of the two inodes */ -static void -double_down_write_data_sem(struct inode *first, struct inode *second) +void +ext4_double_down_write_data_sem(struct inode *first, struct inode *second) { if (first < second) { down_write(&EXT4_I(first)->i_data_sem); @@ -162,14 +163,15 @@ double_down_write_data_sem(struct inode *first, struct inode *second) } /** - * double_up_write_data_sem - Release two inodes' write lock of i_data_sem + * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem * * @orig_inode: original inode structure to be released its lock first * @donor_inode: donor inode structure to be released its lock second * Release write lock of i_data_sem of two inodes (orig and donor). */ -static void -double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) +void +ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode) { up_write(&EXT4_I(orig_inode)->i_data_sem); up_write(&EXT4_I(donor_inode)->i_data_sem); @@ -976,7 +978,7 @@ again: * necessary, just swap data blocks between orig and donor. */ if (uninit) { - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to * fallback to data copying */ uninit = mext_check_coverage(orig_inode, orig_blk_offset, @@ -990,7 +992,7 @@ again: goto drop_data_sem; if (!uninit) { - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } if ((page_has_private(pagep[0]) && @@ -1004,7 +1006,7 @@ again: donor_inode, orig_blk_offset, block_len_in_page, err); drop_data_sem: - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); goto unlock_pages; } data_copy: @@ -1065,11 +1067,11 @@ repair_branches: * Extents are swapped already, but we are not able to copy data. * Try to swap extents to it's original places */ - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, orig_blk_offset, block_len_in_page, &err2); - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); if (replaced_count != block_len_in_page) { EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), "Unable to copy data block," @@ -1209,15 +1211,15 @@ mext_check_arguments(struct inode *orig_inode, } /** - * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 + * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 * * @inode1: the inode structure * @inode2: the inode structure * * Lock two inodes' i_mutex */ -static void -mext_inode_double_lock(struct inode *inode1, struct inode *inode2) +void +ext4_inode_double_lock(struct inode *inode1, struct inode *inode2) { BUG_ON(inode1 == inode2); if (inode1 < inode2) { @@ -1230,15 +1232,15 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) } /** - * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 + * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 * * @inode1: the inode that is released first * @inode2: the inode that is released second * */ -static void -mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) +void +ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2) { mutex_unlock(&inode1->i_mutex); mutex_unlock(&inode2->i_mutex); @@ -1333,7 +1335,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, return -EINVAL; } /* Protect orig and donor inodes against a truncate */ - mext_inode_double_lock(orig_inode, donor_inode); + ext4_inode_double_lock(orig_inode, donor_inode); /* Wait for all existing dio workers */ ext4_inode_block_unlocked_dio(orig_inode); @@ -1342,7 +1344,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, inode_dio_wait(donor_inode); /* Protect extent tree against block allocations via delalloc */ - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ ret = mext_check_arguments(orig_inode, donor_inode, orig_start, donor_start, &len); @@ -1466,7 +1468,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, * b. racing with ->readpage, ->write_begin, and ext4_get_block * in move_extent_per_page */ - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); while (orig_page_offset <= seq_end_page) { @@ -1500,7 +1502,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, block_len_in_page = rest_blocks; } - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); if (ret < 0) break; @@ -1538,10 +1540,10 @@ out: ext4_ext_drop_refs(holecheck_path); kfree(holecheck_path); } - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); ext4_inode_resume_unlocked_dio(orig_inode); ext4_inode_resume_unlocked_dio(donor_inode); - mext_inode_double_unlock(orig_inode, donor_inode); + ext4_inode_double_unlock(orig_inode, donor_inode); return ret; } From e8238f9a8339b3578c85e4192a7a23bc2bdc0333 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 8 Apr 2013 13:02:25 -0400 Subject: [PATCH 19/50] ext4: fix incorrect lock ordering for ext4_ind_migrate existing locking ordering: journal-> i_data_sem, but ext4_ind_migrate() grab locks in opposite order which may result in deadlock. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 6c5a70a7b573..ea607f907232 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4735,6 +4735,10 @@ int ext4_ind_migrate(struct inode *inode) (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EINVAL; + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_check_inode(inode); if (ret) @@ -4758,19 +4762,13 @@ int ext4_ind_migrate(struct inode *inode) } } - handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto errout; - } - ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); memset(ei->i_data, 0, sizeof(ei->i_data)); for (i=0; i < len; i++) ei->i_data[i] = cpu_to_le32(blk++); ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); errout: + ext4_journal_stop(handle); up_write(&EXT4_I(inode)->i_data_sem); return ret; } From bcb1385096caf421363d47e735acda940cb1e12b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 9 Apr 2013 09:21:41 -0400 Subject: [PATCH 20/50] ext4: fix deadlock with quota feature We didn't mark hidden quota files with S_NOQUOTA flag and thus quota was accounted even for quota files. Thus we could recurse back to quota code when adding new blocks to quota file which can easily deadlock. Mark hidden quota files properly. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 525beb6e3e1e..968ca9369175 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4953,6 +4953,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, return PTR_ERR(qf_inode); } + /* Don't account quota for quota files to avoid recursion */ + qf_inode->i_flags |= S_NOQUOTA; err = dquot_enable(qf_inode, type, format_id, flags); iput(qf_inode); From 5c1ff33640293499f9b16450029c9fb4dc06543e Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Tue, 9 Apr 2013 09:27:31 -0400 Subject: [PATCH 21/50] ext4: fix free space estimate in ext4_nonda_switch() Values stored in s_freeclusters_counter and s_dirtyclusters_counter are both in cluster units. Remove the cluster to block conversion applied to s_freeclusters_counter causing an inflated estimate of free space because s_dirtyclusters_counter is not similarly converted. Rename free_blocks and dirty_blocks to better reflect the units these variables contain to avoid future confusion. This fix corrects ENOSPC failures for xfstests 127 and 231 on bigalloc file systems. Signed-off-by: Eric Whitney Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a29bfc2142ef..f9b0b479ff4c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2609,7 +2609,7 @@ out_writepages: static int ext4_nonda_switch(struct super_block *sb) { - s64 free_blocks, dirty_blocks; + s64 free_clusters, dirty_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* @@ -2620,17 +2620,18 @@ static int ext4_nonda_switch(struct super_block *sb) * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. */ - free_blocks = EXT4_C2B(sbi, - percpu_counter_read_positive(&sbi->s_freeclusters_counter)); - dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); + free_clusters = + percpu_counter_read_positive(&sbi->s_freeclusters_counter); + dirty_clusters = + percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); /* * Start pushing delalloc when 1/2 of free blocks are dirty. */ - if (dirty_blocks && (free_blocks < 2 * dirty_blocks)) + if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); - if (2 * free_blocks < 3 * dirty_blocks || - free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { + if (2 * free_clusters < 3 * dirty_clusters || + free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark From eabe0444df90b0cdfa7fdc4f0b4b253f0a498ff7 Mon Sep 17 00:00:00 2001 From: Andrey Sidorov Date: Tue, 9 Apr 2013 12:22:29 -0400 Subject: [PATCH 22/50] ext4: speed-up releasing blocks on commit Improve mb_free_blocks speed by clearing entire range at once instead of iterating over each bit. Freeing block-by-block also makes buddy bitmap subtree flip twice making most of the work a no-op. Very few bits in buddy bitmap require change, e.g. freeing entire group is a 1 bit flip only. As a result, releasing blocks of 60G file now takes 5ms instead of 2.7s. This is especially good for non-preemptive kernels as there is no rescheduling during release. Signed-off-by: Andrey Sidorov Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 239 +++++++++++++++++++++++++++++++++------------- 1 file changed, 175 insertions(+), 64 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8c8d05218021..6a87f7217474 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -405,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr) ext4_clear_bit(bit, addr); } +static inline int mb_test_and_clear_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + return ext4_test_and_clear_bit(bit, addr); +} + static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; @@ -764,6 +770,24 @@ void ext4_mb_generate_buddy(struct super_block *sb, spin_unlock(&EXT4_SB(sb)->s_bal_lock); } +static void mb_regenerate_buddy(struct ext4_buddy *e4b) +{ + int count; + int order = 1; + void *buddy; + + while ((buddy = mb_find_buddy(e4b, order++, &count))) { + ext4_set_bits(buddy, 0, count); + } + e4b->bd_info->bb_fragments = 0; + memset(e4b->bd_info->bb_counters, 0, + sizeof(*e4b->bd_info->bb_counters) * + (e4b->bd_sb->s_blocksize_bits + 2)); + + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, + e4b->bd_bitmap, e4b->bd_group); +} + /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve @@ -1246,6 +1270,33 @@ static void mb_clear_bits(void *bm, int cur, int len) } } +/* clear bits in given range + * will return first found zero bit if any, -1 otherwise + */ +static int mb_test_and_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + int zero_bit = -1; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + if (*addr != (__u32)(-1) && zero_bit == -1) + zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); + *addr = 0; + cur += 32; + continue; + } + if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) + zero_bit = cur; + cur++; + } + + return zero_bit; +} + void ext4_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1264,17 +1315,90 @@ void ext4_set_bits(void *bm, int cur, int len) } } -static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, - int first, int count) +/* + * _________________________________________________________________ */ + +static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) { - int block = 0; - int max = 0; - int order; - void *buddy; - void *buddy2; + if (mb_test_bit(*bit + side, bitmap)) { + mb_clear_bit(*bit, bitmap); + (*bit) -= side; + return 1; + } + else { + (*bit) += side; + mb_set_bit(*bit, bitmap); + return -1; + } +} + +static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) +{ + int max; + int order = 1; + void *buddy = mb_find_buddy(e4b, order, &max); + + while (buddy) { + void *buddy2; + + /* Bits in range [first; last] are known to be set since + * corresponding blocks were allocated. Bits in range + * (first; last) will stay set because they form buddies on + * upper layer. We just deal with borders if they don't + * align with upper layer and then go up. + * Releasing entire group is all about clearing + * single bit of highest order buddy. + */ + + /* Example: + * --------------------------------- + * | 1 | 1 | 1 | 1 | + * --------------------------------- + * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------- + * 0 1 2 3 4 5 6 7 + * \_____________________/ + * + * Neither [1] nor [6] is aligned to above layer. + * Left neighbour [0] is free, so mark it busy, + * decrease bb_counters and extend range to + * [0; 6] + * Right neighbour [7] is busy. It can't be coaleasced with [6], so + * mark [6] free, increase bb_counters and shrink range to + * [0; 5]. + * Then shift range to [0; 2], go up and do the same. + */ + + + if (first & 1) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); + if (!(last & 1)) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); + if (first > last) + break; + order++; + + if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { + mb_clear_bits(buddy, first, last - first + 1); + e4b->bd_info->bb_counters[order - 1] += last - first + 1; + break; + } + first >>= 1; + last >>= 1; + buddy = buddy2; + } +} + +static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + int left_is_free = 0; + int right_is_free = 0; + int block; + int last = first + count - 1; struct super_block *sb = e4b->bd_sb; - BUG_ON(first + count > (sb->s_blocksize << 3)); + BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); @@ -1283,67 +1407,54 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; - /* let's maintain fragments counter */ + /* access memory sequentially: check left neighbour, + * clear range and then check right neighbour + */ if (first != 0) - block = !mb_test_bit(first - 1, e4b->bd_bitmap); - if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) - max = !mb_test_bit(first + count, e4b->bd_bitmap); - if (block && max) + left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); + block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); + if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) + right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); + + if (unlikely(block != -1)) { + ext4_fsblk_t blocknr; + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += EXT4_C2B(EXT4_SB(sb), block); + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, + blocknr, + "freeing already freed block " + "(bit %u)", block); + mb_regenerate_buddy(e4b); + goto done; + } + + /* let's maintain fragments counter */ + if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; - else if (!block && !max) + else if (!left_is_free && !right_is_free) e4b->bd_info->bb_fragments++; - /* let's maintain buddy itself */ - while (count-- > 0) { - block = first++; - order = 0; - - if (!mb_test_bit(block, e4b->bd_bitmap)) { - ext4_fsblk_t blocknr; - - blocknr = ext4_group_first_block_no(sb, e4b->bd_group); - blocknr += EXT4_C2B(EXT4_SB(sb), block); - ext4_grp_locked_error(sb, e4b->bd_group, - inode ? inode->i_ino : 0, - blocknr, - "freeing already freed block " - "(bit %u)", block); - } - mb_clear_bit(block, e4b->bd_bitmap); - e4b->bd_info->bb_counters[order]++; - - /* start of the buddy */ - buddy = mb_find_buddy(e4b, order, &max); - - do { - block &= ~1UL; - if (mb_test_bit(block, buddy) || - mb_test_bit(block + 1, buddy)) - break; - - /* both the buddies are free, try to coalesce them */ - buddy2 = mb_find_buddy(e4b, order + 1, &max); - - if (!buddy2) - break; - - if (order > 0) { - /* for special purposes, we don't set - * free bits in bitmap */ - mb_set_bit(block, buddy); - mb_set_bit(block + 1, buddy); - } - e4b->bd_info->bb_counters[order]--; - e4b->bd_info->bb_counters[order]--; - - block = block >> 1; - order++; - e4b->bd_info->bb_counters[order]++; - - mb_clear_bit(block, buddy2); - buddy = buddy2; - } while (1); + /* buddy[0] == bd_bitmap is a special case, so handle + * it right away and let mb_buddy_mark_free stay free of + * zero order checks. + * Check if neighbours are to be coaleasced, + * adjust bitmap bb_counters and borders appropriately. + */ + if (first & 1) { + first += !left_is_free; + e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; } + if (!(last & 1)) { + last -= !right_is_free; + e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; + } + + if (first <= last) + mb_buddy_mark_free(e4b, first >> 1, last >> 1); + +done: mb_set_largest_free_order(sb, e4b->bd_info); mb_check_buddy(e4b); } From f45a5ef91bef7e02149a216ed6dc3fcdd8b38268 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 9 Apr 2013 12:39:26 -0400 Subject: [PATCH 23/50] ext4: improve credit estimate for EXT4_SINGLEDATA_TRANS_BLOCKS Estimate of 27 credits for allocation of a block in extent based inode is unnecessarily high. We can easily argue 20 is enough. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_jbd2.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index aeed0bac693b..c8c6885406db 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -29,11 +29,13 @@ * block to complete the transaction. * * For extents-enabled fs we may have to allocate and modify up to - * 5 levels of tree + root which are stored in the inode. */ + * 5 levels of tree, data block (for each of these we need bitmap + group + * summaries), root which is stored in the inode, sb + */ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ - ? 27U : 8U) + ? 20U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode From 27dd43854227bb0e6ab70129bd21b60d396db2e7 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 9 Apr 2013 22:11:22 -0400 Subject: [PATCH 24/50] ext4: introduce reserved space Currently in ENOSPC condition when writing into unwritten space, or punching a hole, we might need to split the extent and grow extent tree. However since we can not allocate any new metadata blocks we'll have to zero out unwritten part of extent or punched out part of extent, or in the worst case return ENOSPC even though use actually does not allocate any space. Also in delalloc path we do reserve metadata and data blocks for the time we're going to write out, however metadata block reservation is very tricky especially since we expect that logical connectivity implies physical connectivity, however that might not be the case and hence we might end up allocating more metadata blocks than previously reserved. So in future, metadata reservation checks should be removed since we can not assure that we do not under reserve. And this is where reserved space comes into the picture. When mounting the file system we slice off a little bit of the file system space (2% or 4096 clusters, whichever is smaller) which can be then used for the cases mentioned above to prevent costly zeroout, or unexpected ENOSPC. The number of reserved clusters can be set via sysfs, however it can never be bigger than number of free clusters in the file system. Note that this patch fixes the failure of xfstest 274 as expected. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Reviewed-by: Carlos Maiolino --- Documentation/filesystems/ext4.txt | 11 ++++ fs/ext4/balloc.c | 18 +++++-- fs/ext4/ext4.h | 13 +++-- fs/ext4/extents.c | 27 ++++++---- fs/ext4/inode.c | 11 +++- fs/ext4/super.c | 84 ++++++++++++++++++++++++++++-- 6 files changed, 141 insertions(+), 23 deletions(-) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 5dd957d8b25b..f7cbf574a875 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -494,6 +494,17 @@ Files in /sys/fs/ext4/ session_write_kbytes This file is read-only and shows the number of kilobytes of data that have been written to this filesystem since it was mounted. + + reserved_clusters This is RW file and contains number of reserved + clusters in the file system which will be used + in the specific situations to avoid costly + zeroout, unexpected ENOSPC, or possible data + loss. The default is 2% or 4096 clusters, + whichever is smaller and this can be changed + however it can never exceed number of clusters + in the file system. If there is not enough space + for the reserved space when mounting the file + mount will _not_ fail. .............................................................................. Ioctls diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 9e8d8ffb063f..8dcaea69e37f 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -499,20 +499,22 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) static int ext4_has_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { - s64 free_clusters, dirty_clusters, root_clusters; + s64 free_clusters, dirty_clusters, rsv, resv_clusters; struct percpu_counter *fcc = &sbi->s_freeclusters_counter; struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; free_clusters = percpu_counter_read_positive(fcc); dirty_clusters = percpu_counter_read_positive(dcc); + resv_clusters = atomic64_read(&sbi->s_resv_clusters); /* * r_blocks_count should always be multiple of the cluster ratio so * we are safe to do a plane bit shift only. */ - root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; + rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) + + resv_clusters; - if (free_clusters - (nclusters + root_clusters + dirty_clusters) < + if (free_clusters - (nclusters + rsv + dirty_clusters) < EXT4_FREECLUSTERS_WATERMARK) { free_clusters = percpu_counter_sum_positive(fcc); dirty_clusters = percpu_counter_sum_positive(dcc); @@ -520,15 +522,21 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, /* Check whether we have space after accounting for current * dirty clusters & root reserved clusters. */ - if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters)) + if (free_clusters >= (rsv + nclusters + dirty_clusters)) return 1; /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || capable(CAP_SYS_RESOURCE) || - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + if (free_clusters >= (nclusters + dirty_clusters + + resv_clusters)) + return 1; + } + /* No free blocks. Let's see if we can dip into reserved pool */ + if (flags & EXT4_MB_USE_RESERVED) { if (free_clusters >= (nclusters + dirty_clusters)) return 1; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d91871570982..12b560435aba 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -121,6 +121,8 @@ typedef unsigned int ext4_group_t; #define EXT4_MB_STREAM_ALLOC 0x0800 /* Use reserved root blocks if needed */ #define EXT4_MB_USE_ROOT_BLOCKS 0x1000 +/* Use blocks from reserved pool */ +#define EXT4_MB_USE_RESERVED 0x2000 struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -557,9 +559,8 @@ enum { #define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ EXT4_GET_BLOCKS_CREATE) - /* Caller is from the delayed allocation writeout path, - so set the magic i_delalloc_reserve_flag after taking the - inode allocation semaphore for */ + /* Caller is from the delayed allocation writeout path + * finally doing the actual allocation of delayed blocks */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* caller is from the direct IO path, request to creation of an unitialized extents if not allocated, split the uninitialized @@ -571,8 +572,9 @@ enum { /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) - /* Punch out blocks of an extent */ -#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 + /* Eventual metadata allocation (due to growing extent tree) + * should not fail, so try to use reserved blocks for that.*/ +#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 /* Don't normalize allocation size (used for fallocate) */ #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* Request will not result in inode size update (user for fallocate) */ @@ -1188,6 +1190,7 @@ struct ext4_sb_info { unsigned int s_mount_flags; unsigned int s_def_mount_opt; ext4_fsblk_t s_sb_block; + atomic64_t s_resv_clusters; kuid_t s_resuid; kgid_t s_resgid; unsigned short s_mount_state; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ea607f907232..8b158ae2443b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1942,8 +1942,8 @@ prepend: * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. */ - if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) - flags = EXT4_MB_USE_ROOT_BLOCKS; + if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL) + flags = EXT4_MB_USE_RESERVED; err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); if (err) goto cleanup; @@ -2729,12 +2729,14 @@ again: /* * Split the extent in two so that 'end' is the last - * block in the first new extent + * block in the first new extent. Also we should not + * fail removing space due to ENOSPC so try to use + * reserved block if that happens. */ err = ext4_split_extent_at(handle, inode, path, - end + 1, split_flag, - EXT4_GET_BLOCKS_PRE_IO | - EXT4_GET_BLOCKS_PUNCH_OUT_EXT); + end + 1, split_flag, + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_METADATA_NOFAIL); if (err < 0) goto out; @@ -3209,7 +3211,8 @@ out: static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path) + struct ext4_ext_path *path, + int flags) { struct ext4_sb_info *sbi; struct ext4_extent_header *eh; @@ -3435,7 +3438,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } allocated = ext4_split_extent(handle, inode, path, - &split_map, split_flag, 0); + &split_map, split_flag, flags); if (allocated < 0) err = allocated; @@ -3755,6 +3758,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, flags, allocated); ext4_ext_show_leaf(inode, path); + /* + * When writing into uninitialized space, we should not fail to + * allocate metadata blocks for the new extent block if needed. + */ + flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; + trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, allocated, newblock); @@ -3818,7 +3827,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, } /* buffered write, writepage time, convert*/ - ret = ext4_ext_convert_to_initialized(handle, inode, map, path); + ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out: diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9b0b479ff4c..629d67b62dfb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1688,12 +1688,21 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) */ map.m_lblk = next; map.m_len = max_blocks; - get_blocks_flags = EXT4_GET_BLOCKS_CREATE; + /* + * We're in delalloc path and it is possible that we're going to + * need more metadata blocks than previously reserved. However + * we must not fail because we're in writeback and there is + * nothing we can do about it so it might result in data loss. + * So use reserved blocks to allocate metadata if possible. + */ + get_blocks_flags = EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_METADATA_NOFAIL; if (ext4_should_dioread_nolock(mpd->inode)) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; if (mpd->b_state & (1 << BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); if (blks < 0) { struct super_block *sb = mpd->inode->i_sb; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 968ca9369175..6fea87db7daa 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -81,6 +81,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext2_fs_type = { @@ -2382,6 +2383,17 @@ struct ext4_attr { int offset; }; +static int parse_strtoull(const char *buf, + unsigned long long max, unsigned long long *value) +{ + int ret; + + ret = kstrtoull(skip_spaces(buf), 0, value); + if (!ret && *value > max) + ret = -EINVAL; + return ret; +} + static int parse_strtoul(const char *buf, unsigned long max, unsigned long *value) { @@ -2466,6 +2478,27 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, return count; } +static ssize_t reserved_clusters_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); +} + +static ssize_t reserved_clusters_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long long val; + int ret; + + if (parse_strtoull(buf, -1ULL, &val)) + return -EINVAL; + ret = ext4_reserve_clusters(sbi, val); + + return ret ? ret : count; +} + static ssize_t trigger_test_error(struct ext4_attr *a, struct ext4_sb_info *sbi, const char *buf, size_t count) @@ -2503,6 +2536,7 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) EXT4_RO_ATTR(delayed_allocation_blocks); EXT4_RO_ATTR(session_write_kbytes); EXT4_RO_ATTR(lifetime_write_kbytes); +EXT4_RW_ATTR(reserved_clusters); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, inode_readahead_blks_store, s_inode_readahead_blks); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); @@ -2520,6 +2554,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(reserved_clusters), ATTR_LIST(inode_readahead_blks), ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), @@ -3195,6 +3230,40 @@ int ext4_calculate_overhead(struct super_block *sb) return 0; } + +static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) +{ + ext4_fsblk_t resv_clusters; + + /* + * By default we reserve 2% or 4096 clusters, whichever is smaller. + * This should cover the situations where we can not afford to run + * out of space like for example punch hole, or converting + * uninitialized extents in delalloc path. In most cases such + * allocation would require 1, or 2 blocks, higher numbers are + * very rare. + */ + resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; + + do_div(resv_clusters, 50); + resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); + + return resv_clusters; +} + + +static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count) +{ + ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >> + sbi->s_cluster_bits; + + if (count >= clusters) + return -EINVAL; + + atomic64_set(&sbi->s_resv_clusters, count); + return 0; +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) { char *orig_data = kstrdup(data, GFP_KERNEL); @@ -3918,6 +3987,13 @@ no_journal: "available"); } + err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi)); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " + "reserved pool", ext4_calculate_resv_clusters(sbi)); + goto failed_mount4a; + } + err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " @@ -4750,9 +4826,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - ext4_fsblk_t overhead = 0; + ext4_fsblk_t overhead = 0, resv_blocks; u64 fsid; s64 bfree; + resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); if (!test_opt(sb, MINIX_DF)) overhead = sbi->s_overhead; @@ -4764,8 +4841,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); - buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); - if (buf->f_bfree < ext4_r_blocks_count(es)) + buf->f_bavail = buf->f_bfree - + (ext4_r_blocks_count(es) + resv_blocks); + if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); From 8c8e0ca622847a8b1b281b8927d62229effa0004 Mon Sep 17 00:00:00 2001 From: Dmitri Monakho Date: Tue, 9 Apr 2013 22:48:36 -0400 Subject: [PATCH 25/50] ext4: fix usless declarations This patch should fix sparse complains about shadow declatations. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 1 - fs/ext4/ioctl.c | 1 - fs/ext4/mballoc.c | 2 -- fs/ext4/move_extent.c | 1 - 4 files changed, 5 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 6c5bb8d993fe..4c358f711cef 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -899,7 +899,6 @@ got: if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { __u32 csum; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index cbc3acea6bcf..9491ac0590f7 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -552,7 +552,6 @@ group_add_out: case EXT4_IOC_RESIZE_FS: { ext4_fsblk_t n_blocks_count; - struct super_block *sb = inode->i_sb; int err = 0, err2 = 0; ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6a87f7217474..a11ea4d6164c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -884,8 +884,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) first_block = page->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { - int group; - group = (first_block + i) >> 1; if (group >= ngroups) break; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index a2e696e16331..661f4ce688ec 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -858,7 +858,6 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) if (buffer_uptodate(bh)) continue; if (!buffer_mapped(bh)) { - int err = 0; err = ext4_get_block(inode, block, bh, 0); if (err) { SetPageError(page); From 0b65349ebc24b80cf9f6654a1c4f66c8c796e9f1 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 9 Apr 2013 23:56:44 -0400 Subject: [PATCH 26/50] ext4: fix big-endian bug in extent migration code Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8b158ae2443b..34ba222f32fa 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4756,7 +4756,7 @@ int ext4_ind_migrate(struct inode *inode) eh = ext_inode_hdr(inode); ex = EXT_FIRST_EXTENT(eh); if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || - eh->eh_depth != 0 || eh->eh_entries > 1) { + eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { ret = -EOPNOTSUPP; goto errout; } From 171a7f21a76a0958c225b97c00a97a10390d40ee Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 9 Apr 2013 23:56:48 -0400 Subject: [PATCH 27/50] ext4: fix big-endian bug in metadata checksum calculations Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 8 ++++---- fs/ext4/mmp.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 629d67b62dfb..62189c84175f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -55,21 +55,21 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, __u16 csum_hi = 0; __u32 csum; - csum_lo = raw->i_checksum_lo; + csum_lo = le16_to_cpu(raw->i_checksum_lo); raw->i_checksum_lo = 0; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum_hi = raw->i_checksum_hi; + csum_hi = le16_to_cpu(raw->i_checksum_hi); raw->i_checksum_hi = 0; } csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, EXT4_INODE_SIZE(inode->i_sb)); - raw->i_checksum_lo = csum_lo; + raw->i_checksum_lo = cpu_to_le16(csum_lo); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) - raw->i_checksum_hi = csum_hi; + raw->i_checksum_hi = cpu_to_le16(csum_hi); return csum; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index f9b551561d2c..b3b1f7d99448 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -7,7 +7,7 @@ #include "ext4.h" /* Checksumming functions */ -static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) +static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct mmp_struct, mmp_checksum); From d6a771056b32146da1280f7872f6936b0c7770ea Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 9 Apr 2013 23:59:55 -0400 Subject: [PATCH 28/50] ext4: fix miscellaneous big endian warnings None of these result in any bug, but they makes sparse complain. Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 7 ++++--- fs/ext4/super.c | 6 +++--- fs/ext4/xattr.c | 13 +++++++------ include/trace/events/ext4.h | 6 +++--- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 3825d6aa8336..45a5ca89797f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -416,15 +416,16 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - __u32 csum, old_csum; + __u32 csum; + __le32 save_csum; int size; size = count_offset + (count * sizeof(struct dx_entry)); - old_csum = t->dt_checksum; + save_csum = t->dt_checksum; t->dt_checksum = 0; csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); - t->dt_checksum = old_csum; + t->dt_checksum = save_csum; return cpu_to_le32(csum); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6fea87db7daa..f355c28fa080 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1952,16 +1952,16 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, if ((sbi->s_es->s_feature_ro_compat & cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { /* Use new metadata_csum algorithm */ - __u16 old_csum; + __le16 save_csum; __u32 csum32; - old_csum = gdp->bg_checksum; + save_csum = gdp->bg_checksum; gdp->bg_checksum = 0; csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, sbi->s_desc_size); - gdp->bg_checksum = old_csum; + gdp->bg_checksum = save_csum; crc = csum32 & 0xFFFF; goto out; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3a120b277240..c081e34f717f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -122,17 +122,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, struct ext4_xattr_header *hdr) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - __u32 csum, old; + __u32 csum; + __le32 save_csum; + __le64 dsk_block_nr = cpu_to_le64(block_nr); - old = hdr->h_checksum; + save_csum = hdr->h_checksum; hdr->h_checksum = 0; - block_nr = cpu_to_le64(block_nr); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr, - sizeof(block_nr)); + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); csum = ext4_chksum(sbi, csum, (__u8 *)hdr, EXT4_BLOCK_SIZE(inode->i_sb)); - hdr->h_checksum = old; + hdr->h_checksum = save_csum; return cpu_to_le32(csum); } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 58459b785565..d0e686402df8 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1948,7 +1948,7 @@ TRACE_EVENT(ext4_remove_blocks, __entry->to = to; __entry->partial = partial_cluster; __entry->ee_pblk = ext4_ext_pblock(ex); - __entry->ee_lblk = cpu_to_le32(ex->ee_block); + __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_len = ext4_ext_get_actual_len(ex); ), @@ -2052,7 +2052,7 @@ TRACE_EVENT(ext4_ext_remove_space, TRACE_EVENT(ext4_ext_remove_space_done, TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth, - ext4_lblk_t partial, unsigned short eh_entries), + ext4_lblk_t partial, __le16 eh_entries), TP_ARGS(inode, start, depth, partial, eh_entries), @@ -2071,7 +2071,7 @@ TRACE_EVENT(ext4_ext_remove_space_done, __entry->start = start; __entry->depth = depth; __entry->partial = partial; - __entry->eh_entries = eh_entries; + __entry->eh_entries = le16_to_cpu(eh_entries); ), TP_printk("dev %d,%d ino %lu since %u depth %d partial %u " From 0d14b098ce05c067e06eea5ed63d9b5c14656bdb Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 10 Apr 2013 23:32:52 -0400 Subject: [PATCH 29/50] ext4: move ext4_ind_migrate() into migrate.c Move ext4_ind_migrate() into migrate.c file since it makes much more sense and ext4_ext_migrate() is there as well. Also fix tiny style problem - add spaces around "=" in "i=0". Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 57 ---------------------------------------------- fs/ext4/migrate.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 58 insertions(+), 59 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 12b560435aba..75b2326b04c6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2136,6 +2136,7 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); +extern int ext4_ind_migrate(struct inode *inode); /* namei.c */ extern int ext4_dirent_csum_verify(struct inode *inode, @@ -2625,7 +2626,6 @@ extern int ext4_find_delalloc_range(struct inode *inode, extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); -extern int ext4_ind_migrate(struct inode *inode); /* move_extent.c */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 34ba222f32fa..6fcb375c8fde 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4724,60 +4724,3 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return error; } - -/* - * Migrate a simple extent-based inode to use the i_blocks[] array - */ -int ext4_ind_migrate(struct inode *inode) -{ - struct ext4_extent_header *eh; - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_extent *ex; - unsigned int i, len; - ext4_fsblk_t blk; - handle_t *handle; - int ret; - - if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_INCOMPAT_EXTENTS) || - (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return -EINVAL; - - handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - down_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_ext_check_inode(inode); - if (ret) - goto errout; - - eh = ext_inode_hdr(inode); - ex = EXT_FIRST_EXTENT(eh); - if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || - eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { - ret = -EOPNOTSUPP; - goto errout; - } - if (eh->eh_entries == 0) - blk = len = 0; - else { - len = le16_to_cpu(ex->ee_len); - blk = ext4_ext_pblock(ex); - if (len > EXT4_NDIR_BLOCKS) { - ret = -EOPNOTSUPP; - goto errout; - } - } - - ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); - memset(ei->i_data, 0, sizeof(ei->i_data)); - for (i=0; i < len; i++) - ei->i_data[i] = cpu_to_le32(blk++); - ext4_mark_inode_dirty(handle, inode); -errout: - ext4_journal_stop(handle); - up_write(&EXT4_I(inode)->i_data_sem); - return ret; -} diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 480acf4a085f..d129a4dc16a7 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -426,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode) return retval; } return retval; - } int ext4_ext_migrate(struct inode *inode) @@ -606,3 +605,60 @@ out: return retval; } + +/* + * Migrate a simple extent-based inode to use the i_blocks[] array + */ +int ext4_ind_migrate(struct inode *inode) +{ + struct ext4_extent_header *eh; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_extent *ex; + unsigned int i, len; + ext4_fsblk_t blk; + handle_t *handle; + int ret; + + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || + (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EINVAL; + + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_ext_check_inode(inode); + if (ret) + goto errout; + + eh = ext_inode_hdr(inode); + ex = EXT_FIRST_EXTENT(eh); + if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || + eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { + ret = -EOPNOTSUPP; + goto errout; + } + if (eh->eh_entries == 0) + blk = len = 0; + else { + len = le16_to_cpu(ex->ee_len); + blk = ext4_ext_pblock(ex); + if (len > EXT4_NDIR_BLOCKS) { + ret = -EOPNOTSUPP; + goto errout; + } + } + + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + memset(ei->i_data, 0, sizeof(ei->i_data)); + for (i=0; i < len; i++) + ei->i_data[i] = cpu_to_le32(blk++); + ext4_mark_inode_dirty(handle, inode); +errout: + ext4_journal_stop(handle); + up_write(&EXT4_I(inode)->i_data_sem); + return ret; +} From 43e50f50866513144026f2ac1a1c5cc20c7a8428 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 11 Apr 2013 10:54:46 -0400 Subject: [PATCH 30/50] ext4: do not convert to indirect with bigalloc enabled With bigalloc feature enabled we do not support indirect addressing at all so we have to prevent extent addressing to indirect addressing conversion in this case. The problem has been introduced with the commit "ext4: support simple conversion of extent-mapped inodes to use i_blocks" Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/migrate.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index d129a4dc16a7..49e8bdff9163 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -625,6 +625,10 @@ int ext4_ind_migrate(struct inode *inode) (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EINVAL; + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) + return -EOPNOTSUPP; + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); From 7e8b12c60ad38fb90a162df4e6fc120e3bee104e Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Thu, 11 Apr 2013 23:24:58 -0400 Subject: [PATCH 31/50] ext4: defragmentation code cleanup - grab_cache_page_write_begin() may not wait on page's writeback since (1d1d1a767206). But it is still reasonable to wait on page's writeback here in order to be on the safe side. - Fix miss typo: pass 'length' instead of 'end' to __block_write_begin() https://bugzilla.kernel.org/show_bug.cgi?id=56241 TESTCASE: git://oss.sgi.com/xfs/cmds/xfstests.git MKFS_OPTIONS="-b1024" ; ./check ext4/304 Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Akira Fujita --- fs/ext4/move_extent.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 661f4ce688ec..309ca899a731 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -739,6 +739,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, donor_off += dext_alen; orig_off += dext_alen; + BUG_ON(replaced_count > count); /* Already moved the expected blocks */ if (replaced_count >= count) break; @@ -816,7 +817,13 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, page_cache_release(page[0]); return -ENOMEM; } - + /* + * grab_cache_page_write_begin() may not wait on page's writeback if + * BDI not demand that. But it is reasonable to be very conservative + * here and explicitly wait on page's writeback + */ + wait_on_page_writeback(page[0]); + wait_on_page_writeback(page[1]); if (inode1 > inode2) { struct page *tmp; tmp = page[0]; @@ -1034,7 +1041,7 @@ data_copy: } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - *err = __block_write_begin(pagep[0], from, from + replaced_size, + *err = __block_write_begin(pagep[0], from, replaced_size, ext4_get_block); if (!*err) *err = block_commit_write(pagep[0], from, from + replaced_size); From e1091b157c330a21bb0eaa881efe0489a1697ed7 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 11 Apr 2013 23:37:19 -0400 Subject: [PATCH 32/50] ext4: Use kstrtoul() instead of parse_strtoul() In parse_strtoul() we're still using deprecated simple_strtoul(). Remove parse_strtoul() altogether and replace it with kstrtoul() Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f355c28fa080..bfa29ecfb47a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2394,19 +2394,6 @@ static int parse_strtoull(const char *buf, return ret; } -static int parse_strtoul(const char *buf, - unsigned long max, unsigned long *value) -{ - char *endp; - - *value = simple_strtoul(skip_spaces(buf), &endp, 0); - endp = skip_spaces(endp); - if (*endp || *value > max) - return -EINVAL; - - return 0; -} - static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, struct ext4_sb_info *sbi, char *buf) @@ -2446,11 +2433,13 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, const char *buf, size_t count) { unsigned long t; + int ret; - if (parse_strtoul(buf, 0x40000000, &t)) - return -EINVAL; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; - if (t && !is_power_of_2(t)) + if (t && (!is_power_of_2(t) || t > 0x40000000)) return -EINVAL; sbi->s_inode_readahead_blks = t; @@ -2471,9 +2460,11 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, { unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); unsigned long t; + int ret; - if (parse_strtoul(buf, 0xffffffff, &t)) - return -EINVAL; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; *ui = t; return count; } From 0058f9658c94037173f7603fc8bae2007cc10253 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 11 Apr 2013 23:48:32 -0400 Subject: [PATCH 33/50] ext4: make ext4_bio_write_page() use BH_Async_Write flags So far ext4_bio_write_page() attached all the pages to ext4_io_end structure. This makes that structure pretty heavy (1 KB for pointers + 16 bytes per page attached to the bio). Also later we would like to share ext4_io_end structure among several bios in case IO to a single extent needs to be split among several bios and pointing to pages from ext4_io_end makes this complex. We remove page pointers from ext4_io_end and use pointers from bio itself instead. This isn't as easy when blocksize < pagesize because then we can have several bios in flight for a single page and we have to be careful when to call end_page_writeback(). However this is a known problem already solved by block_write_full_page() / end_buffer_async_write() so we mimic its behavior here. We mark buffers going to disk with BH_Async_Write flag and in ext4_bio_end_io() we check whether there are any buffers with BH_Async_Write flag left. If there are not, we can call end_page_writeback(). Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Reviewed-by: Dmitry Monakhov Reviewed-by: Zheng Liu --- fs/ext4/ext4.h | 14 ---- fs/ext4/page-io.c | 163 ++++++++++++++++++++++------------------------ 2 files changed, 77 insertions(+), 100 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 75b2326b04c6..3b41d4ae6f9d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -198,19 +198,8 @@ struct mpage_da_data { #define EXT4_IO_END_ERROR 0x0002 #define EXT4_IO_END_DIRECT 0x0004 -struct ext4_io_page { - struct page *p_page; - atomic_t p_count; -}; - -#define MAX_IO_PAGES 128 - /* * For converting uninitialized extents on a work queue. - * - * 'page' is only used from the writepage() path; 'pages' is only used for - * buffered writes; they are used to keep page references until conversion - * takes place. For AIO/DIO, neither field is filled in. */ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ @@ -220,15 +209,12 @@ typedef struct ext4_io_end { ssize_t size; /* size of the extent */ struct kiocb *iocb; /* iocb struct for AIO */ int result; /* error value for AIO */ - int num_io_pages; /* for writepages() */ - struct ext4_io_page *pages[MAX_IO_PAGES]; /* for writepages() */ } ext4_io_end_t; struct ext4_io_submit { int io_op; struct bio *io_bio; ext4_io_end_t *io_end; - struct ext4_io_page *io_page; sector_t io_next_block; }; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 047a6de04a0a..1d98fcfc2ff0 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -29,25 +29,19 @@ #include "xattr.h" #include "acl.h" -static struct kmem_cache *io_page_cachep, *io_end_cachep; +static struct kmem_cache *io_end_cachep; int __init ext4_init_pageio(void) { - io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); - if (io_page_cachep == NULL) - return -ENOMEM; io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); - if (io_end_cachep == NULL) { - kmem_cache_destroy(io_page_cachep); + if (io_end_cachep == NULL) return -ENOMEM; - } return 0; } void ext4_exit_pageio(void) { kmem_cache_destroy(io_end_cachep); - kmem_cache_destroy(io_page_cachep); } /* @@ -67,15 +61,6 @@ void ext4_ioend_shutdown(struct inode *inode) cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); } -static void put_io_page(struct ext4_io_page *io_page) -{ - if (atomic_dec_and_test(&io_page->p_count)) { - end_page_writeback(io_page->p_page); - put_page(io_page->p_page); - kmem_cache_free(io_page_cachep, io_page); - } -} - void ext4_free_io_end(ext4_io_end_t *io) { int i; @@ -84,9 +69,6 @@ void ext4_free_io_end(ext4_io_end_t *io) BUG_ON(!list_empty(&io->list)); BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); - for (i = 0; i < io->num_io_pages; i++) - put_io_page(io->pages[i]); - io->num_io_pages = 0; if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) wake_up_all(ext4_ioend_wq(io->inode)); kmem_cache_free(io_end_cachep, io); @@ -243,45 +225,56 @@ static void ext4_end_bio(struct bio *bio, int error) ext4_io_end_t *io_end = bio->bi_private; struct inode *inode; int i; + int blocksize; sector_t bi_sector = bio->bi_sector; BUG_ON(!io_end); + inode = io_end->inode; + blocksize = 1 << inode->i_blkbits; bio->bi_private = NULL; bio->bi_end_io = NULL; if (test_bit(BIO_UPTODATE, &bio->bi_flags)) error = 0; - bio_put(bio); - - for (i = 0; i < io_end->num_io_pages; i++) { - struct page *page = io_end->pages[i]->p_page; + for (i = 0; i < bio->bi_vcnt; i++) { + struct bio_vec *bvec = &bio->bi_io_vec[i]; + struct page *page = bvec->bv_page; struct buffer_head *bh, *head; - loff_t offset; - loff_t io_end_offset; + unsigned bio_start = bvec->bv_offset; + unsigned bio_end = bio_start + bvec->bv_len; + unsigned under_io = 0; + unsigned long flags; + + if (!page) + continue; if (error) { SetPageError(page); set_bit(AS_EIO, &page->mapping->flags); - head = page_buffers(page); - BUG_ON(!head); - - io_end_offset = io_end->offset + io_end->size; - - offset = (sector_t) page->index << PAGE_CACHE_SHIFT; - bh = head; - do { - if ((offset >= io_end->offset) && - (offset+bh->b_size <= io_end_offset)) - buffer_io_error(bh); - - offset += bh->b_size; - bh = bh->b_this_page; - } while (bh != head); } - - put_io_page(io_end->pages[i]); + bh = head = page_buffers(page); + /* + * We check all buffers in the page under BH_Uptodate_Lock + * to avoid races with other end io clearing async_write flags + */ + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); + do { + if (bh_offset(bh) < bio_start || + bh_offset(bh) + blocksize > bio_end) { + if (buffer_async_write(bh)) + under_io++; + continue; + } + clear_buffer_async_write(bh); + if (error) + buffer_io_error(bh); + } while ((bh = bh->b_this_page) != head); + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); + local_irq_restore(flags); + if (!under_io) + end_page_writeback(page); } - io_end->num_io_pages = 0; - inode = io_end->inode; + bio_put(bio); if (error) { io_end->flag |= EXT4_IO_END_ERROR; @@ -345,7 +338,6 @@ static int io_submit_init(struct ext4_io_submit *io, } static int io_submit_add_bh(struct ext4_io_submit *io, - struct ext4_io_page *io_page, struct inode *inode, struct writeback_control *wbc, struct buffer_head *bh) @@ -353,11 +345,6 @@ static int io_submit_add_bh(struct ext4_io_submit *io, ext4_io_end_t *io_end; int ret; - if (buffer_new(bh)) { - clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); - } - if (io->io_bio && bh->b_blocknr != io->io_next_block) { submit_and_retry: ext4_io_submit(io); @@ -368,9 +355,6 @@ submit_and_retry: return ret; } io_end = io->io_end; - if ((io_end->num_io_pages >= MAX_IO_PAGES) && - (io_end->pages[io_end->num_io_pages-1] != io_page)) - goto submit_and_retry; if (buffer_uninit(bh)) ext4_set_io_unwritten_flag(inode, io_end); io->io_end->size += bh->b_size; @@ -378,11 +362,6 @@ submit_and_retry: ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; - if ((io_end->num_io_pages == 0) || - (io_end->pages[io_end->num_io_pages-1] != io_page)) { - io_end->pages[io_end->num_io_pages++] = io_page; - atomic_inc(&io_page->p_count); - } return 0; } @@ -392,33 +371,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - unsigned block_start, block_end, blocksize; - struct ext4_io_page *io_page; + unsigned block_start, blocksize; struct buffer_head *bh, *head; int ret = 0; + int nr_submitted = 0; blocksize = 1 << inode->i_blkbits; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); - if (!io_page) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return -ENOMEM; - } - io_page->p_page = page; - atomic_set(&io_page->p_count, 1); - get_page(page); set_page_writeback(page); ClearPageError(page); - for (bh = head = page_buffers(page), block_start = 0; - bh != head || !block_start; - block_start = block_end, bh = bh->b_this_page) { - - block_end = block_start + blocksize; + /* + * In the first loop we prepare and mark buffers to submit. We have to + * mark all buffers in the page before submitting so that + * end_page_writeback() cannot be called from ext4_bio_end_io() when IO + * on the first buffer finishes and we are still working on submitting + * the second buffer. + */ + bh = head = page_buffers(page); + do { + block_start = bh_offset(bh); if (block_start >= len) { /* * Comments copied from block_write_full_page_endio: @@ -431,7 +406,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * mapped, and writes to that region are not written * out to the file." */ - zero_user_segment(page, block_start, block_end); + zero_user_segment(page, block_start, + block_start + blocksize); clear_buffer_dirty(bh); set_buffer_uptodate(bh); continue; @@ -445,7 +421,19 @@ int ext4_bio_write_page(struct ext4_io_submit *io, ext4_io_submit(io); continue; } - ret = io_submit_add_bh(io, io_page, inode, wbc, bh); + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + set_buffer_async_write(bh); + } while ((bh = bh->b_this_page) != head); + + /* Now submit buffers to write */ + bh = head = page_buffers(page); + do { + if (!buffer_async_write(bh)) + continue; + ret = io_submit_add_bh(io, inode, wbc, bh); if (ret) { /* * We only get here on ENOMEM. Not much else @@ -455,17 +443,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io, redirty_page_for_writepage(wbc, page); break; } + nr_submitted++; clear_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + + /* Error stopped previous loop? Clean up buffers... */ + if (ret) { + do { + clear_buffer_async_write(bh); + bh = bh->b_this_page; + } while (bh != head); } unlock_page(page); - /* - * If the page was truncated before we could do the writeback, - * or we had a memory allocation error while trying to write - * the first buffer head, we won't have submitted any pages for - * I/O. In that case we need to make sure we've cleared the - * PageWriteback bit from the page to prevent the system from - * wedging later on. - */ - put_io_page(io_page); + /* Nothing submitted - we have to end page writeback */ + if (!nr_submitted) + end_page_writeback(page); return ret; } From 4eec708d263f0ee10861d69251708a225b64cac7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 11 Apr 2013 23:56:53 -0400 Subject: [PATCH 34/50] ext4: use io_end for multiple bios Change writeback path to create just one io_end structure for the extent to which we submit IO and share it among bios writing that extent. This prevents needless splitting and joining of unwritten extents when they cannot be submitted as a single bio. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Reviewed-by: Dmitry Monakhov Reviewed-by: Zheng Liu --- fs/ext4/ext4.h | 8 ++- fs/ext4/inode.c | 85 ++++++++++++++++++-------------- fs/ext4/page-io.c | 123 ++++++++++++++++++++++++++++------------------ 3 files changed, 129 insertions(+), 87 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3b41d4ae6f9d..779d26b7beff 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -209,6 +209,7 @@ typedef struct ext4_io_end { ssize_t size; /* size of the extent */ struct kiocb *iocb; /* iocb struct for AIO */ int result; /* error value for AIO */ + atomic_t count; /* reference counter */ } ext4_io_end_t; struct ext4_io_submit { @@ -2627,11 +2628,14 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, /* page-io.c */ extern int __init ext4_init_pageio(void); -extern void ext4_add_complete_io(ext4_io_end_t *io_end); extern void ext4_exit_pageio(void); extern void ext4_ioend_shutdown(struct inode *); -extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); +extern int ext4_put_io_end(ext4_io_end_t *io_end); +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); +extern void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc); extern void ext4_end_io_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 62189c84175f..62492e954483 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1483,7 +1483,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, struct ext4_io_submit io_submit; BUG_ON(mpd->next_page <= mpd->first_page); - memset(&io_submit, 0, sizeof(io_submit)); + ext4_io_submit_init(&io_submit, mpd->wbc); + io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_submit.io_end) + return -ENOMEM; /* * We need to start from the first_page to the next_page - 1 * to make sure we also write the mapped dirty buffer_heads. @@ -1571,6 +1574,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, pagevec_release(&pvec); } ext4_io_submit(&io_submit); + /* Drop io_end reference we got from init */ + ext4_put_io_end_defer(io_submit.io_end); return ret; } @@ -2229,9 +2234,16 @@ static int ext4_writepage(struct page *page, */ return __ext4_journalled_writepage(page, len); - memset(&io_submit, 0, sizeof(io_submit)); + ext4_io_submit_init(&io_submit, wbc); + io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_submit.io_end) { + redirty_page_for_writepage(wbc, page); + return -ENOMEM; + } ret = ext4_bio_write_page(&io_submit, page, len, wbc); ext4_io_submit(&io_submit); + /* Drop io_end reference we got from init */ + ext4_put_io_end_defer(io_submit.io_end); return ret; } @@ -3062,9 +3074,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, struct inode *inode = file_inode(iocb->ki_filp); ext4_io_end_t *io_end = iocb->private; - /* if not async direct IO or dio with 0 bytes write, just return */ - if (!io_end || !size) - goto out; + /* if not async direct IO just return */ + if (!io_end) { + inode_dio_done(inode); + if (is_async) + aio_complete(iocb, ret, 0); + return; + } ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", @@ -3072,25 +3088,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, size); iocb->private = NULL; - - /* if not aio dio with unwritten extents, just free io and return */ - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - ext4_free_io_end(io_end); -out: - inode_dio_done(inode); - if (is_async) - aio_complete(iocb, ret, 0); - return; - } - io_end->offset = offset; io_end->size = size; if (is_async) { io_end->iocb = iocb; io_end->result = ret; } - - ext4_add_complete_io(io_end); + ext4_put_io_end_defer(io_end); } /* @@ -3124,6 +3128,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, get_block_t *get_block_func = NULL; int dio_flags = 0; loff_t final_size = offset + count; + ext4_io_end_t *io_end = NULL; /* Use the old path for reads and writes beyond i_size. */ if (rw != WRITE || final_size > inode->i_size) @@ -3162,13 +3167,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, iocb->private = NULL; ext4_inode_aio_set(inode, NULL); if (!is_sync_kiocb(iocb)) { - ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); + io_end = ext4_init_io_end(inode, GFP_NOFS); if (!io_end) { ret = -ENOMEM; goto retake_lock; } io_end->flag |= EXT4_IO_END_DIRECT; - iocb->private = io_end; + /* + * Grab reference for DIO. Will be dropped in ext4_end_io_dio() + */ + iocb->private = ext4_get_io_end(io_end); /* * we save the io structure for current async direct * IO, so that later ext4_map_blocks() could flag the @@ -3192,26 +3200,27 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, NULL, dio_flags); - if (iocb->private) - ext4_inode_aio_set(inode, NULL); /* - * The io_end structure takes a reference to the inode, that - * structure needs to be destroyed and the reference to the - * inode need to be dropped, when IO is complete, even with 0 - * byte write, or failed. - * - * In the successful AIO DIO case, the io_end structure will - * be destroyed and the reference to the inode will be dropped - * after the end_io call back function is called. - * - * In the case there is 0 byte write, or error case, since VFS - * direct IO won't invoke the end_io call back function, we - * need to free the end_io structure here. + * Put our reference to io_end. This can free the io_end structure e.g. + * in sync IO case or in case of error. It can even perform extent + * conversion if all bios we submitted finished before we got here. + * Note that in that case iocb->private can be already set to NULL + * here. */ - if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { - ext4_free_io_end(iocb->private); - iocb->private = NULL; - } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, + if (io_end) { + ext4_inode_aio_set(inode, NULL); + ext4_put_io_end(io_end); + /* + * In case of error or no write ext4_end_io_dio() was not + * called so we have to put iocb's reference. + */ + if (ret <= 0 && ret != -EIOCBQUEUED) { + WARN_ON(iocb->private != io_end); + ext4_put_io_end(io_end); + iocb->private = NULL; + } + } + if (ret > 0 && !overwrite && ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) { int err; /* diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1d98fcfc2ff0..14f9837350d1 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -61,17 +61,28 @@ void ext4_ioend_shutdown(struct inode *inode) cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); } -void ext4_free_io_end(ext4_io_end_t *io) +static void ext4_release_io_end(ext4_io_end_t *io_end) { - int i; + BUG_ON(!list_empty(&io_end->list)); + BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); - BUG_ON(!io); - BUG_ON(!list_empty(&io->list)); - BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); + if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) + wake_up_all(ext4_ioend_wq(io_end->inode)); + if (io_end->flag & EXT4_IO_END_DIRECT) + inode_dio_done(io_end->inode); + if (io_end->iocb) + aio_complete(io_end->iocb, io_end->result, 0); + kmem_cache_free(io_end_cachep, io_end); +} - if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) - wake_up_all(ext4_ioend_wq(io->inode)); - kmem_cache_free(io_end_cachep, io); +static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) +{ + struct inode *inode = io_end->inode; + + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); } /* check a range of space and convert unwritten extents to written. */ @@ -94,13 +105,8 @@ static int ext4_end_io(ext4_io_end_t *io) "(inode %lu, offset %llu, size %zd, error %d)", inode->i_ino, offset, size, ret); } - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(inode)); - if (io->flag & EXT4_IO_END_DIRECT) - inode_dio_done(inode); - if (io->iocb) - aio_complete(io->iocb, io->result, 0); + ext4_clear_io_unwritten_flag(io); + ext4_release_io_end(io); return ret; } @@ -131,7 +137,7 @@ static void dump_completed_IO(struct inode *inode) } /* Add the io_end to per-inode completed end_io list. */ -void ext4_add_complete_io(ext4_io_end_t *io_end) +static void ext4_add_complete_io(ext4_io_end_t *io_end) { struct ext4_inode_info *ei = EXT4_I(io_end->inode); struct workqueue_struct *wq; @@ -168,8 +174,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode) err = ext4_end_io(io); if (unlikely(!ret && err)) ret = err; - io->flag &= ~EXT4_IO_END_UNWRITTEN; - ext4_free_io_end(io); } return ret; } @@ -201,10 +205,43 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; INIT_LIST_HEAD(&io->list); + atomic_set(&io->count, 1); } return io; } +void ext4_put_io_end_defer(ext4_io_end_t *io_end) +{ + if (atomic_dec_and_test(&io_end->count)) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { + ext4_release_io_end(io_end); + return; + } + ext4_add_complete_io(io_end); + } +} + +int ext4_put_io_end(ext4_io_end_t *io_end) +{ + int err = 0; + + if (atomic_dec_and_test(&io_end->count)) { + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + err = ext4_convert_unwritten_extents(io_end->inode, + io_end->offset, io_end->size); + ext4_clear_io_unwritten_flag(io_end); + } + ext4_release_io_end(io_end); + } + return err; +} + +ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) +{ + atomic_inc(&io_end->count); + return io_end; +} + /* * Print an buffer I/O error compatible with the fs/buffer.c. This * provides compatibility with dmesg scrapers that look for a specific @@ -287,12 +324,7 @@ static void ext4_end_bio(struct bio *bio, int error) bi_sector >> (inode->i_blkbits - 9)); } - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - ext4_free_io_end(io_end); - return; - } - - ext4_add_complete_io(io_end); + ext4_put_io_end_defer(io_end); } void ext4_io_submit(struct ext4_io_submit *io) @@ -306,40 +338,37 @@ void ext4_io_submit(struct ext4_io_submit *io) bio_put(io->io_bio); } io->io_bio = NULL; - io->io_op = 0; +} + +void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc) +{ + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + io->io_bio = NULL; io->io_end = NULL; } -static int io_submit_init(struct ext4_io_submit *io, - struct inode *inode, - struct writeback_control *wbc, - struct buffer_head *bh) +static int io_submit_init_bio(struct ext4_io_submit *io, + struct buffer_head *bh) { - ext4_io_end_t *io_end; - struct page *page = bh->b_page; int nvecs = bio_get_nr_vecs(bh->b_bdev); struct bio *bio; - io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) - return -ENOMEM; bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; - bio->bi_private = io->io_end = io_end; bio->bi_end_io = ext4_end_bio; - - io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); - + bio->bi_private = ext4_get_io_end(io->io_end); + if (!io->io_end->size) + io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT) + + bh_offset(bh); io->io_bio = bio; - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); io->io_next_block = bh->b_blocknr; return 0; } static int io_submit_add_bh(struct ext4_io_submit *io, struct inode *inode, - struct writeback_control *wbc, struct buffer_head *bh) { ext4_io_end_t *io_end; @@ -350,18 +379,18 @@ submit_and_retry: ext4_io_submit(io); } if (io->io_bio == NULL) { - ret = io_submit_init(io, inode, wbc, bh); + ret = io_submit_init_bio(io, bh); if (ret) return ret; } - io_end = io->io_end; - if (buffer_uninit(bh)) - ext4_set_io_unwritten_flag(inode, io_end); - io->io_end->size += bh->b_size; - io->io_next_block++; ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; + io_end = io->io_end; + if (buffer_uninit(bh)) + ext4_set_io_unwritten_flag(inode, io_end); + io_end->size += bh->b_size; + io->io_next_block++; return 0; } @@ -433,7 +462,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, do { if (!buffer_async_write(bh)) continue; - ret = io_submit_add_bh(io, inode, wbc, bh); + ret = io_submit_add_bh(io, inode, bh); if (ret) { /* * We only get here on ENOMEM. Not much else From 7b001d6a0c0c7b92e989c2c3b1d8e151f1df1acc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 12 Apr 2013 00:03:19 -0400 Subject: [PATCH 35/50] ext4: clear buffer_uninit flag when submitting IO Currently noone cleared buffer_uninit flag. This results in writeback needlessly marking io_end as needing extent conversion scanning extent tree for extents to convert. So clear the buffer_uninit flag once the buffer is submitted for IO and the flag is transformed into EXT4_IO_END_UNWRITTEN flag. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Reviewed-by: Zheng Liu --- fs/ext4/page-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 14f9837350d1..5929cd0baa20 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -387,7 +387,7 @@ submit_and_retry: if (ret != bh->b_size) goto submit_and_retry; io_end = io->io_end; - if (buffer_uninit(bh)) + if (test_clear_buffer_uninit(bh)) ext4_set_io_unwritten_flag(inode, io_end); io_end->size += bh->b_size; io->io_next_block++; From ae4647fb7654676fc44a97e86eb35f9f06b99f66 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 12 Apr 2013 00:03:42 -0400 Subject: [PATCH 36/50] jbd2: reduce journal_head size Remove unused t_cow_tid field (ext4 copy-on-write support doesn't seem to be happening) and change b_modified and b_jlist to bitfields thus saving 8 bytes in the structure. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Reviewed-by: Zheng Liu --- include/linux/journal-head.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h index c18b46f8aeeb..13a3da25ff07 100644 --- a/include/linux/journal-head.h +++ b/include/linux/journal-head.h @@ -31,21 +31,14 @@ struct journal_head { /* * Journalling list for this buffer [jbd_lock_bh_state()] */ - unsigned b_jlist; + unsigned b_jlist:4; /* * This flag signals the buffer has been modified by * the currently running transaction * [jbd_lock_bh_state()] */ - unsigned b_modified; - - /* - * This feild tracks the last transaction id in which this buffer - * has been cowed - * [jbd_lock_bh_state()] - */ - tid_t b_cow_tid; + unsigned b_modified:1; /* * Copy of the buffer data frozen for writing to the log. From fd03d8daf417fffbcb27fdb30d60f6c81ed813c8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 18 Apr 2013 14:53:15 -0400 Subject: [PATCH 37/50] ext4: reserve xattr index for Rich ACL support Jan Kara SUSE is carrying out of tree patches for Rich ACL support for ext4 as they didn't get upstream due to opposition of some VFS maintainers. Reserve xattr index for Rich ACLs so that it cannot be taken by anything else which would force users to backup and reset their Rich ACLs on files. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/xattr.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index aa25deb5c6cd..c767dbdd7fc4 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -22,6 +22,7 @@ #define EXT4_XATTR_INDEX_LUSTRE 5 #define EXT4_XATTR_INDEX_SECURITY 6 #define EXT4_XATTR_INDEX_SYSTEM 7 +#define EXT4_XATTR_INDEX_RICHACL 8 struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ From eb9cc7e16b32c898a1d715733c590f115aa0a099 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 19 Apr 2013 13:38:14 -0400 Subject: [PATCH 38/50] ext4: move quota initialization out of inode allocation transaction Inode allocation transaction is pretty heavy (246 credits with quotas and extents before previous patch, still around 200 after it). This is mostly due to credits required for allocation of quota structures (credits there are heavily overestimated but it's difficult to make better estimates if we don't want to wire non-trivial assumptions about quota format into filesystem). So move quota initialization out of allocation transaction. That way transaction for quota structure allocation will be started only if we need to look up quota structure on disk (rare) and furthermore it will be started for each quota type separately, not for all of them at once. This reduces maximum transaction size to 34 is most cases and to 73 in the worst case. [ Modified by tytso to clean up the cleanup paths for error handling. Also use a separate call to ext4_std_error() for each failure so it is easier for someone who is debugging a problem in this function to determine which function call failed. ] Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 85 ++++++++++++++++++++++++++++-------------------- fs/ext4/namei.c | 12 +++---- 2 files changed, 53 insertions(+), 44 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4c358f711cef..18d36d85f5c9 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -666,6 +666,23 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ei = EXT4_I(inode); sbi = EXT4_SB(sb); + /* + * Initalize owners and quota early so that we don't have to account + * for quota initialization worst case in standard inode creating + * transaction + */ + if (owner) { + inode->i_mode = mode; + i_uid_write(inode, owner[0]); + i_gid_write(inode, owner[1]); + } else if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + dquot_initialize(inode); + if (!goal) goal = sbi->s_inode_goal; @@ -697,7 +714,7 @@ got_group: gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) - goto fail; + goto out; /* * Check free inodes count before loading bitmap. @@ -711,7 +728,7 @@ got_group: brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); if (!inode_bitmap_bh) - goto fail; + goto out; repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) @@ -733,13 +750,16 @@ repeat_in_this_group: handle_type, nblocks); if (IS_ERR(handle)) { err = PTR_ERR(handle); - goto fail; + ext4_std_error(sb, err); + goto out; } } BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode_bitmap_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } ext4_lock_group(sb, group); ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); ext4_unlock_group(sb, group); @@ -755,8 +775,10 @@ repeat_in_this_group: got: BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && @@ -768,7 +790,8 @@ got: err = ext4_journal_get_write_access(handle, block_bitmap_bh); if (err) { brelse(block_bitmap_bh); - goto fail; + ext4_std_error(sb, err); + goto out; } BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); @@ -787,14 +810,18 @@ got: ext4_unlock_group(sb, group); brelse(block_bitmap_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } } BUFFER_TRACE(group_desc_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, group_desc_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { @@ -840,8 +867,10 @@ got: BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) @@ -851,16 +880,6 @@ got: flex_group = ext4_flex_group(sbi, group); atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); } - if (owner) { - inode->i_mode = mode; - i_uid_write(inode, owner[0]); - i_gid_write(inode, owner[1]); - } else if (test_opt(sb, GRPID)) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = dir->i_gid; - } else - inode_init_owner(inode, dir, mode); inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ @@ -889,7 +908,9 @@ got: * twice. */ err = -EIO; - goto fail; + ext4_error(sb, "failed to insert inode %lu: doubly allocated?", + inode->i_ino); + goto out; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; @@ -917,7 +938,6 @@ got: ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; - dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) goto fail_drop; @@ -951,24 +971,17 @@ got: ext4_debug("allocating inode %lu\n", inode->i_ino); trace_ext4_allocate_inode(inode, dir, mode); - goto really_out; -fail: - ext4_std_error(sb, err); -out: - iput(inode); - ret = ERR_PTR(err); -really_out: brelse(inode_bitmap_bh); return ret; fail_free_drop: dquot_free_inode(inode); - fail_drop: - dquot_drop(inode); - inode->i_flags |= S_NOQUOTA; clear_nlink(inode); unlock_new_inode(inode); +out: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; iput(inode); brelse(inode_bitmap_bh); return ERR_PTR(err); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 45a5ca89797f..955c907fc980 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2252,8 +2252,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, dquot_initialize(dir); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); @@ -2287,8 +2286,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, dquot_initialize(dir); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); @@ -2397,8 +2395,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) dquot_initialize(dir); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, &dentry->d_name, @@ -2827,8 +2824,7 @@ static int ext4_symlink(struct inode *dir, * quota blocks, sb is already counted in previous macros). */ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; } retry: inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, From 2656497b26d45c1ca51a7727ab92c8307cb99305 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 19 Apr 2013 14:04:12 -0400 Subject: [PATCH 39/50] ext4: mext_insert_extents should update extent block checksum Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_extents.h | 5 +++++ fs/ext4/extents.c | 7 ++----- fs/ext4/move_extent.c | 13 +------------ 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 8643ff5bbeb7..51bc821ade90 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -270,5 +270,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, 0xffff); } +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) +int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, + struct inode *inode, struct ext4_ext_path *path); + #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 6fcb375c8fde..107936db244e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -157,11 +157,8 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -#define ext4_ext_dirty(handle, inode, path) \ - __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) -static int __ext4_ext_dirty(const char *where, unsigned int line, - handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) +int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, + struct inode *inode, struct ext4_ext_path *path) { int err; if (path->p_bh) { diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 309ca899a731..3dcbf364022f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -409,18 +409,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode, mext_insert_inside_block(o_start, o_end, start_ext, new_ext, end_ext, eh, range_to_move); - if (depth) { - ret = ext4_handle_dirty_metadata(handle, orig_inode, - orig_path->p_bh); - if (ret) - return ret; - } else { - ret = ext4_mark_inode_dirty(handle, orig_inode); - if (ret < 0) - return ret; - } - - return 0; + return ext4_ext_dirty(handle, orig_inode, orig_path); } /** From 28daf4fae8693d4a285123494899fe01950cba50 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Fri, 19 Apr 2013 17:49:23 -0400 Subject: [PATCH 40/50] jbd2: use kmem_cache_zalloc instead of kmem_cache_alloc/memset The jbd2_alloc_handle() function is only called by new_handle(). So this commit uses kmem_cache_zalloc() instead of kmem_cache_alloc()/memset(). Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 1 - include/linux/jbd2.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 325bc019ed88..a1920da22802 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -332,7 +332,6 @@ static handle_t *new_handle(int nblocks) handle_t *handle = jbd2_alloc_handle(GFP_NOFS); if (!handle) return NULL; - memset(handle, 0, sizeof(*handle)); handle->h_buffer_credits = nblocks; handle->h_ref = 1; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f9fe88957b7a..6e051f472edb 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1145,7 +1145,7 @@ extern struct kmem_cache *jbd2_handle_cache; static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags) { - return kmem_cache_alloc(jbd2_handle_cache, gfp_flags); + return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags); } static inline void jbd2_free_handle(handle_t *handle) From 8af0f08227977079f8f227e74d27c59db2ab84f6 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 19 Apr 2013 17:53:09 -0400 Subject: [PATCH 41/50] ext4: fix readdir error in the case of inline_data+dir_index Zach reported a problem that if inline data is enabled, we don't tell the difference between the offset of '.' and '..'. And a getdents will fail if the user only want to get '.' and what's worse, if there is a conversion happens when the user calls getdents many times, he/she may get the same entry twice. In theory, a dir block would also fail if it is converted to a hashed-index based dir since f_pos will become a hash value, not the real one, but it doesn't happen. And a deep investigation shows that we uses a hash based solution even for a normal dir if the dir_index feature is enabled. So this patch just adds a new htree_inlinedir_to_tree for inline dir, and if we find that the hash index is supported, we will do like what we do for a dir block. Reported-by: Zach Brown Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 20 +++++---- fs/ext4/ext4.h | 23 ++++++++++ fs/ext4/inline.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/namei.c | 29 +++++-------- 4 files changed, 153 insertions(+), 28 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d8cd1f0f4661..f8d56e4254e0 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -46,7 +46,8 @@ static int is_dx_dir(struct inode *inode) if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_COMPAT_DIR_INDEX) && ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) + ((inode->i_size >> sb->s_blocksize_bits) == 1) || + ext4_has_inline_data(inode))) return 1; return 0; @@ -115,14 +116,6 @@ static int ext4_readdir(struct file *filp, int ret = 0; int dir_has_error = 0; - if (ext4_has_inline_data(inode)) { - int has_inline_data = 1; - ret = ext4_read_inline_dir(filp, dirent, filldir, - &has_inline_data); - if (has_inline_data) - return ret; - } - if (is_dx_dir(inode)) { err = ext4_dx_readdir(filp, dirent, filldir); if (err != ERR_BAD_DX_DIR) { @@ -136,6 +129,15 @@ static int ext4_readdir(struct file *filp, ext4_clear_inode_flag(file_inode(filp), EXT4_INODE_INDEX); } + + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + ret = ext4_read_inline_dir(filp, dirent, filldir, + &has_inline_data); + if (has_inline_data) + return ret; + } + stored = 0; offset = filp->f_pos & (sb->s_blocksize - 1); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 779d26b7beff..0aabb344b02e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2518,6 +2518,11 @@ extern int ext4_try_create_inline_dir(handle_t *handle, extern int ext4_read_inline_dir(struct file *filp, void *dirent, filldir_t filldir, int *has_inline_data); +extern int htree_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, @@ -2554,6 +2559,24 @@ extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, extern int ext4_handle_dirty_dirent_node(handle_t *handle, struct inode *inode, struct buffer_head *bh); +#define S_SHIFT 12 +static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index c0fd1a123f7d..abf8b6278c3a 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -19,7 +19,8 @@ #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) -#define EXT4_INLINE_DOTDOT_SIZE 4 +#define EXT4_INLINE_DOTDOT_OFFSET 2 +#define EXT4_INLINE_DOTDOT_SIZE 4 int ext4_get_inline_size(struct inode *inode) { @@ -1289,6 +1290,112 @@ out: return ret; } +/* + * This function fills a red-black tree with information from an + * inlined dir. It returns the number directory entries loaded + * into the tree. If there is an error it is returned in err. + */ +int htree_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data) +{ + int err = 0, count = 0; + unsigned int parent_ino; + int pos; + struct ext4_dir_entry_2 *de; + struct inode *inode = file_inode(dir_file); + int ret, inline_size = 0; + struct ext4_iloc iloc; + void *dir_buf = NULL; + struct ext4_dir_entry_2 fake; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_read(&EXT4_I(inode)->xattr_sem); + *has_inline_data = 0; + goto out; + } + + inline_size = ext4_get_inline_size(inode); + dir_buf = kmalloc(inline_size, GFP_NOFS); + if (!dir_buf) { + ret = -ENOMEM; + up_read(&EXT4_I(inode)->xattr_sem); + goto out; + } + + ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); + up_read(&EXT4_I(inode)->xattr_sem); + if (ret < 0) + goto out; + + pos = 0; + parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); + while (pos < inline_size) { + /* + * As inlined dir doesn't store any information about '.' and + * only the inode number of '..' is stored, we have to handle + * them differently. + */ + if (pos == 0) { + fake.inode = cpu_to_le32(inode->i_ino); + fake.name_len = 1; + strcpy(fake.name, "."); + fake.rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(fake.name_len), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; + pos = EXT4_INLINE_DOTDOT_OFFSET; + } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { + fake.inode = cpu_to_le32(parent_ino); + fake.name_len = 2; + strcpy(fake.name, ".."); + fake.rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(fake.name_len), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; + pos = EXT4_INLINE_DOTDOT_SIZE; + } else { + de = (struct ext4_dir_entry_2 *)(dir_buf + pos); + pos += ext4_rec_len_from_disk(de->rec_len, inline_size); + if (ext4_check_dir_entry(inode, dir_file, de, + iloc.bh, dir_buf, + inline_size, pos)) { + ret = count; + goto out; + } + } + + ext4fs_dirhash(de->name, de->name_len, hinfo); + if ((hinfo->hash < start_hash) || + ((hinfo->hash == start_hash) && + (hinfo->minor_hash < start_minor_hash))) + continue; + if (de->inode == 0) + continue; + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de); + if (err) { + count = err; + goto out; + } + count++; + } + ret = count; +out: + kfree(dir_buf); + brelse(iloc.bh); + return ret; +} + int ext4_read_inline_dir(struct file *filp, void *dirent, filldir_t filldir, int *has_inline_data) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 955c907fc980..6653fc35ecb7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -972,6 +972,17 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + count = htree_inlinedir_to_tree(dir_file, dir, 0, + &hinfo, start_hash, + start_minor_hash, + &has_inline_data); + if (has_inline_data) { + *next_hash = ~0; + return count; + } + } count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); *next_hash = ~0; @@ -1456,24 +1467,6 @@ struct dentry *ext4_get_parent(struct dentry *child) return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); } -#define S_SHIFT 12 -static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, - [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, - [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, -}; - -static inline void ext4_set_de_type(struct super_block *sb, - struct ext4_dir_entry_2 *de, - umode_t mode) { - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) - de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; -} - /* * Move count entries from end of map between two memory locations. * Returns pointer to last entry moved. From c4d8b0235aa98f8c26bf94d308be3fdd24154572 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 19 Apr 2013 17:55:33 -0400 Subject: [PATCH 42/50] ext4: fix readdir error in case inline_data+^dir_index. Zach reported a problem that if inline data is enabled, we don't tell the difference between the offset of '.' and '..'. And a getdents will fail if the user only want to get '.'. And what's worse, we may meet with duplicate dir entries as the offset for inline dir and non-inline one is quite different. This patch just try to resolve this problem if dir_index is disabled. In this case, f_pos is the real offset with the dir block, so for inline dir, we just pretend as if we are a dir block and returns the offset like a norml dir block does. Reported-by: Zach Brown Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 69 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index abf8b6278c3a..3e2bf873e8a8 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1396,6 +1396,14 @@ out: return ret; } +/* + * So this function is called when the volume is mkfsed with + * dir_index disabled. In order to keep f_pos persistent + * after we convert from an inlined dir to a blocked based, + * we just pretend that we are a normal dir and return the + * offset as if '.' and '..' really take place. + * + */ int ext4_read_inline_dir(struct file *filp, void *dirent, filldir_t filldir, int *has_inline_data) @@ -1409,6 +1417,7 @@ int ext4_read_inline_dir(struct file *filp, int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; + int dotdot_offset, dotdot_size, extra_offset, extra_size; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -1437,8 +1446,21 @@ int ext4_read_inline_dir(struct file *filp, sb = inode->i_sb; stored = 0; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); + offset = filp->f_pos; - while (!error && !stored && filp->f_pos < inode->i_size) { + /* + * dotdot_offset and dotdot_size is the real offset and + * size for ".." and "." if the dir is block based while + * the real size for them are only EXT4_INLINE_DOTDOT_SIZE. + * So we will use extra_offset and extra_size to indicate them + * during the inline dir iteration. + */ + dotdot_offset = EXT4_DIR_REC_LEN(1); + dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); + extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; + extra_size = extra_offset + inline_size; + + while (!error && !stored && filp->f_pos < extra_size) { revalidate: /* * If the version has changed since the last call to @@ -1447,15 +1469,23 @@ revalidate: * dir to make sure. */ if (filp->f_version != inode->i_version) { - for (i = 0; - i < inode->i_size && i < offset;) { + for (i = 0; i < extra_size && i < offset;) { + /* + * "." is with offset 0 and + * ".." is dotdot_offset. + */ if (!i) { - /* skip "." and ".." if needed. */ - i += EXT4_INLINE_DOTDOT_SIZE; + i = dotdot_offset; + continue; + } else if (i == dotdot_offset) { + i = dotdot_size; continue; } + /* for other entry, the real offset in + * the buf has to be tuned accordingly. + */ de = (struct ext4_dir_entry_2 *) - (dir_buf + i); + (dir_buf + i - extra_offset); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at @@ -1463,43 +1493,47 @@ revalidate: * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, - inline_size) < EXT4_DIR_REC_LEN(1)) + extra_size) < EXT4_DIR_REC_LEN(1)) break; i += ext4_rec_len_from_disk(de->rec_len, - inline_size); + extra_size); } offset = i; filp->f_pos = offset; filp->f_version = inode->i_version; } - while (!error && filp->f_pos < inode->i_size) { + while (!error && filp->f_pos < extra_size) { if (filp->f_pos == 0) { error = filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR); if (error) break; stored++; + filp->f_pos = dotdot_offset; + continue; + } - error = filldir(dirent, "..", 2, 0, parent_ino, - DT_DIR); + if (filp->f_pos == dotdot_offset) { + error = filldir(dirent, "..", 2, + dotdot_offset, + parent_ino, DT_DIR); if (error) break; stored++; - filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE; + filp->f_pos = dotdot_size; continue; } - de = (struct ext4_dir_entry_2 *)(dir_buf + offset); + de = (struct ext4_dir_entry_2 *) + (dir_buf + filp->f_pos - extra_offset); if (ext4_check_dir_entry(inode, filp, de, iloc.bh, dir_buf, - inline_size, offset)) { + extra_size, filp->f_pos)) { ret = stored; goto out; } - offset += ext4_rec_len_from_disk(de->rec_len, - inline_size); if (le32_to_cpu(de->inode)) { /* We might block in the next section * if the data destination is @@ -1522,9 +1556,8 @@ revalidate: stored++; } filp->f_pos += ext4_rec_len_from_disk(de->rec_len, - inline_size); + extra_size); } - offset = 0; } out: kfree(dir_buf); From 9f203507ed277ee86e3f76a15e09db1c92e40b94 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 20 Apr 2013 15:46:17 -0400 Subject: [PATCH 43/50] ext4: mark all metadata I/O with REQ_META As Dave Chinner pointed out at the 2013 LSF/MM workshop, it's important that metadata I/O requests are marked as such to avoid priority inversions caused by I/O bandwidth throttling. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/mmp.c | 4 ++-- fs/ext4/super.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 8dcaea69e37f..d0f13eada0ed 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -441,7 +441,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) trace_ext4_read_block_bitmap_load(sb, block_group); bh->b_end_io = ext4_end_bitmap_read; get_bh(bh); - submit_bh(READ, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); return bh; verify: ext4_validate_block_bitmap(sb, desc, block_group, bh); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 18d36d85f5c9..00a818d67b54 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -166,7 +166,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) trace_ext4_load_inode_bitmap(sb, block_group); bh->b_end_io = ext4_end_bitmap_read; get_bh(bh); - submit_bh(READ, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { put_bh(bh); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index b3b1f7d99448..214461e42a05 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -54,7 +54,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(WRITE_SYNC, bh); + submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) @@ -86,7 +86,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, get_bh(*bh); lock_buffer(*bh); (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(READ_SYNC, *bh); + submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { brelse(*bh); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bfa29ecfb47a..dbc7c090c13a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4252,7 +4252,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, goto out_bdev; } journal->j_private = sb; - ll_rw_block(READ, 1, &journal->j_sb_buffer); + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); wait_on_buffer(journal->j_sb_buffer); if (!buffer_uptodate(journal->j_sb_buffer)) { ext4_msg(sb, KERN_ERR, "I/O error on journal device"); From 877f962c5edacfef60ab21cfed6d8d54ce25b8a6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 20 Apr 2013 19:58:37 -0400 Subject: [PATCH 44/50] buffer: add BH_Prio and BH_Meta flags Add buffer_head flags so that buffer cache writebacks can be marked with the the appropriate request flags, so that metadata blocks can be marked appropriately in blktrace. Signed-off-by: "Theodore Ts'o" --- fs/buffer.c | 5 +++++ include/linux/buffer_head.h | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/fs/buffer.c b/fs/buffer.c index b4dcb34c9635..a15575c0b9ee 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2988,6 +2988,11 @@ int submit_bh(int rw, struct buffer_head * bh) /* Take care of bh's that straddle the end of the device */ guard_bh_eod(rw, bio, bh); + if (buffer_meta(bh)) + rw |= REQ_META; + if (buffer_prio(bh)) + rw |= REQ_PRIO; + bio_get(bio); submit_bio(rw, bio); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 5afc4f94d110..33c0f8103fe4 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -34,6 +34,8 @@ enum bh_state_bits { BH_Write_EIO, /* I/O error on write */ BH_Unwritten, /* Buffer is allocated on disk but not written */ BH_Quiet, /* Buffer Error Prinks to be quiet */ + BH_Meta, /* Buffer contains metadata */ + BH_Prio, /* Buffer should be submitted with REQ_PRIO */ BH_PrivateStart,/* not a state bit, but the first bit available * for private allocation by other entities @@ -124,6 +126,8 @@ BUFFER_FNS(Delay, delay) BUFFER_FNS(Boundary, boundary) BUFFER_FNS(Write_EIO, write_io_error) BUFFER_FNS(Unwritten, unwritten) +BUFFER_FNS(Meta, meta) +BUFFER_FNS(Prio, prio) #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) From 13fca323e9a8b63c08de7a4e05d3c702516b535d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 21 Apr 2013 16:45:54 -0400 Subject: [PATCH 45/50] ext4: mark metadata blocks using bh flags This allows metadata writebacks which are issued via block device writeback to be sent with the current write request flags. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_jbd2.c | 2 ++ fs/ext4/inode.c | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 0e1dc9e70ce5..451eb4045330 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -215,6 +215,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, might_sleep(); + set_buffer_meta(bh); + set_buffer_prio(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); if (err) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 62492e954483..d7518e2728f1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1080,10 +1080,14 @@ retry_journal: /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct buffer_head *bh) { + int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); - return ext4_handle_dirty_metadata(handle, NULL, bh); + ret = ext4_handle_dirty_metadata(handle, NULL, bh); + clear_buffer_meta(bh); + clear_buffer_prio(bh); + return ret; } /* From f783f091e49ce4896e6b026af82d76e0537c6089 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 21 Apr 2013 16:47:54 -0400 Subject: [PATCH 46/50] jbd2: trace when lock_buffer in do_get_write_access takes a long time While investigating interactivity problems it was clear that processes sometimes stall for long periods of times if an attempt is made to lock a buffer which is undergoing writeback. It would stall in a trace looking something like [] __lock_buffer+0x2e/0x30 [] do_get_write_access+0x43f/0x4b0 [] jbd2_journal_get_write_access+0x2b/0x50 [] __ext4_journal_get_write_access+0x39/0x80 [] ext4_reserve_inode_write+0x78/0xa0 [] ext4_mark_inode_dirty+0x49/0x220 [] ext4_dirty_inode+0x41/0x60 [] __mark_inode_dirty+0x4e/0x2d0 [] update_time+0x79/0xc0 [] file_update_time+0x98/0x100 [] __generic_file_aio_write+0x17c/0x3b0 [] generic_file_aio_write+0x7a/0xf0 [] ext4_file_write+0x83/0xd0 [] do_sync_write+0xa3/0xe0 [] vfs_write+0xae/0x180 [] sys_write+0x4d/0x90 [] system_call_fastpath+0x1a/0x1f [] 0xffffffffffffffff Signed-off-by: Mel Gorman Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 8 ++++++++ include/trace/events/jbd2.h | 21 +++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index a1920da22802..10f524c59ea8 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -639,6 +639,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, int error; char *frozen_buffer = NULL; int need_copy = 0; + unsigned long start_lock, time_lock; if (is_handle_aborted(handle)) return -EROFS; @@ -654,9 +655,16 @@ repeat: /* @@@ Need to check for errors here at some point. */ + start_lock = jiffies; lock_buffer(bh); jbd_lock_bh_state(bh); + /* If it takes too long to lock the buffer, trace it */ + time_lock = jbd2_time_diff(start_lock, jiffies); + if (time_lock > HZ/10) + trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev, + jiffies_to_msecs(time_lock)); + /* We now hold the buffer lock so it is safe to query the buffer * state. Is the buffer dirty? * diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index 070df49e4a1d..c1d1f3eb242d 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -358,6 +358,27 @@ TRACE_EVENT(jbd2_write_superblock, MINOR(__entry->dev), __entry->write_op) ); +TRACE_EVENT(jbd2_lock_buffer_stall, + + TP_PROTO(dev_t dev, unsigned long stall_ms), + + TP_ARGS(dev, stall_ms), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field(unsigned long, stall_ms ) + ), + + TP_fast_assign( + __entry->dev = dev; + __entry->stall_ms = stall_ms; + ), + + TP_printk("dev %d,%d stall_ms %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->stall_ms) +); + #endif /* _TRACE_JBD2_H */ /* This part must be outside protection */ From c5c72d814cf0f650010337c73638b25e6d14d2d4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 21 Apr 2013 20:19:43 -0400 Subject: [PATCH 47/50] ext4: fix online resizing for ext3-compat file systems Commit fb0a387dcdc restricts block allocations for indirect-mapped files to block groups less than s_blockfile_groups. However, the online resizing code wasn't setting s_blockfile_groups, so the newly added block groups were not available for non-extent mapped files. Reported-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/resize.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e3498534a2c1..08d2312c0fd4 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1341,6 +1341,8 @@ static void ext4_update_super(struct super_block *sb, /* Update the global fs size fields */ sbi->s_groups_count += flex_gd->count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); /* Update the reserved block counts only once the new group is * active. */ From 7f3e3c7cfcec148ccca9c0dd2dbfd7b00b7ac10f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 21 Apr 2013 20:32:03 -0400 Subject: [PATCH 48/50] ext4: fix Kconfig documentation for CONFIG_EXT4_DEBUG Fox the Kconfig documentation for CONFIG_EXT4_DEBUG to match the change made by commit a0b30c1229: ext4: use module parameters instead of debugfs for mballoc_debug Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 987358740cb9..efea5d5c44ce 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -71,4 +71,5 @@ config EXT4_DEBUG Enables run-time debugging support for the ext4 filesystem. If you select Y here, then you will be able to turn on debugging - with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug" + with a command such as: + echo 1 > /sys/module/ext4/parameters/mballoc_debug From 3f8a6411fbada1fa482276591e037f3b1adcf55b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 21 Apr 2013 22:56:32 -0400 Subject: [PATCH 49/50] ext4: add check for inodes_count overflow in new resize ioctl Addresses-Red-Hat-Bugzilla: #913245 Reported-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Reviewed-by: Carlos Maiolino Cc: stable@vger.kernel.org --- fs/ext4/resize.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 08d2312c0fd4..b27c96d01965 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1882,6 +1882,10 @@ retry: return 0; n_group = ext4_get_group_number(sb, n_blocks_count - 1); + if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { + ext4_warning(sb, "resize would cause inodes_count overflow"); + return -EINVAL; + } ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); n_desc_blocks = num_desc_blocks(sb, n_group + 1); From 0d606e2c9fccdd4e67febf1e2da500e1bfe9e045 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 23 Apr 2013 08:59:35 -0400 Subject: [PATCH 50/50] ext4: fix type-widening bug in inode table readahead code Due to a missing cast, the high 32-bits of a 64-bit block number used when calculating the readahead block for inode tables can get lost. This means we can end up fetching the wrong blocks for readahead for file systems > 16TB. Linus found this when experimenting with an enhacement to the sparse static code checker which checks for missing widening casts before binary "not" operators. Reported-by: Linus Torvalds Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d7518e2728f1..793d44b84d7f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4011,13 +4011,14 @@ make_io: if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; + __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; table = ext4_inode_table(sb, gdp); /* s_inode_readahead_blks is always a power of 2 */ - b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); + b = block & ~((ext4_fsblk_t) ra_blks - 1); if (table > b) b = table; - end = b + EXT4_SB(sb)->s_inode_readahead_blks; + end = b + ra_blks; num = EXT4_INODES_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) num -= ext4_itable_unused_count(sb, gdp);