diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index 471a511c7508..68c2bc8275cf 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -342,8 +342,8 @@ Contents encryption ------------------- For file contents, each filesystem block is encrypted independently. -Currently, only the case where the filesystem block size is equal to -the system's page size (usually 4096 bytes) is supported. +Starting from Linux kernel 5.5, encryption of filesystems with block +size less than system's page size is supported. Each block's IV is set to the logical block number within the file as a little endian number, except that: diff --git a/fs/buffer.c b/fs/buffer.c index 86a38b979323..d39838090b22 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -47,6 +47,7 @@ #include #include #include +#include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, @@ -246,10 +247,6 @@ out: return ret; } -/* - * I/O completion handler for block_read_full_page() - pages - * which come unlocked at the end of I/O. - */ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) { unsigned long flags; @@ -307,6 +304,47 @@ still_busy: return; } +struct decrypt_bh_ctx { + struct work_struct work; + struct buffer_head *bh; +}; + +static void decrypt_bh(struct work_struct *work) +{ + struct decrypt_bh_ctx *ctx = + container_of(work, struct decrypt_bh_ctx, work); + struct buffer_head *bh = ctx->bh; + int err; + + err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size, + bh_offset(bh)); + end_buffer_async_read(bh, err == 0); + kfree(ctx); +} + +/* + * I/O completion handler for block_read_full_page() - pages + * which come unlocked at the end of I/O. + */ +static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) +{ + /* Decrypt if needed */ + if (uptodate && IS_ENABLED(CONFIG_FS_ENCRYPTION) && + IS_ENCRYPTED(bh->b_page->mapping->host) && + S_ISREG(bh->b_page->mapping->host->i_mode)) { + struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC); + + if (ctx) { + INIT_WORK(&ctx->work, decrypt_bh); + ctx->bh = bh; + fscrypt_enqueue_decrypt_work(&ctx->work); + return; + } + uptodate = 0; + } + end_buffer_async_read(bh, uptodate); +} + /* * Completion handler for block_write_full_page() - pages which are unlocked * during I/O, and which have PageWriteback cleared upon I/O completion. @@ -379,7 +417,7 @@ EXPORT_SYMBOL(end_buffer_async_write); */ static void mark_buffer_async_read(struct buffer_head *bh) { - bh->b_end_io = end_buffer_async_read; + bh->b_end_io = end_buffer_async_read_io; set_buffer_async_read(bh); } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b3a2cc7c0252..f8578caba40d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -198,6 +198,12 @@ struct ext4_system_blocks { */ #define EXT4_IO_END_UNWRITTEN 0x0001 +struct ext4_io_end_vec { + struct list_head list; /* list of io_end_vec */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ +}; + /* * For converting unwritten extents on a work queue. 'handle' is used for * buffered writeback. @@ -211,8 +217,7 @@ typedef struct ext4_io_end { * bios covering the extent */ unsigned int flag; /* unwritten or not */ atomic_t count; /* reference counter */ - loff_t offset; /* offset in the file */ - ssize_t size; /* size of the extent */ + struct list_head list_vec; /* list of ext4_io_end_vec */ } ext4_io_end_t; struct ext4_io_submit { @@ -1579,7 +1584,6 @@ enum { EXT4_STATE_NO_EXPAND, /* No space for expansion */ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ - EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ @@ -2562,8 +2566,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_dio_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, @@ -2606,7 +2608,6 @@ extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); -extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); @@ -3266,6 +3267,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); +extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, + ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_metadata_amount(struct inode *inode, @@ -3298,6 +3301,10 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, ext4_lblk_t lblk2, ext4_lblk_t count, int mark_unwritten,int *err); extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); +extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, + int check_cred, int restart_cred, + int revoke_cred); + /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, @@ -3324,6 +3331,8 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, int len, struct writeback_control *wbc, bool keep_towrite); +extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); +extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); @@ -3381,6 +3390,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } extern const struct iomap_ops ext4_iomap_ops; +extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) { diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 7c70b08d104c..d3b8cdea5df7 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -65,12 +65,14 @@ static int ext4_journal_check_start(struct super_block *sb) } handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, - int type, int blocks, int rsv_blocks) + int type, int blocks, int rsv_blocks, + int revoke_creds) { journal_t *journal; int err; - trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_); + trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds, + _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) return ERR_PTR(err); @@ -78,8 +80,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, journal = EXT4_SB(sb)->s_journal; if (!journal) return ext4_get_nojournal(); - return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS, - type, line); + return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds, + GFP_NOFS, type, line); } int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) @@ -119,8 +121,8 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, return ext4_get_nojournal(); sb = handle->h_journal->j_private; - trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits, - _RET_IP_); + trace_ext4_journal_start_reserved(sb, + jbd2_handle_buffer_credits(handle), _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) { jbd2_journal_free_reserved(handle); @@ -133,6 +135,19 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, return handle; } +int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, + int extend_cred, int revoke_cred) +{ + if (!ext4_handle_valid(handle)) + return 0; + if (jbd2_handle_buffer_credits(handle) >= check_cred && + handle->h_revoke_credits >= revoke_cred) + return 0; + extend_cred = max(0, extend_cred - jbd2_handle_buffer_credits(handle)); + revoke_cred = max(0, revoke_cred - handle->h_revoke_credits); + return ext4_journal_extend(handle, extend_cred, revoke_cred); +} + static void ext4_journal_abort_handle(const char *caller, unsigned int line, const char *err_fn, struct buffer_head *bh, @@ -278,7 +293,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle->h_type, handle->h_line_no, handle->h_requested_credits, - handle->h_buffer_credits, err); + jbd2_handle_buffer_credits(handle), err); return err; } ext4_error_inode(inode, where, line, @@ -289,7 +304,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle->h_type, handle->h_line_no, handle->h_requested_credits, - handle->h_buffer_credits, err); + jbd2_handle_buffer_credits(handle), + err); } } else { if (inode) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index ef8fcf7d0d3b..a6b9b66dbfad 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -261,7 +261,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line, __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, - int type, int blocks, int rsv_blocks); + int type, int blocks, int rsv_blocks, + int revoke_creds); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -288,28 +289,41 @@ static inline int ext4_handle_is_aborted(handle_t *handle) return 0; } -static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) +static inline int ext4_free_metadata_revoke_credits(struct super_block *sb, + int blocks) { - if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed) - return 0; - return 1; + /* Freeing each metadata block can result in freeing one cluster */ + return blocks * EXT4_SB(sb)->s_cluster_ratio; +} + +static inline int ext4_trans_default_revoke_credits(struct super_block *sb) +{ + return ext4_free_metadata_revoke_credits(sb, 8); } #define ext4_journal_start_sb(sb, type, nblocks) \ - __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0) + __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \ + ext4_trans_default_revoke_credits(sb)) #define ext4_journal_start(inode, type, nblocks) \ - __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0) + __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0, \ + ext4_trans_default_revoke_credits((inode)->i_sb)) -#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \ - __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks)) +#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\ + __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\ + ext4_trans_default_revoke_credits((inode)->i_sb)) + +#define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \ + __ext4_journal_start((inode), __LINE__, (type), (blocks), 0, \ + (revoke_creds)) static inline handle_t *__ext4_journal_start(struct inode *inode, unsigned int line, int type, - int blocks, int rsv_blocks) + int blocks, int rsv_blocks, + int revoke_creds) { return __ext4_journal_start_sb(inode->i_sb, line, type, blocks, - rsv_blocks); + rsv_blocks, revoke_creds); } #define ext4_journal_stop(handle) \ @@ -332,20 +346,68 @@ static inline handle_t *ext4_journal_current_handle(void) return journal_current_handle(); } -static inline int ext4_journal_extend(handle_t *handle, int nblocks) +static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke) { if (ext4_handle_valid(handle)) - return jbd2_journal_extend(handle, nblocks); + return jbd2_journal_extend(handle, nblocks, revoke); return 0; } -static inline int ext4_journal_restart(handle_t *handle, int nblocks) +static inline int ext4_journal_restart(handle_t *handle, int nblocks, + int revoke) { if (ext4_handle_valid(handle)) - return jbd2_journal_restart(handle, nblocks); + return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS); return 0; } +int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, + int extend_cred, int revoke_cred); + + +/* + * Ensure @handle has at least @check_creds credits available. If not, + * transaction will be extended or restarted to contain at least @extend_cred + * credits. Before restarting transaction @fn is executed to allow for cleanup + * before the transaction is restarted. + * + * The return value is < 0 in case of error, 0 in case the handle has enough + * credits or transaction extension succeeded, 1 in case transaction had to be + * restarted. + */ +#define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred, \ + revoke_cred, fn) \ +({ \ + __label__ __ensure_end; \ + int err = __ext4_journal_ensure_credits((handle), (check_cred), \ + (extend_cred), (revoke_cred)); \ + \ + if (err <= 0) \ + goto __ensure_end; \ + err = (fn); \ + if (err < 0) \ + goto __ensure_end; \ + err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \ + if (err == 0) \ + err = 1; \ +__ensure_end: \ + err; \ +}) + +/* + * Ensure given handle has at least requested amount of credits available, + * possibly restarting transaction if needed. We also make sure the transaction + * has space for at least ext4_trans_default_revoke_credits(sb) revoke records + * as freeing one or two blocks is very common pattern and requesting this is + * very cheap. + */ +static inline int ext4_journal_ensure_credits(handle_t *handle, int credits, + int revoke_creds) +{ + return ext4_journal_ensure_credits_fn(handle, credits, credits, + revoke_creds, 0); +} + static inline int ext4_journal_blocks_per_page(struct inode *inode) { if (EXT4_JOURNAL(inode) != NULL) @@ -407,6 +469,7 @@ static inline int ext4_inode_journal_mode(struct inode *inode) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ /* We do not support data journalling with delayed allocation */ if (!S_ISREG(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && !test_opt(inode->i_sb, DELALLOC))) { @@ -437,6 +500,19 @@ static inline int ext4_should_writeback_data(struct inode *inode) return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; } +static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks) +{ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return 0; + if (!ext4_should_journal_data(inode)) + return 0; + /* + * Data blocks in one extent are contiguous, just account for partial + * clusters at extent boundaries + */ + return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1); +} + /* * This function controls whether or not we should try to go down the * dioread_nolock code paths, which makes it safe to avoid taking diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index fb0f99dc8c22..0e8708b77da6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -100,29 +100,41 @@ static int ext4_split_extent_at(handle_t *handle, static int ext4_find_delayed_extent(struct inode *inode, struct extent_status *newes); -static int ext4_ext_truncate_extend_restart(handle_t *handle, - struct inode *inode, - int needed) +static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) { - int err; - - if (!ext4_handle_valid(handle)) - return 0; - if (handle->h_buffer_credits >= needed) - return 0; /* - * If we need to extend the journal get a few extra blocks - * while we're at it for efficiency's sake. + * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_mutex. So we can safely drop the i_data_sem here. */ - needed += 3; - err = ext4_journal_extend(handle, needed - handle->h_buffer_credits); - if (err <= 0) - return err; - err = ext4_truncate_restart_trans(handle, inode, needed); - if (err == 0) - err = -EAGAIN; + BUG_ON(EXT4_JOURNAL(inode) == NULL); + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + *dropped = 1; + return 0; +} - return err; +/* + * Make sure 'handle' has at least 'check_cred' credits. If not, restart + * transaction with 'restart_cred' credits. The function drops i_data_sem + * when restarting transaction and gets it after transaction is restarted. + * + * The function returns 0 on success, 1 if transaction had to be restarted, + * and < 0 in case of fatal error. + */ +int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, + int check_cred, int restart_cred, + int revoke_cred) +{ + int ret; + int dropped = 0; + + ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred, + revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped)); + if (dropped) + down_write(&EXT4_I(inode)->i_data_sem); + return ret; } /* @@ -1753,16 +1765,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; - /* - * The check for IO to unwritten extent is somewhat racy as we - * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after - * dropping i_data_sem. But reserved blocks should save us in that - * case. - */ + if (ext4_ext_is_unwritten(ex1) && - (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || - atomic_read(&EXT4_I(inode)->i_unwritten) || - (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) + ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) @@ -1840,7 +1845,8 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, * group descriptor to release the extent tree block. If we * can't get the journal credits, give up. */ - if (ext4_journal_extend(handle, 2)) + if (ext4_journal_extend(handle, 2, + ext4_free_metadata_revoke_credits(inode->i_sb, 1))) return; /* @@ -2727,7 +2733,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err = 0, correct_index = 0; - int depth = ext_depth(inode), credits; + int depth = ext_depth(inode), credits, revoke_credits; struct ext4_extent_header *eh; ext4_lblk_t a, b; unsigned num; @@ -2819,10 +2825,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, credits += (ext_depth(inode)) + 1; } credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); + /* + * We may end up freeing some index blocks and data from the + * punched range. Note that partial clusters are accounted for + * by ext4_free_data_revoke_credits(). + */ + revoke_credits = + ext4_free_metadata_revoke_credits(inode->i_sb, + ext_depth(inode)) + + ext4_free_data_revoke_credits(inode, b - a + 1); - err = ext4_ext_truncate_extend_restart(handle, inode, credits); - if (err) + err = ext4_datasem_ensure_credits(handle, inode, credits, + credits, revoke_credits); + if (err) { + if (err > 0) + err = -EAGAIN; goto out; + } err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -2948,7 +2967,9 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1); + handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE, + depth + 1, + ext4_free_metadata_revoke_credits(inode->i_sb, depth)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -4962,23 +4983,13 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, int ret = 0; int ret2 = 0; struct ext4_map_blocks map; - unsigned int credits, blkbits = inode->i_blkbits; + unsigned int blkbits = inode->i_blkbits; + unsigned int credits = 0; map.m_lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); - /* - * This is somewhat ugly but the idea is clear: When transaction is - * reserved, everything goes into it. Otherwise we rather start several - * smaller transactions for conversion of each extent separately. - */ - if (handle) { - handle = ext4_journal_start_reserved(handle, - EXT4_HT_EXT_CONVERT); - if (IS_ERR(handle)) - return PTR_ERR(handle); - credits = 0; - } else { + if (!handle) { /* * credits to insert 1 extent into extent tree */ @@ -5009,11 +5020,40 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, if (ret <= 0 || ret2) break; } - if (!credits) - ret2 = ext4_journal_stop(handle); return ret > 0 ? ret2 : ret; } +int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) +{ + int ret, err = 0; + struct ext4_io_end_vec *io_end_vec; + + /* + * This is somewhat ugly but the idea is clear: When transaction is + * reserved, everything goes into it. Otherwise we rather start several + * smaller transactions for conversion of each extent separately. + */ + if (handle) { + handle = ext4_journal_start_reserved(handle, + EXT4_HT_EXT_CONVERT); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } + + list_for_each_entry(io_end_vec, &io_end->list_vec, list) { + ret = ext4_convert_unwritten_extents(handle, io_end->inode, + io_end_vec->offset, + io_end_vec->size); + if (ret) + break; + } + + if (handle) + err = ext4_journal_stop(handle); + + return ret < 0 ? ret : err; +} + /* * If newes is not existing extent (newes->ec_pblk equals zero) find * delayed extent at start of newes and update newes accordingly and @@ -5206,13 +5246,10 @@ ext4_access_path(handle_t *handle, struct inode *inode, * descriptor) for each block group; assume two block * groups */ - if (handle->h_buffer_credits < 7) { - credits = ext4_writepage_trans_blocks(inode); - err = ext4_ext_truncate_extend_restart(handle, inode, credits); - /* EAGAIN is success */ - if (err && err != -EAGAIN) - return err; - } + credits = ext4_writepage_trans_blocks(inode); + err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0); + if (err < 0) + return err; err = ext4_ext_get_access(handle, inode, path); return err; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8d2bbcc2d813..6a7293a5cda2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -29,10 +29,58 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "truncate.h" + +static bool ext4_dio_supported(struct inode *inode) +{ + if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode)) + return false; + if (fsverity_active(inode)) + return false; + if (ext4_should_journal_data(inode)) + return false; + if (ext4_has_inline_data(inode)) + return false; + return true; +} + +static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret; + struct inode *inode = file_inode(iocb->ki_filp); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + if (!ext4_dio_supported(inode)) { + inode_unlock_shared(inode); + /* + * Fallback to buffered I/O if the operation being performed on + * the inode is not supported by direct I/O. The IOCB_DIRECT + * flag needs to be cleared here in order to ensure that the + * direct I/O path within generic_file_read_iter() is not + * taken. + */ + iocb->ki_flags &= ~IOCB_DIRECT; + return generic_file_read_iter(iocb, to); + } + + ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, + is_sync_kiocb(iocb)); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -64,16 +112,21 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { - if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb)))) + struct inode *inode = file_inode(iocb->ki_filp); + + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; if (!iov_iter_count(to)) return 0; /* skip atime */ #ifdef CONFIG_FS_DAX - if (IS_DAX(file_inode(iocb->ki_filp))) + if (IS_DAX(inode)) return ext4_dax_read_iter(iocb, to); #endif + if (iocb->ki_flags & IOCB_DIRECT) + return ext4_dio_read_iter(iocb, to); + return generic_file_read_iter(iocb, to); } @@ -103,13 +156,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } -static void ext4_unwritten_wait(struct inode *inode) -{ - wait_queue_head_t *wq = ext4_ioend_wq(inode); - - wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); -} - /* * This tests whether the IO in question is block-aligned or not. * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they @@ -162,13 +208,13 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + ret = generic_write_checks(iocb, from); if (ret <= 0) return ret; - if (unlikely(IS_IMMUTABLE(inode))) - return -EPERM; - /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. @@ -180,32 +226,301 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) return -EFBIG; iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } + + ret = file_modified(iocb->ki_filp); + if (ret) + return ret; + return iov_iter_count(from); } +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + ssize_t ret; + struct inode *inode = file_inode(iocb->ki_filp); + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + + inode_lock(inode); + ret = ext4_write_checks(iocb, from); + if (ret <= 0) + goto out; + + current->backing_dev_info = inode_to_bdi(inode); + ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos); + current->backing_dev_info = NULL; + +out: + inode_unlock(inode); + if (likely(ret > 0)) { + iocb->ki_pos += ret; + ret = generic_write_sync(iocb, ret); + } + + return ret; +} + +static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, + ssize_t written, size_t count) +{ + handle_t *handle; + bool truncate = false; + u8 blkbits = inode->i_blkbits; + ext4_lblk_t written_blk, end_blk; + + /* + * Note that EXT4_I(inode)->i_disksize can get extended up to + * inode->i_size while the I/O was running due to writeback of delalloc + * blocks. But, the code in ext4_iomap_alloc() is careful to use + * zeroed/unwritten extents if this is possible; thus we won't leave + * uninitialized blocks in a file even if we didn't succeed in writing + * as much as we intended. + */ + WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize); + if (offset + count <= EXT4_I(inode)->i_disksize) { + /* + * We need to ensure that the inode is removed from the orphan + * list if it has been added prematurely, due to writeback of + * delalloc blocks. + */ + if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + return PTR_ERR(handle); + } + + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + } + + return written; + } + + if (written < 0) + goto truncate; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + written = PTR_ERR(handle); + goto truncate; + } + + if (ext4_update_inode_size(inode, offset + written)) + ext4_mark_inode_dirty(handle, inode); + + /* + * We may need to truncate allocated but not written blocks beyond EOF. + */ + written_blk = ALIGN(offset + written, 1 << blkbits); + end_blk = ALIGN(offset + count, 1 << blkbits); + if (written_blk < end_blk && ext4_can_truncate(inode)) + truncate = true; + + /* + * Remove the inode from the orphan list if it has been extended and + * everything went OK. + */ + if (!truncate && inode->i_nlink) + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + + if (truncate) { +truncate: + ext4_truncate_failed_write(inode); + /* + * If the truncate operation failed early, then the inode may + * still be on the orphan list. In that case, we need to try + * remove the inode from the in-memory linked list. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return written; +} + +static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + loff_t offset = iocb->ki_pos; + struct inode *inode = file_inode(iocb->ki_filp); + + if (error) + return error; + + if (size && flags & IOMAP_DIO_UNWRITTEN) + return ext4_convert_unwritten_extents(NULL, inode, + offset, size); + + return 0; +} + +static const struct iomap_dio_ops ext4_dio_write_ops = { + .end_io = ext4_dio_write_end_io, +}; + +static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret; + size_t count; + loff_t offset; + handle_t *handle; + struct inode *inode = file_inode(iocb->ki_filp); + bool extend = false, overwrite = false, unaligned_aio = false; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + if (!ext4_dio_supported(inode)) { + inode_unlock(inode); + /* + * Fallback to buffered I/O if the inode does not support + * direct I/O. + */ + return ext4_buffered_write_iter(iocb, from); + } + + ret = ext4_write_checks(iocb, from); + if (ret <= 0) { + inode_unlock(inode); + return ret; + } + + /* + * Unaligned asynchronous direct I/O must be serialized among each + * other as the zeroing of partial blocks of two competing unaligned + * asynchronous direct I/O writes can result in data corruption. + */ + offset = iocb->ki_pos; + count = iov_iter_count(from); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) { + unaligned_aio = true; + inode_dio_wait(inode); + } + + /* + * Determine whether the I/O will overwrite allocated and initialized + * blocks. If so, check to see whether it is possible to take the + * dioread_nolock path. + */ + if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) && + ext4_should_dioread_nolock(inode)) { + overwrite = true; + downgrade_write(&inode->i_rwsem); + } + + if (offset + count > EXT4_I(inode)->i_disksize) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + + extend = true; + ext4_journal_stop(handle); + } + + ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops, + is_sync_kiocb(iocb) || unaligned_aio || extend); + + if (extend) + ret = ext4_handle_inode_extension(inode, offset, ret, count); + +out: + if (overwrite) + inode_unlock_shared(inode); + else + inode_unlock(inode); + + if (ret >= 0 && iov_iter_count(from)) { + ssize_t err; + loff_t endbyte; + + offset = iocb->ki_pos; + err = ext4_buffered_write_iter(iocb, from); + if (err < 0) + return err; + + /* + * We need to ensure that the pages within the page cache for + * the range covered by this I/O are written to disk and + * invalidated. This is in attempt to preserve the expected + * direct I/O semantics in the case we fallback to buffered I/O + * to complete off the I/O request. + */ + ret += err; + endbyte = offset + err - 1; + err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, + offset, endbyte); + if (!err) + invalidate_mapping_pages(iocb->ki_filp->f_mapping, + offset >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); + } + + return ret; +} + #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + size_t count; + loff_t offset; + handle_t *handle; + bool extend = false; + struct inode *inode = file_inode(iocb->ki_filp); if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; inode_lock(inode); } + ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; - ret = file_remove_privs(iocb->ki_filp); - if (ret) - goto out; - ret = file_update_time(iocb->ki_filp); - if (ret) - goto out; + + offset = iocb->ki_pos; + count = iov_iter_count(from); + + if (offset + count > EXT4_I(inode)->i_disksize) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + + extend = true; + ext4_journal_stop(handle); + } ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); + + if (extend) + ret = ext4_handle_inode_extension(inode, offset, ret, count); out: inode_unlock(inode); if (ret > 0) @@ -218,10 +533,6 @@ static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - int o_direct = iocb->ki_flags & IOCB_DIRECT; - int unaligned_aio = 0; - int overwrite = 0; - ssize_t ret; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; @@ -230,59 +541,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif + if (iocb->ki_flags & IOCB_DIRECT) + return ext4_dio_write_iter(iocb, from); - if (!inode_trylock(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) - return -EAGAIN; - inode_lock(inode); - } - - ret = ext4_write_checks(iocb, from); - if (ret <= 0) - goto out; - - /* - * Unaligned direct AIO must be serialized among each other as zeroing - * of partial blocks of two competing unaligned AIOs can result in data - * corruption. - */ - if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && - !is_sync_kiocb(iocb) && - ext4_unaligned_aio(inode, from, iocb->ki_pos)) { - unaligned_aio = 1; - ext4_unwritten_wait(inode); - } - - iocb->private = &overwrite; - /* Check whether we do a DIO overwrite or not */ - if (o_direct && !unaligned_aio) { - if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { - if (ext4_should_dioread_nolock(inode)) - overwrite = 1; - } else if (iocb->ki_flags & IOCB_NOWAIT) { - ret = -EAGAIN; - goto out; - } - } - - ret = __generic_file_write_iter(iocb, from); - /* - * Unaligned direct AIO must be the only IO in flight. Otherwise - * overlapping aligned IO after unaligned might result in data - * corruption. - */ - if (ret == -EIOCBQUEUED && unaligned_aio) - ext4_unwritten_wait(inode); - inode_unlock(inode); - - if (ret > 0) - ret = generic_write_sync(iocb, ret); - - return ret; - -out: - inode_unlock(inode); - return ret; + return ext4_buffered_write_iter(iocb, from); } #ifdef CONFIG_FS_DAX @@ -494,12 +756,14 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) maxbytes, i_size_read(inode)); case SEEK_HOLE: inode_lock_shared(inode); - offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops); + offset = iomap_seek_hole(inode, offset, + &ext4_iomap_report_ops); inode_unlock_shared(inode); break; case SEEK_DATA: inode_lock_shared(inode); - offset = iomap_seek_data(inode, offset, &ext4_iomap_ops); + offset = iomap_seek_data(inode, offset, + &ext4_iomap_report_ops); inode_unlock_shared(inode); break; } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 5508baa11bb6..e10206e7f4bb 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -80,6 +80,43 @@ static int ext4_sync_parent(struct inode *inode) return ret; } +static int ext4_fsync_nojournal(struct inode *inode, bool datasync, + bool *needs_barrier) +{ + int ret, err; + + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY_ALL)) + return ret; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return ret; + + err = sync_inode_metadata(inode, 1); + if (!ret) + ret = err; + + if (!ret) + ret = ext4_sync_parent(inode); + if (test_opt(inode->i_sb, BARRIER)) + *needs_barrier = true; + + return ret; +} + +static int ext4_fsync_journal(struct inode *inode, bool datasync, + bool *needs_barrier) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; + + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + *needs_barrier = true; + + return jbd2_complete_transaction(journal, commit_tid); +} + /* * akpm: A new design for ext4_sync_file(). * @@ -91,17 +128,14 @@ static int ext4_sync_parent(struct inode *inode) * What we do is just kick off a commit and wait on it. This will snapshot the * inode to disk. */ - int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; int ret = 0, err; - tid_t commit_tid; bool needs_barrier = false; + struct inode *inode = file->f_mapping->host; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(sbi))) return -EIO; J_ASSERT(ext4_journal_current_handle() == NULL); @@ -111,23 +145,15 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (sb_rdonly(inode->i_sb)) { /* Make sure that we read updated s_mount_flags value */ smp_rmb(); - if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) ret = -EROFS; goto out; } - if (!journal) { - ret = __generic_file_fsync(file, start, end, datasync); - if (!ret) - ret = ext4_sync_parent(inode); - if (test_opt(inode->i_sb, BARRIER)) - goto issue_flush; - goto out; - } - ret = file_write_and_wait_range(file, start, end); if (ret) return ret; + /* * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. @@ -142,18 +168,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext4_should_journal_data(inode)) { + if (!sbi->s_journal) + ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier); + else if (ext4_should_journal_data(inode)) ret = ext4_force_commit(inode->i_sb); - goto out; - } + else + ret = ext4_fsync_journal(inode, datasync, &needs_barrier); - commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; - if (journal->j_flags & JBD2_BARRIER && - !jbd2_trans_will_send_data_barrier(journal, commit_tid)) - needs_barrier = true; - ret = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { - issue_flush: err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); if (!ret) ret = err; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 764ff4c56233..dc333e8e51e8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -265,13 +265,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) ext4_debug("freeing inode %lu\n", ino); trace_ext4_free_inode(inode); - /* - * Note: we must free any quota before locking the superblock, - * as writing the quota to disk may need the lock as well. - */ dquot_initialize(inode); dquot_free_inode(inode); - dquot_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -927,7 +922,7 @@ repeat_in_this_group: BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, handle_type, nblocks, - 0); + 0, 0); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_std_error(sb, err); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 36699a131168..3a4ab70fe9e0 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -331,11 +331,14 @@ static int ext4_alloc_branch(handle_t *handle, for (i = 0; i <= indirect_blks; i++) { if (i == indirect_blks) { new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); - } else + } else { ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, ar->inode, ar->goal, ar->flags & EXT4_MB_DELALLOC_RESERVED, NULL, &err); + /* Simplify error cleanup... */ + branch[i+1].bh = NULL; + } if (err) { i--; goto failed; @@ -377,18 +380,25 @@ static int ext4_alloc_branch(handle_t *handle, } return 0; failed: + if (i == indirect_blks) { + /* Free data blocks */ + ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], + ar->len, 0); + i--; + } for (; i >= 0; i--) { /* * We want to ext4_forget() only freshly allocated indirect - * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and - * buffer at branch[0].bh is indirect block / inode already - * existing before ext4_alloc_branch() was called. + * blocks. Buffer for new_blocks[i] is at branch[i+1].bh + * (buffer at branch[0].bh is indirect block / inode already + * existing before ext4_alloc_branch() was called). Also + * because blocks are freshly allocated, we don't need to + * revoke them which is why we don't set + * EXT4_FREE_BLOCKS_METADATA. */ - if (i > 0 && i != indirect_blks && branch[i].bh) - ext4_forget(handle, 1, ar->inode, branch[i].bh, - branch[i].bh->b_blocknr); - ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], - (i == indirect_blks) ? ar->len : 1, 0); + ext4_free_blocks(handle, ar->inode, branch[i+1].bh, + new_blocks[i], 1, + branch[i+1].bh ? EXT4_FREE_BLOCKS_FORGET : 0); } return err; } @@ -689,27 +699,63 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; } +static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int *dropped) +{ + int err; + + if (bh) { + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) + return err; + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) + return err; + /* + * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_mutex. So we can safely drop the i_data_sem here. + */ + BUG_ON(EXT4_JOURNAL(inode) == NULL); + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + *dropped = 1; + return 0; +} + /* * Truncate transactions can be complex and absolutely huge. So we need to * be able to restart the transaction at a conventient checkpoint to make * sure we don't overflow the journal. * * Try to extend this transaction for the purposes of truncation. If - * extend fails, we need to propagate the failure up and restart the - * transaction in the top-level truncate loop. --sct - * - * Returns 0 if we managed to create more room. If we can't create more - * room, and the transaction must be restarted we return 1. + * extend fails, we restart transaction. */ -static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +static int ext4_ind_truncate_ensure_credits(handle_t *handle, + struct inode *inode, + struct buffer_head *bh, + int revoke_creds) { - if (!ext4_handle_valid(handle)) - return 0; - if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) - return 0; - if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) - return 0; - return 1; + int ret; + int dropped = 0; + + ret = ext4_journal_ensure_credits_fn(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_blocks_for_truncate(inode), revoke_creds, + ext4_ind_trunc_restart_fn(handle, inode, bh, &dropped)); + if (dropped) + down_write(&EXT4_I(inode)->i_data_sem); + if (ret <= 0) + return ret; + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + ret = ext4_journal_get_write_access(handle, bh); + if (unlikely(ret)) + return ret; + } + return 0; } /* @@ -844,27 +890,10 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, return 1; } - if (try_to_extend_transaction(handle, inode)) { - if (bh) { - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (unlikely(err)) - goto out_err; - } - err = ext4_mark_inode_dirty(handle, inode); - if (unlikely(err)) - goto out_err; - err = ext4_truncate_restart_trans(handle, inode, - ext4_blocks_for_truncate(inode)); - if (unlikely(err)) - goto out_err; - if (bh) { - BUFFER_TRACE(bh, "retaking write access"); - err = ext4_journal_get_write_access(handle, bh); - if (unlikely(err)) - goto out_err; - } - } + err = ext4_ind_truncate_ensure_credits(handle, inode, bh, + ext4_free_data_revoke_credits(inode, count)); + if (err < 0) + goto out_err; for (p = first; p < last; p++) *p = 0; @@ -1047,11 +1076,11 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, */ if (ext4_handle_is_aborted(handle)) return; - if (try_to_extend_transaction(handle, inode)) { - ext4_mark_inode_dirty(handle, inode); - ext4_truncate_restart_trans(handle, inode, - ext4_blocks_for_truncate(inode)); - } + if (ext4_ind_truncate_ensure_credits(handle, inode, + NULL, + ext4_free_metadata_revoke_credits( + inode->i_sb, 1)) < 0) + return; /* * The forget flag here is critical because if diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1d880ae90c1f..28f28de0c1b6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -163,32 +163,6 @@ int ext4_inode_is_fast_symlink(struct inode *inode) (inode->i_size < EXT4_N_BLOCKS * 4); } -/* - * Restart the transaction associated with *handle. This does a commit, - * so before we call here everything must be consistently dirtied against - * this transaction. - */ -int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, - int nblocks) -{ - int ret; - - /* - * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this - * moment, get_block can be called only for blocks inside i_size since - * page cache has been already dropped and writes are blocked by - * i_mutex. So we can safely drop the i_data_sem here. - */ - BUG_ON(EXT4_JOURNAL(inode) == NULL); - jbd_debug(2, "restarting handle %p\n", handle); - up_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_journal_restart(handle, nblocks); - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); - - return ret; -} - /* * Called at the last iput() if i_nlink is zero. */ @@ -196,7 +170,12 @@ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; - int extra_credits = 3; + /* + * Credits for final inode cleanup and freeing: + * sb + inode (ext4_orphan_del()), block bitmap, group descriptor + * (xattr block freeing), bitmap, group descriptor (inode freeing) + */ + int extra_credits = 6; struct ext4_xattr_inode_array *ea_inode_array = NULL; trace_ext4_evict_inode(inode); @@ -252,8 +231,12 @@ void ext4_evict_inode(struct inode *inode) if (!IS_NOQUOTA(inode)) extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); + /* + * Block bitmap, group descriptor, and inode are accounted in both + * ext4_blocks_for_truncate() and extra_credits. So subtract 3. + */ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, - ext4_blocks_for_truncate(inode)+extra_credits); + ext4_blocks_for_truncate(inode) + extra_credits - 3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -826,136 +809,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 -/* - * Get blocks function for the cases that need to start a transaction - - * generally difference cases of direct IO and DAX IO. It also handles retries - * in case of ENOSPC. - */ -static int ext4_get_block_trans(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int flags) -{ - int dio_credits; - handle_t *handle; - int retries = 0; - int ret; - - /* Trim mapping request to maximum we can map at once for DIO */ - if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS) - bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits; - dio_credits = ext4_chunk_trans_blocks(inode, - bh_result->b_size >> inode->i_blkbits); -retry: - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - ret = _ext4_get_block(inode, iblock, bh_result, flags); - ext4_journal_stop(handle); - - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - return ret; -} - -/* Get block function for DIO reads and writes to inodes without extents */ -int ext4_dio_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - if (!create) - return _ext4_get_block(inode, iblock, bh, 0); - return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE); -} - -/* - * Get block function for AIO DIO writes when we create unwritten extent if - * blocks are not allocated yet. The extent will be converted to written - * after IO is complete. - */ -static int ext4_dio_get_block_unwritten_async(struct inode *inode, - sector_t iblock, struct buffer_head *bh_result, int create) -{ - int ret; - - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); - - /* - * When doing DIO using unwritten extents, we need io_end to convert - * unwritten extents to written on IO completion. We allocate io_end - * once we spot unwritten extent and store it in b_private. Generic - * DIO code keeps b_private set and furthermore passes the value to - * our completion callback in 'private' argument. - */ - if (!ret && buffer_unwritten(bh_result)) { - if (!bh_result->b_private) { - ext4_io_end_t *io_end; - - io_end = ext4_init_io_end(inode, GFP_KERNEL); - if (!io_end) - return -ENOMEM; - bh_result->b_private = io_end; - ext4_set_io_unwritten_flag(inode, io_end); - } - set_buffer_defer_completion(bh_result); - } - - return ret; -} - -/* - * Get block function for non-AIO DIO writes when we create unwritten extent if - * blocks are not allocated yet. The extent will be converted to written - * after IO is complete by ext4_direct_IO_write(). - */ -static int ext4_dio_get_block_unwritten_sync(struct inode *inode, - sector_t iblock, struct buffer_head *bh_result, int create) -{ - int ret; - - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); - - /* - * Mark inode as having pending DIO writes to unwritten extents. - * ext4_direct_IO_write() checks this flag and converts extents to - * written. - */ - if (!ret && buffer_unwritten(bh_result)) - ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - - return ret; -} - -static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - - ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n", - inode->i_ino, create); - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - ret = _ext4_get_block(inode, iblock, bh_result, 0); - /* - * Blocks should have been preallocated! ext4_file_write_iter() checks - * that. - */ - WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result)); - - return ret; -} - - /* * `handle' can be NULL if create is zero */ @@ -2340,6 +2193,79 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, return lblk < blocks; } +/* + * mpage_process_page - update page buffers corresponding to changed extent and + * may submit fully mapped page for IO + * + * @mpd - description of extent to map, on return next extent to map + * @m_lblk - logical block mapping. + * @m_pblk - corresponding physical mapping. + * @map_bh - determines on return whether this page requires any further + * mapping or not. + * Scan given page buffers corresponding to changed extent and update buffer + * state according to new extent state. + * We map delalloc buffers to their physical location, clear unwritten bits. + * If the given page is not fully mapped, we update @map to the next extent in + * the given page that needs mapping & return @map_bh as true. + */ +static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, + ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, + bool *map_bh) +{ + struct buffer_head *head, *bh; + ext4_io_end_t *io_end = mpd->io_submit.io_end; + ext4_lblk_t lblk = *m_lblk; + ext4_fsblk_t pblock = *m_pblk; + int err = 0; + int blkbits = mpd->inode->i_blkbits; + ssize_t io_end_size = 0; + struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); + + bh = head = page_buffers(page); + do { + if (lblk < mpd->map.m_lblk) + continue; + if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { + /* + * Buffer after end of mapped extent. + * Find next buffer in the page to map. + */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + io_end_vec->size += io_end_size; + io_end_size = 0; + + err = mpage_process_page_bufs(mpd, head, bh, lblk); + if (err > 0) + err = 0; + if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) { + io_end_vec = ext4_alloc_io_end_vec(io_end); + if (IS_ERR(io_end_vec)) { + err = PTR_ERR(io_end_vec); + goto out; + } + io_end_vec->offset = mpd->map.m_lblk << blkbits; + } + *map_bh = true; + goto out; + } + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock++; + } + clear_buffer_unwritten(bh); + io_end_size += (1 << blkbits); + } while (lblk++, (bh = bh->b_this_page) != head); + + io_end_vec->size += io_end_size; + io_end_size = 0; + *map_bh = false; +out: + *m_lblk = lblk; + *m_pblk = pblock; + return err; +} + /* * mpage_map_buffers - update buffers corresponding to changed extent and * submit fully mapped pages for IO @@ -2359,12 +2285,12 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) struct pagevec pvec; int nr_pages, i; struct inode *inode = mpd->inode; - struct buffer_head *head, *bh; int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; - sector_t pblock; + ext4_fsblk_t pblock; int err; + bool map_bh = false; start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; @@ -2380,50 +2306,19 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - bh = head = page_buffers(page); - do { - if (lblk < mpd->map.m_lblk) - continue; - if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { - /* - * Buffer after end of mapped extent. - * Find next buffer in the page to map. - */ - mpd->map.m_len = 0; - mpd->map.m_flags = 0; - /* - * FIXME: If dioread_nolock supports - * blocksize < pagesize, we need to make - * sure we add size mapped so far to - * io_end->size as the following call - * can submit the page for IO. - */ - err = mpage_process_page_bufs(mpd, head, - bh, lblk); - pagevec_release(&pvec); - if (err > 0) - err = 0; - return err; - } - if (buffer_delay(bh)) { - clear_buffer_delay(bh); - bh->b_blocknr = pblock++; - } - clear_buffer_unwritten(bh); - } while (lblk++, (bh = bh->b_this_page) != head); - + err = mpage_process_page(mpd, page, &lblk, &pblock, + &map_bh); /* - * FIXME: This is going to break if dioread_nolock - * supports blocksize < pagesize as we will try to - * convert potentially unmapped parts of inode. + * If map_bh is true, means page may require further bh + * mapping, or maybe the page was submitted for IO. + * So we return to call further extent mapping. */ - mpd->io_submit.io_end->size += PAGE_SIZE; + if (err < 0 || map_bh == true) + goto out; /* Page fully mapped - let IO run! */ err = mpage_submit_page(mpd, page); - if (err < 0) { - pagevec_release(&pvec); - return err; - } + if (err < 0) + goto out; } pagevec_release(&pvec); } @@ -2431,6 +2326,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; +out: + pagevec_release(&pvec); + return err; } static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) @@ -2510,9 +2408,13 @@ static int mpage_map_and_submit_extent(handle_t *handle, int err; loff_t disksize; int progress = 0; + ext4_io_end_t *io_end = mpd->io_submit.io_end; + struct ext4_io_end_vec *io_end_vec; - mpd->io_submit.io_end->offset = - ((loff_t)map->m_lblk) << inode->i_blkbits; + io_end_vec = ext4_alloc_io_end_vec(io_end); + if (IS_ERR(io_end_vec)) + return PTR_ERR(io_end_vec); + io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { @@ -3406,148 +3308,142 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) return inode->i_state & I_DIRTY_DATASYNC; } +static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, + struct ext4_map_blocks *map, loff_t offset, + loff_t length) +{ + u8 blkbits = inode->i_blkbits; + + /* + * Writes that span EOF might trigger an I/O size update on completion, + * so consider them to be dirty for the purpose of O_DSYNC, even if + * there is no other metadata changes being made or are pending. + */ + iomap->flags = 0; + if (ext4_inode_datasync_dirty(inode) || + offset + length > i_size_read(inode)) + iomap->flags |= IOMAP_F_DIRTY; + + if (map->m_flags & EXT4_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; + + iomap->bdev = inode->i_sb->s_bdev; + iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; + iomap->offset = (u64) map->m_lblk << blkbits; + iomap->length = (u64) map->m_len << blkbits; + + /* + * Flags passed to ext4_map_blocks() for direct I/O writes can result + * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits + * set. In order for any allocated unwritten extents to be converted + * into written extents correctly within the ->end_io() handler, we + * need to ensure that the iomap->type is set appropriately. Hence, the + * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has + * been set first. + */ + if (map->m_flags & EXT4_MAP_UNWRITTEN) { + iomap->type = IOMAP_UNWRITTEN; + iomap->addr = (u64) map->m_pblk << blkbits; + } else if (map->m_flags & EXT4_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + iomap->addr = (u64) map->m_pblk << blkbits; + } else { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + } +} + +static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, + unsigned int flags) +{ + handle_t *handle; + u8 blkbits = inode->i_blkbits; + int ret, dio_credits, m_flags = 0, retries = 0; + + /* + * Trim the mapping request to the maximum value that we can map at + * once for direct I/O. + */ + if (map->m_len > DIO_MAX_BLOCKS) + map->m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + +retry: + /* + * Either we allocate blocks and then don't get an unwritten extent, so + * in that case we have reserved enough credits. Or, the blocks are + * already allocated and unwritten. In that case, the extent conversion + * fits into the credits as well. + */ + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + /* + * DAX and direct I/O are the only two operations that are currently + * supported with IOMAP_WRITE. + */ + WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT)); + if (IS_DAX(inode)) + m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; + /* + * We use i_size instead of i_disksize here because delalloc writeback + * can complete at any point during the I/O and subsequently push the + * i_disksize out to i_size. This could be beyond where direct I/O is + * happening and thus expose allocated blocks to direct I/O reads. + */ + else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode)) + m_flags = EXT4_GET_BLOCKS_CREATE; + else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; + + ret = ext4_map_blocks(handle, inode, map, m_flags); + + /* + * We cannot fill holes in indirect tree based inodes as that could + * expose stale data in the case of a crash. Use the magic error code + * to fallback to buffered I/O. + */ + if (!m_flags && !ret) + ret = -ENOTBLK; + + ext4_journal_stop(handle); + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + return ret; +} + + static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned int blkbits = inode->i_blkbits; - unsigned long first_block, last_block; - struct ext4_map_blocks map; - bool delalloc = false; int ret; + struct ext4_map_blocks map; + u8 blkbits = inode->i_blkbits; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; - first_block = offset >> blkbits; - last_block = min_t(loff_t, (offset + length - 1) >> blkbits, - EXT4_MAX_LOGICAL_BLOCK); - if (flags & IOMAP_REPORT) { - if (ext4_has_inline_data(inode)) { - ret = ext4_inline_data_iomap(inode, iomap); - if (ret != -EAGAIN) { - if (ret == 0 && offset >= iomap->length) - ret = -ENOENT; - return ret; - } - } - } else { - if (WARN_ON_ONCE(ext4_has_inline_data(inode))) - return -ERANGE; - } + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) + return -ERANGE; - map.m_lblk = first_block; - map.m_len = last_block - first_block + 1; + /* + * Calculate the first and last logical blocks respectively. + */ + map.m_lblk = offset >> blkbits; + map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, + EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; - if (flags & IOMAP_REPORT) { + if (flags & IOMAP_WRITE) + ret = ext4_iomap_alloc(inode, &map, flags); + else ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) - return ret; - if (ret == 0) { - ext4_lblk_t end = map.m_lblk + map.m_len - 1; - struct extent_status es; + if (ret < 0) + return ret; - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, - map.m_lblk, end, &es); - - if (!es.es_len || es.es_lblk > end) { - /* entire range is a hole */ - } else if (es.es_lblk > map.m_lblk) { - /* range starts with a hole */ - map.m_len = es.es_lblk - map.m_lblk; - } else { - ext4_lblk_t offs = 0; - - if (es.es_lblk < map.m_lblk) - offs = map.m_lblk - es.es_lblk; - map.m_lblk = es.es_lblk + offs; - map.m_len = es.es_len - offs; - delalloc = true; - } - } - } else if (flags & IOMAP_WRITE) { - int dio_credits; - handle_t *handle; - int retries = 0; - - /* Trim mapping request to maximum we can map at once for DIO */ - if (map.m_len > DIO_MAX_BLOCKS) - map.m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); -retry: - /* - * Either we allocate blocks and then we don't get unwritten - * extent so we have reserved enough credits, or the blocks - * are already allocated and unwritten and in that case - * extent conversion fits in the credits as well. - */ - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, - dio_credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_CREATE_ZERO); - if (ret < 0) { - ext4_journal_stop(handle); - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - return ret; - } - - /* - * If we added blocks beyond i_size, we need to make sure they - * will get truncated if we crash before updating i_size in - * ext4_iomap_end(). For faults we don't need to do that (and - * even cannot because for orphan list operations inode_lock is - * required) - if we happen to instantiate block beyond i_size, - * it is because we race with truncate which has already added - * the inode to the orphan list. - */ - if (!(flags & IOMAP_FAULT) && first_block + map.m_len > - (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) { - int err; - - err = ext4_orphan_add(handle, inode); - if (err < 0) { - ext4_journal_stop(handle); - return err; - } - } - ext4_journal_stop(handle); - } else { - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) - return ret; - } - - iomap->flags = 0; - if (ext4_inode_datasync_dirty(inode)) - iomap->flags |= IOMAP_F_DIRTY; - iomap->bdev = inode->i_sb->s_bdev; - iomap->dax_dev = sbi->s_daxdev; - iomap->offset = (u64)first_block << blkbits; - iomap->length = (u64)map.m_len << blkbits; - - if (ret == 0) { - iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE; - iomap->addr = IOMAP_NULL_ADDR; - } else { - if (map.m_flags & EXT4_MAP_MAPPED) { - iomap->type = IOMAP_MAPPED; - } else if (map.m_flags & EXT4_MAP_UNWRITTEN) { - iomap->type = IOMAP_UNWRITTEN; - } else { - WARN_ON_ONCE(1); - return -EIO; - } - iomap->addr = (u64)map.m_pblk << blkbits; - } - - if (map.m_flags & EXT4_MAP_NEW) - iomap->flags |= IOMAP_F_NEW; + ext4_set_iomap(inode, iomap, &map, offset, length); return 0; } @@ -3555,53 +3451,17 @@ retry: static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { - int ret = 0; - handle_t *handle; - int blkbits = inode->i_blkbits; - bool truncate = false; - - if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) - return 0; - - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto orphan_del; - } - if (ext4_update_inode_size(inode, offset + written)) - ext4_mark_inode_dirty(handle, inode); /* - * We may need to truncate allocated but not written blocks beyond EOF. + * Check to see whether an error occurred while writing out the data to + * the allocated blocks. If so, return the magic error code so that we + * fallback to buffered I/O and attempt to complete the remainder of + * the I/O. Any blocks that may have been allocated in preparation for + * the direct I/O will be reused during buffered I/O. */ - if (iomap->offset + iomap->length > - ALIGN(inode->i_size, 1 << blkbits)) { - ext4_lblk_t written_blk, end_blk; + if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) + return -ENOTBLK; - written_blk = (offset + written) >> blkbits; - end_blk = (offset + length) >> blkbits; - if (written_blk < end_blk && ext4_can_truncate(inode)) - truncate = true; - } - /* - * Remove inode from orphan list if we were extending a inode and - * everything went fine. - */ - if (!truncate && inode->i_nlink && - !list_empty(&EXT4_I(inode)->i_orphan)) - ext4_orphan_del(handle, inode); - ext4_journal_stop(handle); - if (truncate) { - ext4_truncate_failed_write(inode); -orphan_del: - /* - * If truncate failed early the inode might still be on the - * orphan list; we need to make sure the inode is removed from - * the orphan list in that case. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } - return ret; + return 0; } const struct iomap_ops ext4_iomap_ops = { @@ -3609,269 +3469,73 @@ const struct iomap_ops ext4_iomap_ops = { .iomap_end = ext4_iomap_end, }; -static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private) +static bool ext4_iomap_is_delalloc(struct inode *inode, + struct ext4_map_blocks *map) { - ext4_io_end_t *io_end = private; + struct extent_status es; + ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1; - /* if not async direct IO just return */ - if (!io_end) - return 0; + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, + map->m_lblk, end, &es); - ext_debug("ext4_end_io_dio(): io_end 0x%p " - "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", - io_end, io_end->inode->i_ino, iocb, offset, size); + if (!es.es_len || es.es_lblk > end) + return false; + + if (es.es_lblk > map->m_lblk) { + map->m_len = es.es_lblk - map->m_lblk; + return false; + } + + offset = map->m_lblk - es.es_lblk; + map->m_len = es.es_len - offset; + + return true; +} + +static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) +{ + int ret; + bool delalloc = false; + struct ext4_map_blocks map; + u8 blkbits = inode->i_blkbits; + + if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) + return -EINVAL; + + if (ext4_has_inline_data(inode)) { + ret = ext4_inline_data_iomap(inode, iomap); + if (ret != -EAGAIN) { + if (ret == 0 && offset >= iomap->length) + ret = -ENOENT; + return ret; + } + } /* - * Error during AIO DIO. We cannot convert unwritten extents as the - * data was not written. Just clear the unwritten flag and drop io_end. + * Calculate the first and last logical block respectively. */ - if (size <= 0) { - ext4_clear_io_unwritten_flag(io_end); - size = 0; - } - io_end->offset = offset; - io_end->size = size; - ext4_put_io_end(io_end); + map.m_lblk = offset >> blkbits; + map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, + EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + if (ret == 0) + delalloc = ext4_iomap_is_delalloc(inode, &map); + + ext4_set_iomap(inode, iomap, &map, offset, length); + if (delalloc && iomap->type == IOMAP_HOLE) + iomap->type = IOMAP_DELALLOC; return 0; } -/* - * Handling of direct IO writes. - * - * For ext4 extent files, ext4 will do direct-io write even to holes, - * preallocated extents, and those write extend the file, no need to - * fall back to buffered IO. - * - * For holes, we fallocate those blocks, mark them as unwritten - * If those blocks were preallocated, we mark sure they are split, but - * still keep the range to write as unwritten. - * - * The unwritten extents will be converted to written when DIO is completed. - * For async direct IO, since the IO may still pending when return, we - * set up an end_io call back function, which will do the conversion - * when async direct IO completed. - * - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. - * - */ -static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - ssize_t ret; - loff_t offset = iocb->ki_pos; - size_t count = iov_iter_count(iter); - int overwrite = 0; - get_block_t *get_block_func = NULL; - int dio_flags = 0; - loff_t final_size = offset + count; - int orphan = 0; - handle_t *handle; - - if (final_size > inode->i_size || final_size > ei->i_disksize) { - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - orphan = 1; - ext4_update_i_disksize(inode, inode->i_size); - ext4_journal_stop(handle); - } - - BUG_ON(iocb->private == NULL); - - /* - * Make all waiters for direct IO properly wait also for extent - * conversion. This also disallows race between truncate() and - * overwrite DIO as i_dio_count needs to be incremented under i_mutex. - */ - inode_dio_begin(inode); - - /* If we do a overwrite dio, i_mutex locking can be released */ - overwrite = *((int *)iocb->private); - - if (overwrite) - inode_unlock(inode); - - /* - * For extent mapped files we could direct write to holes and fallocate. - * - * Allocated blocks to fill the hole are marked as unwritten to prevent - * parallel buffered read to expose the stale data before DIO complete - * the data IO. - * - * As to previously fallocated extents, ext4 get_block will just simply - * mark the buffer mapped but still keep the extents unwritten. - * - * For non AIO case, we will convert those unwritten extents to written - * after return back from blockdev_direct_IO. That way we save us from - * allocating io_end structure and also the overhead of offloading - * the extent convertion to a workqueue. - * - * For async DIO, the conversion needs to be deferred when the - * IO is completed. The ext4 end_io callback function will be - * called to take care of the conversion work. Here for async - * case, we allocate an io_end structure to hook to the iocb. - */ - iocb->private = NULL; - if (overwrite) - get_block_func = ext4_dio_get_block_overwrite; - else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || - round_down(offset, i_blocksize(inode)) >= inode->i_size) { - get_block_func = ext4_dio_get_block; - dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; - } else if (is_sync_kiocb(iocb)) { - get_block_func = ext4_dio_get_block_unwritten_sync; - dio_flags = DIO_LOCKING; - } else { - get_block_func = ext4_dio_get_block_unwritten_async; - dio_flags = DIO_LOCKING; - } - ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - get_block_func, ext4_end_io_dio, NULL, - dio_flags); - - if (ret > 0 && !overwrite && ext4_test_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN)) { - int err; - /* - * for non AIO case, since the IO is already - * completed, we could do the conversion right here - */ - err = ext4_convert_unwritten_extents(NULL, inode, - offset, ret); - if (err < 0) - ret = err; - ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - } - - inode_dio_end(inode); - /* take i_mutex locking again if we do a ovewrite dio */ - if (overwrite) - inode_lock(inode); - - if (ret < 0 && final_size > inode->i_size) - ext4_truncate_failed_write(inode); - - /* Handle extending of i_size after direct IO write */ - if (orphan) { - int err; - - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - /* - * We wrote the data but cannot extend - * i_size. Bail out. In async io case, we do - * not return error here because we have - * already submmitted the corresponding - * bio. Returning error here makes the caller - * think that this IO is done and failed - * resulting in race with bio's completion - * handler. - */ - if (!ret) - ret = PTR_ERR(handle); - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - - goto out; - } - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size || end > ei->i_disksize) { - ext4_update_i_disksize(inode, end); - if (end > inode->i_size) - i_size_write(inode, end); - /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext4_mark_inode_dirty() to userspace. So - * ignore it. - */ - ext4_mark_inode_dirty(handle, inode); - } - } - err = ext4_journal_stop(handle); - if (ret == 0) - ret = err; - } -out: - return ret; -} - -static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) -{ - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - size_t count = iov_iter_count(iter); - ssize_t ret; - - /* - * Shared inode_lock is enough for us - it protects against concurrent - * writes & truncates and since we take care of writing back page cache, - * we are protected against page writeback as well. - */ - inode_lock_shared(inode); - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1); - if (ret) - goto out_unlock; - ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, ext4_dio_get_block, NULL, NULL, 0); -out_unlock: - inode_unlock_shared(inode); - return ret; -} - -static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - size_t count = iov_iter_count(iter); - loff_t offset = iocb->ki_pos; - ssize_t ret; - -#ifdef CONFIG_FS_ENCRYPTION - if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) - return 0; -#endif - if (fsverity_active(inode)) - return 0; - - /* - * If we are doing data journalling we don't support O_DIRECT - */ - if (ext4_should_journal_data(inode)) - return 0; - - /* Let buffer I/O handle the inline data case. */ - if (ext4_has_inline_data(inode)) - return 0; - - trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - if (iov_iter_rw(iter) == READ) - ret = ext4_direct_IO_read(iocb, iter); - else - ret = ext4_direct_IO_write(iocb, iter); - trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); - return ret; -} +const struct iomap_ops ext4_iomap_report_ops = { + .iomap_begin = ext4_iomap_begin_report, +}; /* * Pages can be marked dirty completely asynchronously from ext4's journalling @@ -3910,7 +3574,7 @@ static const struct address_space_operations ext4_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, + .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, @@ -3927,7 +3591,7 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_journalled_invalidatepage, .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, + .direct_IO = noop_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -3943,7 +3607,7 @@ static const struct address_space_operations ext4_da_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, + .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, @@ -5450,11 +5114,15 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) offset = inode->i_size & (PAGE_SIZE - 1); /* - * All buffers in the last page remain valid? Then there's nothing to - * do. We do the check mainly to optimize the common PAGE_SIZE == - * blocksize case + * If the page is fully truncated, we don't need to wait for any commit + * (and we even should not as __ext4_journalled_invalidatepage() may + * strip all buffers from the page but keep the page dirty which can then + * confuse e.g. concurrent ext4_writepage() seeing dirty page without + * buffers). Also we don't need to wait for any commit if all buffers in + * the page remain valid. This is most beneficial for the common case of + * blocksize == PAGESIZE. */ - if (offset > PAGE_SIZE - i_blocksize(inode)) + if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) return; while (1) { page = find_lock_page(inode->i_mapping, @@ -5915,8 +5583,23 @@ static int __ext4_expand_extra_isize(struct inode *inode, { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; + unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); int error; + /* this was checked at iget time, but double check for good measure */ + if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) || + (ei->i_extra_isize & 3)) { + EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)", + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); + return -EFSCORRUPTED; + } + if ((new_extra_isize < ei->i_extra_isize) || + (new_extra_isize < 4) || + (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE)) + return -EINVAL; /* Should never happen */ + raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); @@ -5968,9 +5651,8 @@ static int ext4_try_to_expand_extra_isize(struct inode *inode, * If this is felt to be critical, then e2fsck should be run to * force a large enough s_min_extra_isize. */ - if (ext4_handle_valid(handle) && - jbd2_journal_extend(handle, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0) + if (ext4_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0) return -ENOSPC; if (ext4_write_trylock_xattr(inode, &no_expand) == 0) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b1e4d359f73b..89725fa42573 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -50,29 +50,9 @@ static int finish_range(handle_t *handle, struct inode *inode, needed = ext4_ext_calc_credits_for_single_extent(inode, lb->last_block - lb->first_block + 1, path); - /* - * Make sure the credit we accumalated is not really high - */ - if (needed && ext4_handle_has_enough_credits(handle, - EXT4_RESERVE_TRANS_BLOCKS)) { - up_write((&EXT4_I(inode)->i_data_sem)); - retval = ext4_journal_restart(handle, needed); - down_write((&EXT4_I(inode)->i_data_sem)); - if (retval) - goto err_out; - } else if (needed) { - retval = ext4_journal_extend(handle, needed); - if (retval) { - /* - * IF not able to extend the journal restart the journal - */ - up_write((&EXT4_I(inode)->i_data_sem)); - retval = ext4_journal_restart(handle, needed); - down_write((&EXT4_I(inode)->i_data_sem)); - if (retval) - goto err_out; - } - } + retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0); + if (retval < 0) + goto err_out; retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); err_out: up_write((&EXT4_I(inode)->i_data_sem)); @@ -196,42 +176,30 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode, } -static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) -{ - int retval = 0, needed; - - if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) - return 0; - /* - * We are freeing a blocks. During this we touch - * superblock, group descriptor and block bitmap. - * So allocate a credit of 3. We may update - * quota (user and group). - */ - needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); - - if (ext4_journal_extend(handle, needed) != 0) - retval = ext4_journal_restart(handle, needed); - - return retval; -} - static int free_dind_blocks(handle_t *handle, struct inode *inode, __le32 i_data) { int i; __le32 *tmp_idata; struct buffer_head *bh; + struct super_block *sb = inode->i_sb; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + int err; - bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); + bh = ext4_sb_bread(sb, le32_to_cpu(i_data), 0); if (IS_ERR(bh)) return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { - extend_credit_for_blkdel(handle, inode); + err = ext4_journal_ensure_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(sb, 1)); + if (err < 0) { + put_bh(bh); + return err; + } ext4_free_blocks(handle, inode, NULL, le32_to_cpu(tmp_idata[i]), 1, EXT4_FREE_BLOCKS_METADATA | @@ -239,7 +207,10 @@ static int free_dind_blocks(handle_t *handle, } } put_bh(bh); - extend_credit_for_blkdel(handle, inode); + err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(sb, 1)); + if (err < 0) + return err; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -270,7 +241,10 @@ static int free_tind_blocks(handle_t *handle, } } put_bh(bh); - extend_credit_for_blkdel(handle, inode); + retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (retval < 0) + return retval; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -283,7 +257,11 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) /* ei->i_data[EXT4_IND_BLOCK] */ if (i_data[0]) { - extend_credit_for_blkdel(handle, inode); + retval = ext4_journal_ensure_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (retval < 0) + return retval; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data[0]), 1, EXT4_FREE_BLOCKS_METADATA | @@ -318,12 +296,9 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * One credit accounted for writing the * i_data field of the original inode */ - retval = ext4_journal_extend(handle, 1); - if (retval) { - retval = ext4_journal_restart(handle, 1); - if (retval) - goto err_out; - } + retval = ext4_journal_ensure_credits(handle, 1, 0); + if (retval < 0) + goto err_out; i_data[0] = ei->i_data[EXT4_IND_BLOCK]; i_data[1] = ei->i_data[EXT4_DIND_BLOCK]; @@ -391,15 +366,20 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, ix = EXT_FIRST_INDEX(eh); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { retval = free_ext_idx(handle, inode, ix); - if (retval) - break; + if (retval) { + put_bh(bh); + return retval; + } } } put_bh(bh); - extend_credit_for_blkdel(handle, inode); + retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (retval < 0) + return retval; ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - return retval; + return 0; } /* @@ -574,9 +554,9 @@ err_out: } /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ - if (ext4_journal_extend(handle, 1) != 0) - ext4_journal_restart(handle, 1); - + retval = ext4_journal_ensure_credits(handle, 1, 0); + if (retval < 0) + goto out_stop; /* * Mark the tmp_inode as of size zero */ @@ -594,6 +574,7 @@ err_out: /* Reset the extent details */ ext4_ext_tree_init(handle, tmp_inode); +out_stop: ext4_journal_stop(handle); out: unlock_new_inode(tmp_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a427d2031a8d..a856997d87b5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2547,18 +2547,29 @@ static void ext4_dec_count(handle_t *handle, struct inode *inode) } +/* + * Add non-directory inode to a directory. On success, the inode reference is + * consumed by dentry is instantiation. This is also indicated by clearing of + * *inodep pointer. On failure, the caller is responsible for dropping the + * inode reference in the safe context. + */ static int ext4_add_nondir(handle_t *handle, - struct dentry *dentry, struct inode *inode) + struct dentry *dentry, struct inode **inodep) { + struct inode *dir = d_inode(dentry->d_parent); + struct inode *inode = *inodep; int err = ext4_add_entry(handle, dentry, inode); if (!err) { ext4_mark_inode_dirty(handle, inode); + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); d_instantiate_new(dentry, inode); + *inodep = NULL; return 0; } drop_nlink(inode); + ext4_orphan_add(handle, inode); unlock_new_inode(inode); - iput(inode); return err; } @@ -2592,12 +2603,12 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - err = ext4_add_nondir(handle, dentry, inode); - if (!err && IS_DIRSYNC(dir)) - ext4_handle_sync(handle); + err = ext4_add_nondir(handle, dentry, &inode); } if (handle) ext4_journal_stop(handle); + if (!IS_ERR_OR_NULL(inode)) + iput(inode); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -2624,12 +2635,12 @@ retry: if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ext4_special_inode_operations; - err = ext4_add_nondir(handle, dentry, inode); - if (!err && IS_DIRSYNC(dir)) - ext4_handle_sync(handle); + err = ext4_add_nondir(handle, dentry, &inode); } if (handle) ext4_journal_stop(handle); + if (!IS_ERR_OR_NULL(inode)) + iput(inode); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -2779,10 +2790,12 @@ retry: if (err) { out_clear_inode: clear_nlink(inode); + ext4_orphan_add(handle, inode); unlock_new_inode(inode); ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); iput(inode); - goto out_stop; + goto out_retry; } ext4_inc_count(handle, dir); ext4_update_dx_flag(dir); @@ -2796,6 +2809,7 @@ out_clear_inode: out_stop: if (handle) ext4_journal_stop(handle); +out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -3182,18 +3196,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - if (inode->i_nlink == 0) { - ext4_warning_inode(inode, "Deleting file '%.*s' with no links", - dentry->d_name.len, dentry->d_name.name); - set_nlink(inode, 1); - } retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_unlink; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); - drop_nlink(inode); + if (inode->i_nlink == 0) + ext4_warning_inode(inode, "Deleting file '%.*s' with no links", + dentry->d_name.len, dentry->d_name.name); + else + drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); inode->i_ctime = current_time(inode); @@ -3328,12 +3341,11 @@ static int ext4_symlink(struct inode *dir, inode->i_size = disk_link.len - 1; } EXT4_I(inode)->i_disksize = inode->i_size; - err = ext4_add_nondir(handle, dentry, inode); - if (!err && IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - + err = ext4_add_nondir(handle, dentry, &inode); if (handle) ext4_journal_stop(handle); + if (inode) + iput(inode); goto out_free_encrypted_link; err_drop_inode: diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 12ceadef32c5..24aeedb8fc75 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -31,18 +31,56 @@ #include "acl.h" static struct kmem_cache *io_end_cachep; +static struct kmem_cache *io_end_vec_cachep; int __init ext4_init_pageio(void) { io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); if (io_end_cachep == NULL) return -ENOMEM; + + io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0); + if (io_end_vec_cachep == NULL) { + kmem_cache_destroy(io_end_cachep); + return -ENOMEM; + } return 0; } void ext4_exit_pageio(void) { kmem_cache_destroy(io_end_cachep); + kmem_cache_destroy(io_end_vec_cachep); +} + +struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end) +{ + struct ext4_io_end_vec *io_end_vec; + + io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS); + if (!io_end_vec) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&io_end_vec->list); + list_add_tail(&io_end_vec->list, &io_end->list_vec); + return io_end_vec; +} + +static void ext4_free_io_end_vec(ext4_io_end_t *io_end) +{ + struct ext4_io_end_vec *io_end_vec, *tmp; + + if (list_empty(&io_end->list_vec)) + return; + list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) { + list_del(&io_end_vec->list); + kmem_cache_free(io_end_vec_cachep, io_end_vec); + } +} + +struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end) +{ + BUG_ON(list_empty(&io_end->list_vec)); + return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list); } /* @@ -125,6 +163,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) ext4_finish_bio(bio); bio_put(bio); } + ext4_free_io_end_vec(io_end); kmem_cache_free(io_end_cachep, io_end); } @@ -136,29 +175,26 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) * cannot get to ext4_ext_truncate() before all IOs overlapping that range are * completed (happens from ext4_free_ioend()). */ -static int ext4_end_io(ext4_io_end_t *io) +static int ext4_end_io_end(ext4_io_end_t *io_end) { - struct inode *inode = io->inode; - loff_t offset = io->offset; - ssize_t size = io->size; - handle_t *handle = io->handle; + struct inode *inode = io_end->inode; + handle_t *handle = io_end->handle; int ret = 0; - ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," + ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", - io, inode->i_ino, io->list.next, io->list.prev); + io_end, inode->i_ino, io_end->list.next, io_end->list.prev); - io->handle = NULL; /* Following call will use up the handle */ - ret = ext4_convert_unwritten_extents(handle, inode, offset, size); + io_end->handle = NULL; /* Following call will use up the handle */ + ret = ext4_convert_unwritten_io_end_vec(handle, io_end); if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " - "(inode %lu, offset %llu, size %zd, error %d)", - inode->i_ino, offset, size, ret); + "(inode %lu, error %d)", inode->i_ino, ret); } - ext4_clear_io_unwritten_flag(io); - ext4_release_io_end(io); + ext4_clear_io_unwritten_flag(io_end); + ext4_release_io_end(io_end); return ret; } @@ -166,21 +202,21 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) { #ifdef EXT4FS_DEBUG struct list_head *cur, *before, *after; - ext4_io_end_t *io, *io0, *io1; + ext4_io_end_t *io_end, *io_end0, *io_end1; if (list_empty(head)) return; ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); - list_for_each_entry(io, head, list) { - cur = &io->list; + list_for_each_entry(io_end, head, list) { + cur = &io_end->list; before = cur->prev; - io0 = container_of(before, ext4_io_end_t, list); + io_end0 = container_of(before, ext4_io_end_t, list); after = cur->next; - io1 = container_of(after, ext4_io_end_t, list); + io_end1 = container_of(after, ext4_io_end_t, list); ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", - io, inode->i_ino, io0, io1); + io_end, inode->i_ino, io_end0, io_end1); } #endif } @@ -207,7 +243,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) static int ext4_do_flush_completed_IO(struct inode *inode, struct list_head *head) { - ext4_io_end_t *io; + ext4_io_end_t *io_end; struct list_head unwritten; unsigned long flags; struct ext4_inode_info *ei = EXT4_I(inode); @@ -219,11 +255,11 @@ static int ext4_do_flush_completed_IO(struct inode *inode, spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); while (!list_empty(&unwritten)) { - io = list_entry(unwritten.next, ext4_io_end_t, list); - BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); - list_del_init(&io->list); + io_end = list_entry(unwritten.next, ext4_io_end_t, list); + BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + list_del_init(&io_end->list); - err = ext4_end_io(io); + err = ext4_end_io_end(io_end); if (unlikely(!ret && err)) ret = err; } @@ -242,19 +278,22 @@ void ext4_end_io_rsv_work(struct work_struct *work) ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { - ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); - if (io) { - io->inode = inode; - INIT_LIST_HEAD(&io->list); - atomic_set(&io->count, 1); + ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags); + + if (io_end) { + io_end->inode = inode; + INIT_LIST_HEAD(&io_end->list); + INIT_LIST_HEAD(&io_end->list_vec); + atomic_set(&io_end->count, 1); } - return io; + return io_end; } void ext4_put_io_end_defer(ext4_io_end_t *io_end) { if (atomic_dec_and_test(&io_end->count)) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || + list_empty(&io_end->list_vec)) { ext4_release_io_end(io_end); return; } @@ -268,9 +307,8 @@ int ext4_put_io_end(ext4_io_end_t *io_end) if (atomic_dec_and_test(&io_end->count)) { if (io_end->flag & EXT4_IO_END_UNWRITTEN) { - err = ext4_convert_unwritten_extents(io_end->handle, - io_end->inode, io_end->offset, - io_end->size); + err = ext4_convert_unwritten_io_end_vec(io_end->handle, + io_end); io_end->handle = NULL; ext4_clear_io_unwritten_flag(io_end); } @@ -307,10 +345,8 @@ static void ext4_end_bio(struct bio *bio) struct inode *inode = io_end->inode; ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " - "(offset %llu size %ld starting block %llu)", + "starting block %llu)", bio->bi_status, inode->i_ino, - (unsigned long long) io_end->offset, - (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); mapping_set_error(inode->i_mapping, @@ -358,14 +394,16 @@ void ext4_io_submit_init(struct ext4_io_submit *io, io->io_end = NULL; } -static int io_submit_init_bio(struct ext4_io_submit *io, - struct buffer_head *bh) +static void io_submit_init_bio(struct ext4_io_submit *io, + struct buffer_head *bh) { struct bio *bio; + /* + * bio_alloc will _always_ be able to allocate a bio if + * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset(). + */ bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - if (!bio) - return -ENOMEM; bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; @@ -373,13 +411,12 @@ static int io_submit_init_bio(struct ext4_io_submit *io, io->io_bio = bio; io->io_next_block = bh->b_blocknr; wbc_init_bio(io->io_wbc, bio); - return 0; } -static int io_submit_add_bh(struct ext4_io_submit *io, - struct inode *inode, - struct page *page, - struct buffer_head *bh) +static void io_submit_add_bh(struct ext4_io_submit *io, + struct inode *inode, + struct page *page, + struct buffer_head *bh) { int ret; @@ -388,9 +425,7 @@ submit_and_retry: ext4_io_submit(io); } if (io->io_bio == NULL) { - ret = io_submit_init_bio(io, bh); - if (ret) - return ret; + io_submit_init_bio(io, bh); io->io_bio->bi_write_hint = inode->i_write_hint; } ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); @@ -398,7 +433,6 @@ submit_and_retry: goto submit_and_retry; wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size); io->io_next_block++; - return 0; } int ext4_bio_write_page(struct ext4_io_submit *io, @@ -491,8 +525,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; } - bounce_page = NULL; - goto out; + + printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); + redirty_page_for_writepage(wbc, page); + do { + clear_buffer_async_write(bh); + bh = bh->b_this_page; + } while (bh != head); + goto unlock; } } @@ -500,30 +540,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io, do { if (!buffer_async_write(bh)) continue; - ret = io_submit_add_bh(io, inode, bounce_page ?: page, bh); - if (ret) { - /* - * We only get here on ENOMEM. Not much else - * we can do but mark the page as dirty, and - * better luck next time. - */ - break; - } + io_submit_add_bh(io, inode, + bounce_page ? bounce_page : page, bh); nr_submitted++; clear_buffer_dirty(bh); } while ((bh = bh->b_this_page) != head); - /* Error stopped previous loop? Clean up buffers... */ - if (ret) { - out: - fscrypt_free_bounce_page(bounce_page); - printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); - redirty_page_for_writepage(wbc, page); - do { - clear_buffer_async_write(bh); - bh = bh->b_this_page; - } while (bh != head); - } +unlock: unlock_page(page); /* Nothing submitted - we have to end page writeback */ if (!nr_submitted) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index a30b203fa461..fef7755300c3 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -360,10 +360,12 @@ int ext4_mpage_readpages(struct address_space *mapping, if (bio == NULL) { struct bio_post_read_ctx *ctx; + /* + * bio_alloc will _always_ be able to allocate a bio if + * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). + */ bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) - goto set_error_page; ctx = get_bio_post_read_ctx(inode, bio, page->index); if (IS_ERR(ctx)) { bio_put(bio); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c0e9aef376a7..a8c0f2b5b6e1 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -388,28 +388,10 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, return bh; } -/* - * If we have fewer than thresh credits, extend by EXT4_MAX_TRANS_DATA. - * If that fails, restart the transaction & regain write access for the - * buffer head which is used for block_bitmap modifications. - */ -static int extend_or_restart_transaction(handle_t *handle, int thresh) +static int ext4_resize_ensure_credits_batch(handle_t *handle, int credits) { - int err; - - if (ext4_handle_has_enough_credits(handle, thresh)) - return 0; - - err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); - if (err < 0) - return err; - if (err) { - err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA); - if (err) - return err; - } - - return 0; + return ext4_journal_ensure_credits_fn(handle, credits, + EXT4_MAX_TRANS_DATA, 0, 0); } /* @@ -451,8 +433,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, continue; } - err = extend_or_restart_transaction(handle, 1); - if (err) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) return err; bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); @@ -544,8 +526,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb, struct buffer_head *gdb; ext4_debug("update backup group %#04llx\n", block); - err = extend_or_restart_transaction(handle, 1); - if (err) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) goto out; gdb = sb_getblk(sb, block); @@ -602,8 +584,8 @@ handle_bb: /* Initialize block bitmap of the @group */ block = group_data[i].block_bitmap; - err = extend_or_restart_transaction(handle, 1); - if (err) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) goto out; bh = bclean(handle, sb, block); @@ -631,8 +613,8 @@ handle_ib: /* Initialize inode bitmap of the @group */ block = group_data[i].inode_bitmap; - err = extend_or_restart_transaction(handle, 1); - if (err) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) goto out; /* Mark unused entries in inode bitmap used */ bh = bclean(handle, sb, block); @@ -1109,10 +1091,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, ext4_fsblk_t backup_block; /* Out of journal space, and can't get more - abort - so sad */ - if (ext4_handle_valid(handle) && - handle->h_buffer_credits == 0 && - ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && - (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) break; if (meta_bg == 0) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b3cbf8622eab..c54019979e80 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1172,9 +1172,9 @@ void ext4_clear_inode(struct inode *inode) { invalidate_inode_buffers(inode); clear_inode(inode); - dquot_drop(inode); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); + dquot_drop(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -1388,7 +1388,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); static int ext4_enable_quotas(struct super_block *sb); -static int ext4_get_next_id(struct super_block *sb, struct kqid *qid); static struct dquot **ext4_get_dquots(struct inode *inode) { @@ -1406,7 +1405,7 @@ static const struct dquot_operations ext4_quota_operations = { .destroy_dquot = dquot_destroy, .get_projid = ext4_get_projid, .get_inode_usage = ext4_get_inode_usage, - .get_next_id = ext4_get_next_id, + .get_next_id = dquot_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { @@ -2065,7 +2064,7 @@ static int parse_options(char *options, struct super_block *sb, unsigned int *journal_ioprio, int is_remount) { - struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb); char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name; substring_t args[MAX_OPT_ARGS]; int token; @@ -2119,16 +2118,6 @@ static int parse_options(char *options, struct super_block *sb, } } #endif - if (test_opt(sb, DIOREAD_NOLOCK)) { - int blocksize = - BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - - if (blocksize < PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "dioread_nolock if block size != PAGE_SIZE"); - return 0; - } - } return 1; } @@ -3569,12 +3558,15 @@ static void ext4_clamp_want_extra_isize(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; + unsigned def_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; - /* determine the minimum size of new large inodes, if present */ - if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && - sbi->s_want_extra_isize == 0) { - sbi->s_want_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; + if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) { + sbi->s_want_extra_isize = 0; + return; + } + if (sbi->s_want_extra_isize < 4) { + sbi->s_want_extra_isize = def_extra_isize; if (ext4_has_feature_extra_isize(sb)) { if (sbi->s_want_extra_isize < le16_to_cpu(es->s_want_extra_isize)) @@ -3587,10 +3579,10 @@ static void ext4_clamp_want_extra_isize(struct super_block *sb) } } /* Check if enough inode space is available */ - if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > - sbi->s_inode_size) { - sbi->s_want_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; + if ((sbi->s_want_extra_isize > sbi->s_inode_size) || + (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > + sbi->s_inode_size)) { + sbi->s_want_extra_isize = def_extra_isize; ext4_msg(sb, KERN_INFO, "required extra inode space not available"); } @@ -4453,13 +4445,6 @@ no_journal: } } - if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && - (blocksize != PAGE_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Unsupported blocksize for fs encryption"); - goto failed_mount_wq; - } - if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) { ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity"); goto failed_mount_wq; @@ -6033,18 +6018,6 @@ out: } return len; } - -static int ext4_get_next_id(struct super_block *sb, struct kqid *qid) -{ - const struct quota_format_ops *ops; - - if (!sb_has_quota_loaded(sb, qid->type)) - return -ESRCH; - ops = sb_dqopt(sb)->ops[qid->type]; - if (!ops || !ops->get_next_id) - return -ENOSYS; - return dquot_get_next_id(sb, qid); -} #endif static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 491f9ee4040e..8966a5439a22 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -967,55 +967,6 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, return credits; } -static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode, - int credits, struct buffer_head *bh, - bool dirty, bool block_csum) -{ - int error; - - if (!ext4_handle_valid(handle)) - return 0; - - if (handle->h_buffer_credits >= credits) - return 0; - - error = ext4_journal_extend(handle, credits - handle->h_buffer_credits); - if (!error) - return 0; - if (error < 0) { - ext4_warning(inode->i_sb, "Extend journal (error %d)", error); - return error; - } - - if (bh && dirty) { - if (block_csum) - ext4_xattr_block_csum_set(inode, bh); - error = ext4_handle_dirty_metadata(handle, NULL, bh); - if (error) { - ext4_warning(inode->i_sb, "Handle metadata (error %d)", - error); - return error; - } - } - - error = ext4_journal_restart(handle, credits); - if (error) { - ext4_warning(inode->i_sb, "Restart journal (error %d)", error); - return error; - } - - if (bh) { - error = ext4_journal_get_write_access(handle, bh); - if (error) { - ext4_warning(inode->i_sb, - "Get write access failed (error %d)", - error); - return error; - } - } - return 0; -} - static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, int ref_change) { @@ -1149,6 +1100,24 @@ cleanup: return saved_err; } +static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode, + struct buffer_head *bh, bool block_csum, bool dirty) +{ + int error; + + if (bh && dirty) { + if (block_csum) + ext4_xattr_block_csum_set(inode, bh); + error = ext4_handle_dirty_metadata(handle, NULL, bh); + if (error) { + ext4_warning(inode->i_sb, "Handle metadata (error %d)", + error); + return error; + } + } + return 0; +} + static void ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, struct buffer_head *bh, @@ -1185,13 +1154,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, continue; } - err = ext4_xattr_ensure_credits(handle, parent, credits, bh, - dirty, block_csum); - if (err) { + err = ext4_journal_ensure_credits_fn(handle, credits, credits, + ext4_free_metadata_revoke_credits(parent->i_sb, 1), + ext4_xattr_restart_fn(handle, parent, bh, block_csum, + dirty)); + if (err < 0) { ext4_warning_inode(ea_inode, "Ensure credits err=%d", err); continue; } + if (err > 0) { + err = ext4_journal_get_write_access(handle, bh); + if (err) { + ext4_warning_inode(ea_inode, + "Re-get write access err=%d", + err); + continue; + } + } err = ext4_xattr_inode_dec_ref(handle, ea_inode); if (err) { @@ -2335,7 +2315,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, flags & XATTR_CREATE); brelse(bh); - if (!ext4_handle_has_enough_credits(handle, credits)) { + if (jbd2_handle_buffer_credits(handle) < credits) { error = -ENOSPC; goto cleanup; } @@ -2862,11 +2842,9 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct inode *ea_inode; int error; - error = ext4_xattr_ensure_credits(handle, inode, extra_credits, - NULL /* bh */, - false /* dirty */, - false /* block_csum */); - if (error) { + error = ext4_journal_ensure_credits(handle, extra_credits, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (error < 0) { EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error); goto cleanup; } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index a1909066bde6..8fff6677a5da 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -110,7 +110,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) int nblocks, space_left; /* assert_spin_locked(&journal->j_state_lock); */ - nblocks = jbd2_space_needed(journal); + nblocks = journal->j_max_transaction_buffers; while (jbd2_log_space_left(journal) < nblocks) { write_unlock(&journal->j_state_lock); mutex_lock_io(&journal->j_checkpoint_mutex); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 132fb92098c7..7f0b362b3842 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -482,10 +482,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (jh->b_committed_data) { struct buffer_head *bh = jh2bh(jh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); jbd2_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); } jbd2_journal_refile_buffer(journal, jh); } @@ -560,8 +560,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.run.rs_logging = jiffies; stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, stats.run.rs_logging); - stats.run.rs_blocks = - atomic_read(&commit_transaction->t_outstanding_credits); + stats.run.rs_blocks = commit_transaction->t_nr_buffers; stats.run.rs_blocks_logged = 0; J_ASSERT(commit_transaction->t_nr_buffers <= @@ -642,8 +641,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) /* * start_this_handle() uses t_outstanding_credits to determine - * the free space in the log, but this counter is changed - * by jbd2_journal_next_log_block() also. + * the free space in the log. */ atomic_dec(&commit_transaction->t_outstanding_credits); @@ -727,7 +725,6 @@ start_journal_io: submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); } cond_resched(); - stats.run.rs_blocks_logged += bufs; /* Force a new descriptor to be generated next time round the loop. */ @@ -814,6 +811,7 @@ start_journal_io: if (unlikely(!buffer_uptodate(bh))) err = -EIO; jbd2_unfile_log_bh(bh); + stats.run.rs_blocks_logged++; /* * The list contains temporary buffer heads created by @@ -859,6 +857,7 @@ start_journal_io: BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); clear_buffer_jwrite(bh); jbd2_unfile_log_bh(bh); + stats.run.rs_blocks_logged++; __brelse(bh); /* One for getblk */ /* AKPM: bforget here */ } @@ -880,6 +879,7 @@ start_journal_io: } if (cbh) err = journal_wait_on_commit_record(journal, cbh); + stats.run.rs_blocks_logged++; if (jbd2_has_feature_async_commit(journal) && journal->j_flags & JBD2_BARRIER) { blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL); @@ -888,6 +888,9 @@ start_journal_io: if (err) jbd2_journal_abort(journal, err); + WARN_ON_ONCE( + atomic_read(&commit_transaction->t_outstanding_credits) < 0); + /* * Now disk caches for filesystem device are flushed so we are safe to * erase checkpointed transactions from the log by updating journal @@ -918,6 +921,7 @@ restart_loop: transaction_t *cp_transaction; struct buffer_head *bh; int try_to_free = 0; + bool drop_ref; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); @@ -927,7 +931,7 @@ restart_loop: * done with it. */ get_bh(bh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); /* @@ -1022,8 +1026,10 @@ restart_loop: try_to_free = 1; } JBUFFER_TRACE(jh, "refile or unfile buffer"); - __jbd2_journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); + drop_ref = __jbd2_journal_refile_buffer(jh); + spin_unlock(&jh->b_state_lock); + if (drop_ref) + jbd2_journal_put_journal_head(jh); if (try_to_free) release_buffer_page(bh); /* Drops bh reference */ else diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 1c58859aa592..5e408ee24a1a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -363,7 +363,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, /* keep subsequent assertions sane */ atomic_set(&new_bh->b_count, 1); - jbd_lock_bh_state(bh_in); + spin_lock(&jh_in->b_state_lock); repeat: /* * If a new transaction has already done a buffer copy-out, then @@ -405,13 +405,13 @@ repeat: if (need_copy_out && !done_copy_out) { char *tmp; - jbd_unlock_bh_state(bh_in); + spin_unlock(&jh_in->b_state_lock); tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); if (!tmp) { brelse(new_bh); return -ENOMEM; } - jbd_lock_bh_state(bh_in); + spin_lock(&jh_in->b_state_lock); if (jh_in->b_frozen_data) { jbd2_free(tmp, bh_in->b_size); goto repeat; @@ -464,7 +464,7 @@ repeat: __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); spin_unlock(&journal->j_list_lock); set_buffer_shadow(bh_in); - jbd_unlock_bh_state(bh_in); + spin_unlock(&jh_in->b_state_lock); return do_escape | (done_copy_out << 1); } @@ -840,6 +840,7 @@ jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type) bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); if (!bh) return NULL; + atomic_dec(&transaction->t_outstanding_credits); lock_buffer(bh); memset(bh->b_data, 0, journal->j_blocksize); header = (journal_header_t *)bh->b_data; @@ -1098,6 +1099,16 @@ static void jbd2_stats_proc_exit(journal_t *journal) remove_proc_entry(journal->j_devname, proc_jbd2_stats); } +/* Minimum size of descriptor tag */ +static int jbd2_min_tag_size(void) +{ + /* + * Tag with 32-bit block numbers does not use last four bytes of the + * structure + */ + return sizeof(journal_block_tag_t) - 4; +} + /* * Management for journal control blocks: functions to create and * destroy journal_t structures, and to initialise and read existing @@ -1156,7 +1167,8 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_fs_dev = fs_dev; journal->j_blk_offset = start; journal->j_maxlen = len; - n = journal->j_blocksize / sizeof(journal_block_tag_t); + /* We need enough buffers to write out full descriptor block. */ + n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *), GFP_KERNEL); @@ -1488,6 +1500,21 @@ void jbd2_journal_update_sb_errno(journal_t *journal) } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); +static int journal_revoke_records_per_block(journal_t *journal) +{ + int record_size; + int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t); + + if (jbd2_has_feature_64bit(journal)) + record_size = 8; + else + record_size = 4; + + if (jbd2_journal_has_csum_v2or3(journal)) + space -= sizeof(struct jbd2_journal_block_tail); + return space / record_size; +} + /* * Read the superblock for a given journal, performing initial * validation of the format. @@ -1596,6 +1623,8 @@ static int journal_get_superblock(journal_t *journal) sizeof(sb->s_uuid)); } + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); set_buffer_verified(bh); return 0; @@ -1916,6 +1945,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb->s_feature_ro_compat |= cpu_to_be32(ro); sb->s_feature_incompat |= cpu_to_be32(incompat); unlock_buffer(journal->j_sb_buffer); + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); return 1; #undef COMPAT_FEATURE_ON @@ -1946,6 +1977,8 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, sb->s_feature_compat &= ~cpu_to_be32(compat); sb->s_feature_ro_compat &= ~cpu_to_be32(ro); sb->s_feature_incompat &= ~cpu_to_be32(incompat); + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); } EXPORT_SYMBOL(jbd2_journal_clear_features); @@ -2410,6 +2443,8 @@ static struct journal_head *journal_alloc_journal_head(void) ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS | __GFP_NOFAIL); } + if (ret) + spin_lock_init(&ret->b_state_lock); return ret; } @@ -2529,17 +2564,23 @@ static void __journal_remove_journal_head(struct buffer_head *bh) J_ASSERT_BH(bh, buffer_jbd(bh)); J_ASSERT_BH(bh, jh2bh(jh) == bh); BUFFER_TRACE(bh, "remove journal_head"); - if (jh->b_frozen_data) { - printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); - jbd2_free(jh->b_frozen_data, bh->b_size); - } - if (jh->b_committed_data) { - printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); - jbd2_free(jh->b_committed_data, bh->b_size); - } + + /* Unlink before dropping the lock */ bh->b_private = NULL; jh->b_bh = NULL; /* debug, really */ clear_buffer_jbd(bh); +} + +static void journal_release_journal_head(struct journal_head *jh, size_t b_size) +{ + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); + jbd2_free(jh->b_frozen_data, b_size); + } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); + jbd2_free(jh->b_committed_data, b_size); + } journal_free_journal_head(jh); } @@ -2557,9 +2598,11 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) if (!jh->b_jcount) { __journal_remove_journal_head(bh); jbd_unlock_bh_journal_head(bh); + journal_release_journal_head(jh, bh->b_size); __brelse(bh); - } else + } else { jbd_unlock_bh_journal_head(bh); + } } /* diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index f08073d7bbf5..fa608788b93d 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -371,6 +371,11 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, } #endif + if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) { + if (!bh_in) + brelse(bh); + return -EIO; + } /* We really ought not ever to revoke twice in a row without first having the revoke cancelled: it's illegal to free a block twice without allocating it in between! */ @@ -391,6 +396,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, __brelse(bh); } } + handle->h_revoke_credits--; jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in); err = insert_revoke_hash(journal, blocknr, diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index b25ebdcabfa3..27b9f9dee434 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -62,6 +62,28 @@ void jbd2_journal_free_transaction(transaction_t *transaction) kmem_cache_free(transaction_cache, transaction); } +/* + * Base amount of descriptor blocks we reserve for each transaction. + */ +static int jbd2_descriptor_blocks_per_trans(journal_t *journal) +{ + int tag_space = journal->j_blocksize - sizeof(journal_header_t); + int tags_per_block; + + /* Subtract UUID */ + tag_space -= 16; + if (jbd2_journal_has_csum_v2or3(journal)) + tag_space -= sizeof(struct jbd2_journal_block_tail); + /* Commit code leaves a slack space of 16 bytes at the end of block */ + tags_per_block = (tag_space - 16) / journal_tag_bytes(journal); + /* + * Revoke descriptors are accounted separately so we need to reserve + * space for commit block and normal transaction descriptor blocks. + */ + return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers, + tags_per_block); +} + /* * jbd2_get_transaction: obtain a new transaction_t object. * @@ -88,7 +110,9 @@ static void jbd2_get_transaction(journal_t *journal, spin_lock_init(&transaction->t_handle_lock); atomic_set(&transaction->t_updates, 0); atomic_set(&transaction->t_outstanding_credits, + jbd2_descriptor_blocks_per_trans(journal) + atomic_read(&journal->j_reserved_credits)); + atomic_set(&transaction->t_outstanding_revokes, 0); atomic_set(&transaction->t_handle_count, 0); INIT_LIST_HEAD(&transaction->t_inode_list); INIT_LIST_HEAD(&transaction->t_private_list); @@ -258,12 +282,13 @@ static int add_transaction_credits(journal_t *journal, int blocks, * *before* starting to dirty potentially checkpointed buffers * in the new transaction. */ - if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) { + if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) { atomic_sub(total, &t->t_outstanding_credits); read_unlock(&journal->j_state_lock); jbd2_might_wait_for_commit(journal); write_lock(&journal->j_state_lock); - if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) + if (jbd2_log_space_left(journal) < + journal->j_max_transaction_buffers) __jbd2_log_wait_for_space(journal); write_unlock(&journal->j_state_lock); return 1; @@ -299,12 +324,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle, gfp_t gfp_mask) { transaction_t *transaction, *new_transaction = NULL; - int blocks = handle->h_buffer_credits; + int blocks = handle->h_total_credits; int rsv_blocks = 0; unsigned long ts = jiffies; if (handle->h_rsv_handle) - rsv_blocks = handle->h_rsv_handle->h_buffer_credits; + rsv_blocks = handle->h_rsv_handle->h_total_credits; /* * Limit the number of reserved credits to 1/2 of maximum transaction @@ -405,6 +430,7 @@ repeat: update_t_max_wait(transaction, ts); handle->h_transaction = transaction; handle->h_requested_credits = blocks; + handle->h_revoke_credits_requested = handle->h_revoke_credits; handle->h_start_jiffies = jiffies; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); @@ -431,15 +457,15 @@ static handle_t *new_handle(int nblocks) handle_t *handle = jbd2_alloc_handle(GFP_NOFS); if (!handle) return NULL; - handle->h_buffer_credits = nblocks; + handle->h_total_credits = nblocks; handle->h_ref = 1; return handle; } handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, - gfp_t gfp_mask, unsigned int type, - unsigned int line_no) + int revoke_records, gfp_t gfp_mask, + unsigned int type, unsigned int line_no) { handle_t *handle = journal_current_handle(); int err; @@ -453,6 +479,8 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, return handle; } + nblocks += DIV_ROUND_UP(revoke_records, + journal->j_revoke_records_per_block); handle = new_handle(nblocks); if (!handle) return ERR_PTR(-ENOMEM); @@ -468,6 +496,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, rsv_handle->h_journal = journal; handle->h_rsv_handle = rsv_handle; } + handle->h_revoke_credits = revoke_records; err = start_this_handle(journal, handle, gfp_mask); if (err < 0) { @@ -508,16 +537,21 @@ EXPORT_SYMBOL(jbd2__journal_start); */ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) { - return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0); + return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0); } EXPORT_SYMBOL(jbd2_journal_start); -void jbd2_journal_free_reserved(handle_t *handle) +static void __jbd2_journal_unreserve_handle(handle_t *handle) { journal_t *journal = handle->h_journal; WARN_ON(!handle->h_reserved); - sub_reserved_credits(journal, handle->h_buffer_credits); + sub_reserved_credits(journal, handle->h_total_credits); +} + +void jbd2_journal_free_reserved(handle_t *handle) +{ + __jbd2_journal_unreserve_handle(handle); jbd2_free_handle(handle); } EXPORT_SYMBOL(jbd2_journal_free_reserved); @@ -571,7 +605,7 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, handle->h_line_no = line_no; trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, handle->h_transaction->t_tid, type, - line_no, handle->h_buffer_credits); + line_no, handle->h_total_credits); return 0; } EXPORT_SYMBOL(jbd2_journal_start_reserved); @@ -580,6 +614,7 @@ EXPORT_SYMBOL(jbd2_journal_start_reserved); * int jbd2_journal_extend() - extend buffer credits. * @handle: handle to 'extend' * @nblocks: nr blocks to try to extend by. + * @revoke_records: number of revoke records to try to extend by. * * Some transactions, such as large extends and truncates, can be done * atomically all at once or in several stages. The operation requests @@ -596,7 +631,7 @@ EXPORT_SYMBOL(jbd2_journal_start_reserved); * return code < 0 implies an error * return code > 0 implies normal transaction-full status. */ -int jbd2_journal_extend(handle_t *handle, int nblocks) +int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records) { transaction_t *transaction = handle->h_transaction; journal_t *journal; @@ -618,6 +653,12 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) goto error_out; } + nblocks += DIV_ROUND_UP( + handle->h_revoke_credits_requested + revoke_records, + journal->j_revoke_records_per_block) - + DIV_ROUND_UP( + handle->h_revoke_credits_requested, + journal->j_revoke_records_per_block); spin_lock(&transaction->t_handle_lock); wanted = atomic_add_return(nblocks, &transaction->t_outstanding_credits); @@ -629,22 +670,16 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) goto unlock; } - if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) > - jbd2_log_space_left(journal)) { - jbd_debug(3, "denied handle %p %d blocks: " - "insufficient log space\n", handle, nblocks); - atomic_sub(nblocks, &transaction->t_outstanding_credits); - goto unlock; - } - trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, transaction->t_tid, handle->h_type, handle->h_line_no, - handle->h_buffer_credits, + handle->h_total_credits, nblocks); - handle->h_buffer_credits += nblocks; + handle->h_total_credits += nblocks; handle->h_requested_credits += nblocks; + handle->h_revoke_credits += revoke_records; + handle->h_revoke_credits_requested += revoke_records; result = 0; jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); @@ -655,11 +690,55 @@ error_out: return result; } +static void stop_this_handle(handle_t *handle) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int revokes; + + J_ASSERT(journal_current_handle() == handle); + J_ASSERT(atomic_read(&transaction->t_updates) > 0); + current->journal_info = NULL; + /* + * Subtract necessary revoke descriptor blocks from handle credits. We + * take care to account only for revoke descriptor blocks the + * transaction will really need as large sequences of transactions with + * small numbers of revokes are relatively common. + */ + revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits; + if (revokes) { + int t_revokes, revoke_descriptors; + int rr_per_blk = journal->j_revoke_records_per_block; + + WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk) + > handle->h_total_credits); + t_revokes = atomic_add_return(revokes, + &transaction->t_outstanding_revokes); + revoke_descriptors = + DIV_ROUND_UP(t_revokes, rr_per_blk) - + DIV_ROUND_UP(t_revokes - revokes, rr_per_blk); + handle->h_total_credits -= revoke_descriptors; + } + atomic_sub(handle->h_total_credits, + &transaction->t_outstanding_credits); + if (handle->h_rsv_handle) + __jbd2_journal_unreserve_handle(handle->h_rsv_handle); + if (atomic_dec_and_test(&transaction->t_updates)) + wake_up(&journal->j_wait_updates); + + rwsem_release(&journal->j_trans_commit_map, _THIS_IP_); + /* + * Scope of the GFP_NOFS context is over here and so we can restore the + * original alloc context. + */ + memalloc_nofs_restore(handle->saved_alloc_context); +} /** * int jbd2_journal_restart() - restart a handle . * @handle: handle to restart * @nblocks: nr credits requested + * @revoke_records: number of revoke record credits requested * @gfp_mask: memory allocation flags (for start_this_handle) * * Restart a handle for a multi-transaction filesystem @@ -672,56 +751,48 @@ error_out: * credits. We preserve reserved handle if there's any attached to the * passed in handle. */ -int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) +int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records, + gfp_t gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal; tid_t tid; - int need_to_start, ret; + int need_to_start; + int ret; /* If we've had an abort of any type, don't even think about * actually doing the restart! */ if (is_handle_aborted(handle)) return 0; journal = transaction->t_journal; + tid = transaction->t_tid; /* * First unlink the handle from its current transaction, and start the * commit on that. */ - J_ASSERT(atomic_read(&transaction->t_updates) > 0); - J_ASSERT(journal_current_handle() == handle); - - read_lock(&journal->j_state_lock); - spin_lock(&transaction->t_handle_lock); - atomic_sub(handle->h_buffer_credits, - &transaction->t_outstanding_credits); - if (handle->h_rsv_handle) { - sub_reserved_credits(journal, - handle->h_rsv_handle->h_buffer_credits); - } - if (atomic_dec_and_test(&transaction->t_updates)) - wake_up(&journal->j_wait_updates); - tid = transaction->t_tid; - spin_unlock(&transaction->t_handle_lock); - handle->h_transaction = NULL; - current->journal_info = NULL; - jbd_debug(2, "restarting handle %p\n", handle); + stop_this_handle(handle); + handle->h_transaction = NULL; + + /* + * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can + * get rid of pointless j_state_lock traffic like this. + */ + read_lock(&journal->j_state_lock); need_to_start = !tid_geq(journal->j_commit_request, tid); read_unlock(&journal->j_state_lock); if (need_to_start) jbd2_log_start_commit(journal, tid); - - rwsem_release(&journal->j_trans_commit_map, _THIS_IP_); - handle->h_buffer_credits = nblocks; - /* - * Restore the original nofs context because the journal restart - * is basically the same thing as journal stop and start. - * start_this_handle will start a new nofs context. - */ - memalloc_nofs_restore(handle->saved_alloc_context); + handle->h_total_credits = nblocks + + DIV_ROUND_UP(revoke_records, + journal->j_revoke_records_per_block); + handle->h_revoke_credits = revoke_records; ret = start_this_handle(journal, handle, gfp_mask); + trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev, + ret ? 0 : handle->h_transaction->t_tid, + handle->h_type, handle->h_line_no, + handle->h_total_credits); return ret; } EXPORT_SYMBOL(jbd2__journal_restart); @@ -729,7 +800,7 @@ EXPORT_SYMBOL(jbd2__journal_restart); int jbd2_journal_restart(handle_t *handle, int nblocks) { - return jbd2__journal_restart(handle, nblocks, GFP_NOFS); + return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS); } EXPORT_SYMBOL(jbd2_journal_restart); @@ -879,7 +950,7 @@ repeat: start_lock = jiffies; lock_buffer(bh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); /* If it takes too long to lock the buffer, trace it */ time_lock = jbd2_time_diff(start_lock, jiffies); @@ -929,7 +1000,7 @@ repeat: error = -EROFS; if (is_handle_aborted(handle)) { - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); goto out; } error = 0; @@ -993,7 +1064,7 @@ repeat: */ if (buffer_shadow(bh)) { JBUFFER_TRACE(jh, "on shadow: sleep"); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE); goto repeat; } @@ -1014,7 +1085,7 @@ repeat: JBUFFER_TRACE(jh, "generate frozen data"); if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS | __GFP_NOFAIL); goto repeat; @@ -1033,7 +1104,7 @@ attach_next: jh->b_next_transaction = transaction; done: - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); /* * If we are about to journal a buffer, then any revoke pending on it is @@ -1172,7 +1243,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) * that case: the transaction must have deleted the buffer for it to be * reused here. */ - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, (jh->b_transaction == transaction || jh->b_transaction == NULL || (jh->b_transaction == journal->j_committing_transaction && @@ -1207,7 +1278,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) jh->b_next_transaction = transaction; spin_unlock(&journal->j_list_lock); } - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); /* * akpm: I added this. ext3_alloc_branch can pick up new indirect @@ -1275,13 +1346,13 @@ repeat: committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS|__GFP_NOFAIL); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); if (!jh->b_committed_data) { /* Copy out the current buffer contents into the * preserved, committed copy. */ JBUFFER_TRACE(jh, "generate b_committed data"); if (!committed_data) { - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); goto repeat; } @@ -1289,7 +1360,7 @@ repeat: committed_data = NULL; memcpy(jh->b_committed_data, bh->b_data, bh->b_size); } - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); out: jbd2_journal_put_journal_head(jh); if (unlikely(committed_data)) @@ -1390,16 +1461,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction != transaction && jh->b_next_transaction != transaction) { - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, jh->b_transaction == transaction || jh->b_next_transaction == transaction); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); } if (jh->b_modified == 1) { /* If it's in our transaction it must be in BJ_Metadata list. */ if (jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata) { - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); if (jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata) pr_err("JBD2: assertion failure: h_type=%u " @@ -1409,13 +1480,13 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) jh->b_jlist); J_ASSERT_JH(jh, jh->b_transaction != transaction || jh->b_jlist == BJ_Metadata); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); } goto out; } journal = transaction->t_journal; - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); if (jh->b_modified == 0) { /* @@ -1423,12 +1494,12 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) * of the transaction. This needs to be done * once a transaction -bzzz */ - if (handle->h_buffer_credits <= 0) { + if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) { ret = -ENOSPC; goto out_unlock_bh; } jh->b_modified = 1; - handle->h_buffer_credits--; + handle->h_total_credits--; } /* @@ -1501,7 +1572,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata); spin_unlock(&journal->j_list_lock); out_unlock_bh: - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); out: JBUFFER_TRACE(jh, "exit"); return ret; @@ -1539,18 +1610,20 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) BUFFER_TRACE(bh, "entry"); - jbd_lock_bh_state(bh); + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) { + __bforget(bh); + return 0; + } - if (!buffer_jbd(bh)) - goto not_jbd; - jh = bh2jh(bh); + spin_lock(&jh->b_state_lock); /* Critical error: attempting to delete a bitmap buffer, maybe? * Don't do any jbd operations, and return an error. */ if (!J_EXPECT_JH(jh, !jh->b_committed_data, "inconsistent data on disk")) { err = -EIO; - goto not_jbd; + goto drop; } /* keep track of whether or not this transaction modified us */ @@ -1598,10 +1671,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); } else { __jbd2_journal_unfile_buffer(jh); - if (!buffer_jbd(bh)) { - spin_unlock(&journal->j_list_lock); - goto not_jbd; - } + jbd2_journal_put_journal_head(jh); } spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction) { @@ -1643,7 +1713,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (!jh->b_cp_transaction) { JBUFFER_TRACE(jh, "belongs to none transaction"); spin_unlock(&journal->j_list_lock); - goto not_jbd; + goto drop; } /* @@ -1653,7 +1723,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (!buffer_dirty(bh)) { __jbd2_journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); - goto not_jbd; + goto drop; } /* @@ -1666,20 +1736,15 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); spin_unlock(&journal->j_list_lock); } - - jbd_unlock_bh_state(bh); - __brelse(bh); drop: + __brelse(bh); + spin_unlock(&jh->b_state_lock); + jbd2_journal_put_journal_head(jh); if (drop_reserve) { /* no need to reserve log space for this block -bzzz */ - handle->h_buffer_credits++; + handle->h_total_credits++; } return err; - -not_jbd: - jbd_unlock_bh_state(bh); - __bforget(bh); - goto drop; } /** @@ -1706,45 +1771,34 @@ int jbd2_journal_stop(handle_t *handle) tid_t tid; pid_t pid; + if (--handle->h_ref > 0) { + jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + handle->h_ref); + if (is_handle_aborted(handle)) + return -EIO; + return 0; + } if (!transaction) { /* - * Handle is already detached from the transaction so - * there is nothing to do other than decrease a refcount, - * or free the handle if refcount drops to zero + * Handle is already detached from the transaction so there is + * nothing to do other than free the handle. */ - if (--handle->h_ref > 0) { - jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, - handle->h_ref); - return err; - } else { - if (handle->h_rsv_handle) - jbd2_free_handle(handle->h_rsv_handle); - goto free_and_exit; - } + memalloc_nofs_restore(handle->saved_alloc_context); + goto free_and_exit; } journal = transaction->t_journal; - - J_ASSERT(journal_current_handle() == handle); + tid = transaction->t_tid; if (is_handle_aborted(handle)) err = -EIO; - else - J_ASSERT(atomic_read(&transaction->t_updates) > 0); - - if (--handle->h_ref > 0) { - jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, - handle->h_ref); - return err; - } jbd_debug(4, "Handle %p going down\n", handle); trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, - transaction->t_tid, - handle->h_type, handle->h_line_no, + tid, handle->h_type, handle->h_line_no, jiffies - handle->h_start_jiffies, handle->h_sync, handle->h_requested_credits, (handle->h_requested_credits - - handle->h_buffer_credits)); + handle->h_total_credits)); /* * Implement synchronous transaction batching. If the handle @@ -1804,19 +1858,13 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_sync) transaction->t_synchronous_commit = 1; - current->journal_info = NULL; - atomic_sub(handle->h_buffer_credits, - &transaction->t_outstanding_credits); /* * If the handle is marked SYNC, we need to set another commit - * going! We also want to force a commit if the current - * transaction is occupying too much of the log, or if the - * transaction is too old now. + * going! We also want to force a commit if the transaction is too + * old now. */ if (handle->h_sync || - (atomic_read(&transaction->t_outstanding_credits) > - journal->j_max_transaction_buffers) || time_after_eq(jiffies, transaction->t_expires)) { /* Do this even for aborted journals: an abort still * completes the commit thread, it just doesn't write @@ -1825,7 +1873,7 @@ int jbd2_journal_stop(handle_t *handle) jbd_debug(2, "transaction too old, requesting commit for " "handle %p\n", handle); /* This is non-blocking */ - jbd2_log_start_commit(journal, transaction->t_tid); + jbd2_log_start_commit(journal, tid); /* * Special case: JBD2_SYNC synchronous updates require us @@ -1836,31 +1884,19 @@ int jbd2_journal_stop(handle_t *handle) } /* - * Once we drop t_updates, if it goes to zero the transaction - * could start committing on us and eventually disappear. So - * once we do this, we must not dereference transaction - * pointer again. + * Once stop_this_handle() drops t_updates, the transaction could start + * committing on us and eventually disappear. So we must not + * dereference transaction pointer again after calling + * stop_this_handle(). */ - tid = transaction->t_tid; - if (atomic_dec_and_test(&transaction->t_updates)) { - wake_up(&journal->j_wait_updates); - if (journal->j_barrier_count) - wake_up(&journal->j_wait_transaction_locked); - } - - rwsem_release(&journal->j_trans_commit_map, _THIS_IP_); + stop_this_handle(handle); if (wait_for_commit) err = jbd2_log_wait_commit(journal, tid); - if (handle->h_rsv_handle) - jbd2_journal_free_reserved(handle->h_rsv_handle); free_and_exit: - /* - * Scope of the GFP_NOFS context is over here and so we can restore the - * original alloc context. - */ - memalloc_nofs_restore(handle->saved_alloc_context); + if (handle->h_rsv_handle) + jbd2_free_handle(handle->h_rsv_handle); jbd2_free_handle(handle); return err; } @@ -1878,7 +1914,7 @@ free_and_exit: * * j_list_lock is held. * - * jbd_lock_bh_state(jh2bh(jh)) is held. + * jh->b_state_lock is held. */ static inline void @@ -1902,7 +1938,7 @@ __blist_add_buffer(struct journal_head **list, struct journal_head *jh) * * Called with j_list_lock held, and the journal may not be locked. * - * jbd_lock_bh_state(jh2bh(jh)) is held. + * jh->b_state_lock is held. */ static inline void @@ -1934,7 +1970,7 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) transaction_t *transaction; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + lockdep_assert_held(&jh->b_state_lock); transaction = jh->b_transaction; if (transaction) assert_spin_locked(&transaction->t_journal->j_list_lock); @@ -1971,17 +2007,15 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) } /* - * Remove buffer from all transactions. + * Remove buffer from all transactions. The caller is responsible for dropping + * the jh reference that belonged to the transaction. * * Called with bh_state lock and j_list_lock - * - * jh and bh may be already freed when this function returns. */ static void __jbd2_journal_unfile_buffer(struct journal_head *jh) { __jbd2_journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; - jbd2_journal_put_journal_head(jh); } void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) @@ -1990,18 +2024,19 @@ void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) /* Get reference so that buffer cannot be freed before we unlock it */ get_bh(bh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); __jbd2_journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); + jbd2_journal_put_journal_head(jh); __brelse(bh); } /* * Called from jbd2_journal_try_to_free_buffers(). * - * Called under jbd_lock_bh_state(bh) + * Called under jh->b_state_lock */ static void __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) @@ -2088,10 +2123,10 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, if (!jh) continue; - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); __journal_try_to_free_buffer(journal, bh); + spin_unlock(&jh->b_state_lock); jbd2_journal_put_journal_head(jh); - jbd_unlock_bh_state(bh); if (buffer_jbd(bh)) goto busy; } while ((bh = bh->b_this_page) != head); @@ -2112,7 +2147,7 @@ busy: * * Called under j_list_lock. * - * Called under jbd_lock_bh_state(bh). + * Called under jh->b_state_lock. */ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) { @@ -2133,6 +2168,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) } else { JBUFFER_TRACE(jh, "on running transaction"); __jbd2_journal_unfile_buffer(jh); + jbd2_journal_put_journal_head(jh); } return may_free; } @@ -2199,18 +2235,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, * holding the page lock. --sct */ - if (!buffer_jbd(bh)) + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) goto zap_buffer_unlocked; /* OK, we have data buffer in journaled mode */ write_lock(&journal->j_state_lock); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); - jh = jbd2_journal_grab_journal_head(bh); - if (!jh) - goto zap_buffer_no_jh; - /* * We cannot remove the buffer from checkpoint lists until the * transaction adding inode to orphan list (let's call it T) @@ -2289,10 +2322,10 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, * for commit and try again. */ if (partial_page) { - jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); + jbd2_journal_put_journal_head(jh); return -EBUSY; } /* @@ -2304,10 +2337,10 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, set_buffer_freed(bh); if (journal->j_running_transaction && buffer_jbddirty(bh)) jh->b_next_transaction = journal->j_running_transaction; - jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); + jbd2_journal_put_journal_head(jh); return 0; } else { /* Good, the buffer belongs to the running transaction. @@ -2331,11 +2364,10 @@ zap_buffer: * here. */ jh->b_modified = 0; - jbd2_journal_put_journal_head(jh); -zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); + jbd2_journal_put_journal_head(jh); zap_buffer_unlocked: clear_buffer_dirty(bh); J_ASSERT_BH(bh, !buffer_jbddirty(bh)); @@ -2422,7 +2454,7 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, int was_dirty = 0; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + lockdep_assert_held(&jh->b_state_lock); assert_spin_locked(&transaction->t_journal->j_list_lock); J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); @@ -2484,11 +2516,11 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, void jbd2_journal_file_buffer(struct journal_head *jh, transaction_t *transaction, int jlist) { - jbd_lock_bh_state(jh2bh(jh)); + spin_lock(&jh->b_state_lock); spin_lock(&transaction->t_journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, jlist); spin_unlock(&transaction->t_journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); + spin_unlock(&jh->b_state_lock); } /* @@ -2498,23 +2530,25 @@ void jbd2_journal_file_buffer(struct journal_head *jh, * buffer on that transaction's metadata list. * * Called under j_list_lock - * Called under jbd_lock_bh_state(jh2bh(jh)) + * Called under jh->b_state_lock * - * jh and bh may be already free when this function returns + * When this function returns true, there's no next transaction to refile to + * and the caller has to drop jh reference through + * jbd2_journal_put_journal_head(). */ -void __jbd2_journal_refile_buffer(struct journal_head *jh) +bool __jbd2_journal_refile_buffer(struct journal_head *jh) { int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + lockdep_assert_held(&jh->b_state_lock); if (jh->b_transaction) assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); /* If the buffer is now unused, just drop it. */ if (jh->b_next_transaction == NULL) { __jbd2_journal_unfile_buffer(jh); - return; + return true; } /* @@ -2542,6 +2576,7 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) if (was_dirty) set_buffer_jbddirty(bh); + return false; } /* @@ -2552,16 +2587,15 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) */ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) { - struct buffer_head *bh = jh2bh(jh); + bool drop; - /* Get reference so that buffer cannot be freed before we unlock it */ - get_bh(bh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); - __jbd2_journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); + drop = __jbd2_journal_refile_buffer(jh); + spin_unlock(&jh->b_state_lock); spin_unlock(&journal->j_list_lock); - __brelse(bh); + if (drop) + jbd2_journal_put_journal_head(jh); } /* diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f9baefc76cf9..88534eb0e7c2 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2288,9 +2288,9 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, int ret = 0; int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; - if (handle->h_buffer_credits < credits) + if (jbd2_handle_buffer_credits(handle) < credits) ret = ocfs2_extend_trans(handle, - credits - handle->h_buffer_credits); + credits - jbd2_handle_buffer_credits(handle)); return ret; } @@ -2367,7 +2367,7 @@ static int ocfs2_rotate_tree_right(handle_t *handle, struct ocfs2_path *right_path, struct ocfs2_path **ret_left_path) { - int ret, start, orig_credits = handle->h_buffer_credits; + int ret, start, orig_credits = jbd2_handle_buffer_credits(handle); u32 cpos; struct ocfs2_path *left_path = NULL; struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); @@ -3148,7 +3148,7 @@ static int ocfs2_rotate_tree_left(handle_t *handle, struct ocfs2_path *path, struct ocfs2_cached_dealloc_ctxt *dealloc) { - int ret, orig_credits = handle->h_buffer_credits; + int ret, orig_credits = jbd2_handle_buffer_credits(handle); struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; @@ -3386,8 +3386,8 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, right_path); ret = ocfs2_extend_rotate_transaction(handle, subtree_index, - handle->h_buffer_credits, - right_path); + jbd2_handle_buffer_credits(handle), + right_path); if (ret) { mlog_errno(ret); goto out; @@ -3548,8 +3548,8 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, right_path); ret = ocfs2_extend_rotate_transaction(handle, subtree_index, - handle->h_buffer_credits, - left_path); + jbd2_handle_buffer_credits(handle), + left_path); if (ret) { mlog_errno(ret); goto out; @@ -3623,7 +3623,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, le16_to_cpu(el->l_next_free_rec) == 1) { /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), right_path); if (ret) { mlog_errno(ret); @@ -3669,7 +3669,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), path); if (ret) { mlog_errno(ret); @@ -3725,7 +3725,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), path); if (ret) { mlog_errno(ret); @@ -3755,7 +3755,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), path); if (ret) { mlog_errno(ret); @@ -3799,7 +3799,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, if (ctxt->c_split_covers_rec) { /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), path); if (ret) { mlog_errno(ret); @@ -5358,7 +5358,7 @@ static int ocfs2_truncate_rec(handle_t *handle, if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), path); if (ret) { mlog_errno(ret); @@ -5427,8 +5427,8 @@ static int ocfs2_truncate_rec(handle_t *handle, } ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, - path); + jbd2_handle_buffer_credits(handle), + path); if (ret) { mlog_errno(ret); goto out; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 699a560efbb0..1afe57f425a0 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -420,14 +420,14 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) if (!nblocks) return 0; - old_nblocks = handle->h_buffer_credits; + old_nblocks = jbd2_handle_buffer_credits(handle); trace_ocfs2_extend_trans(old_nblocks, nblocks); #ifdef CONFIG_OCFS2_DEBUG_FS status = 1; #else - status = jbd2_journal_extend(handle, nblocks); + status = jbd2_journal_extend(handle, nblocks, 0); if (status < 0) { mlog_errno(status); goto bail; @@ -461,13 +461,13 @@ int ocfs2_allocate_extend_trans(handle_t *handle, int thresh) BUG_ON(!handle); - old_nblks = handle->h_buffer_credits; + old_nblks = jbd2_handle_buffer_credits(handle); trace_ocfs2_allocate_extend_trans(old_nblks, thresh); if (old_nblks < thresh) return 0; - status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA); + status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA, 0); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 69c21a3843af..4180c3ef0a68 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1252,6 +1252,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr) { struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct journal_head *jh; int ret; if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) @@ -1260,13 +1261,14 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, if (!buffer_jbd(bg_bh)) return 1; - jbd_lock_bh_state(bg_bh); - bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; + jh = bh2jh(bg_bh); + spin_lock(&jh->b_state_lock); + bg = (struct ocfs2_group_desc *) jh->b_committed_data; if (bg) ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); else ret = 1; - jbd_unlock_bh_state(bg_bh); + spin_unlock(&jh->b_state_lock); return ret; } @@ -2387,6 +2389,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, int status; unsigned int tmp; struct ocfs2_group_desc *undo_bg = NULL; + struct journal_head *jh; /* The caller got this descriptor from * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ @@ -2405,10 +2408,10 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, goto bail; } + jh = bh2jh(group_bh); if (undo_fn) { - jbd_lock_bh_state(group_bh); - undo_bg = (struct ocfs2_group_desc *) - bh2jh(group_bh)->b_committed_data; + spin_lock(&jh->b_state_lock); + undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data; BUG_ON(!undo_bg); } @@ -2423,7 +2426,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, le16_add_cpu(&bg->bg_free_bits_count, num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { if (undo_fn) - jbd_unlock_bh_state(group_bh); + spin_unlock(&jh->b_state_lock); return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", (unsigned long long)le64_to_cpu(bg->bg_blkno), le16_to_cpu(bg->bg_bits), @@ -2432,7 +2435,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, } if (undo_fn) - jbd_unlock_bh_state(group_bh); + spin_unlock(&jh->b_state_lock); ocfs2_journal_dirty(handle, group_bh); bail: diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 564793c24d12..29dce6ff6bae 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -313,7 +313,6 @@ enum jbd_state_bits { BH_Revoked, /* Has been revoked from the log */ BH_RevokeValid, /* Revoked flag is valid */ BH_JBDDirty, /* Is dirty but journaled */ - BH_State, /* Pins most journal_head state */ BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ BH_Shadow, /* IO on shadow buffer is running */ BH_Verified, /* Metadata block has been verified ok */ @@ -342,26 +341,6 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh) return bh->b_private; } -static inline void jbd_lock_bh_state(struct buffer_head *bh) -{ - bit_spin_lock(BH_State, &bh->b_state); -} - -static inline int jbd_trylock_bh_state(struct buffer_head *bh) -{ - return bit_spin_trylock(BH_State, &bh->b_state); -} - -static inline int jbd_is_locked_bh_state(struct buffer_head *bh) -{ - return bit_spin_is_locked(BH_State, &bh->b_state); -} - -static inline void jbd_unlock_bh_state(struct buffer_head *bh) -{ - bit_spin_unlock(BH_State, &bh->b_state); -} - static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) { bit_spin_lock(BH_JournalHead, &bh->b_state); @@ -477,7 +456,9 @@ struct jbd2_revoke_table_s; * @h_transaction: Which compound transaction is this update a part of? * @h_journal: Which journal handle belongs to - used iff h_reserved set. * @h_rsv_handle: Handle reserved for finishing the logical operation. - * @h_buffer_credits: Number of remaining buffers we are allowed to dirty. + * @h_total_credits: Number of remaining buffers we are allowed to add to + journal. These are dirty buffers and revoke descriptor blocks. + * @h_revoke_credits: Number of remaining revoke records available for handle * @h_ref: Reference count on this handle. * @h_err: Field for caller's use to track errors through large fs operations. * @h_sync: Flag for sync-on-close. @@ -487,7 +468,8 @@ struct jbd2_revoke_table_s; * @h_type: For handle statistics. * @h_line_no: For handle statistics. * @h_start_jiffies: Handle Start time. - * @h_requested_credits: Holds @h_buffer_credits after handle is started. + * @h_requested_credits: Holds @h_total_credits after handle is started. + * @h_revoke_credits_requested: Holds @h_revoke_credits after handle is started. * @saved_alloc_context: Saved context while transaction is open. **/ @@ -504,7 +486,9 @@ struct jbd2_journal_handle }; handle_t *h_rsv_handle; - int h_buffer_credits; + int h_total_credits; + int h_revoke_credits; + int h_revoke_credits_requested; int h_ref; int h_err; @@ -556,9 +540,9 @@ struct transaction_chp_stats_s { * ->jbd_lock_bh_journal_head() (This is "innermost") * * j_state_lock - * ->jbd_lock_bh_state() + * ->b_state_lock * - * jbd_lock_bh_state() + * b_state_lock * ->j_list_lock * * j_state_lock @@ -681,11 +665,24 @@ struct transaction_s atomic_t t_updates; /* - * Number of buffers reserved for use by all handles in this transaction - * handle but not yet modified. [none] + * Number of blocks reserved for this transaction in the journal. + * This is including all credits reserved when starting transaction + * handles as well as all journal descriptor blocks needed for this + * transaction. [none] */ atomic_t t_outstanding_credits; + /* + * Number of revoke records for this transaction added by already + * stopped handles. [none] + */ + atomic_t t_outstanding_revokes; + + /* + * How many handles used this transaction? [none] + */ + atomic_t t_handle_count; + /* * Forward and backward links for the circular list of all transactions * awaiting checkpoint. [j_list_lock] @@ -703,11 +700,6 @@ struct transaction_s */ ktime_t t_start_time; - /* - * How many handles used this transaction? [none] - */ - atomic_t t_handle_count; - /* * This transaction is being forced and some process is * waiting for it to finish. @@ -1024,6 +1016,13 @@ struct journal_s */ int j_max_transaction_buffers; + /** + * @j_revoke_records_per_block: + * + * Number of revoke records that fit in one descriptor block. + */ + int j_revoke_records_per_block; + /** * @j_commit_interval: * @@ -1257,7 +1256,7 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3) /* Filing buffers */ extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *); -extern void __jbd2_journal_refile_buffer(struct journal_head *); +extern bool __jbd2_journal_refile_buffer(struct journal_head *); extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); extern void __journal_free_buffer(struct journal_head *bh); @@ -1358,14 +1357,16 @@ static inline handle_t *journal_current_handle(void) extern handle_t *jbd2_journal_start(journal_t *, int nblocks); extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks, - gfp_t gfp_mask, unsigned int type, - unsigned int line_no); + int revoke_records, gfp_t gfp_mask, + unsigned int type, unsigned int line_no); extern int jbd2_journal_restart(handle_t *, int nblocks); -extern int jbd2__journal_restart(handle_t *, int nblocks, gfp_t gfp_mask); +extern int jbd2__journal_restart(handle_t *, int nblocks, + int revoke_records, gfp_t gfp_mask); extern int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, unsigned int line_no); extern void jbd2_journal_free_reserved(handle_t *handle); -extern int jbd2_journal_extend (handle_t *, int nblocks); +extern int jbd2_journal_extend(handle_t *handle, int nblocks, + int revoke_records); extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); @@ -1560,38 +1561,19 @@ static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) return journal->j_chksum_driver != NULL; } -/* - * We reserve t_outstanding_credits >> JBD2_CONTROL_BLOCKS_SHIFT for - * transaction control blocks. - */ -#define JBD2_CONTROL_BLOCKS_SHIFT 5 - -/* - * Return the minimum number of blocks which must be free in the journal - * before a new transaction may be started. Must be called under j_state_lock. - */ -static inline int jbd2_space_needed(journal_t *journal) -{ - int nblocks = journal->j_max_transaction_buffers; - return nblocks + (nblocks >> JBD2_CONTROL_BLOCKS_SHIFT); -} - /* * Return number of free blocks in the log. Must be called under j_state_lock. */ static inline unsigned long jbd2_log_space_left(journal_t *journal) { /* Allow for rounding errors */ - unsigned long free = journal->j_free - 32; + long free = journal->j_free - 32; if (journal->j_committing_transaction) { - unsigned long committing = atomic_read(&journal-> - j_committing_transaction->t_outstanding_credits); - - /* Transaction + control blocks */ - free -= committing + (committing >> JBD2_CONTROL_BLOCKS_SHIFT); + free -= atomic_read(&journal-> + j_committing_transaction->t_outstanding_credits); } - return free; + return max_t(long, free, 0); } /* @@ -1645,6 +1627,20 @@ static inline tid_t jbd2_get_latest_transaction(journal_t *journal) return tid; } +static inline int jbd2_handle_buffer_credits(handle_t *handle) +{ + journal_t *journal; + + if (!handle->h_reserved) + journal = handle->h_transaction->t_journal; + else + journal = handle->h_journal; + + return handle->h_total_credits - + DIV_ROUND_UP(handle->h_revoke_credits_requested, + journal->j_revoke_records_per_block); +} + #ifdef __KERNEL__ #define buffer_trace_init(bh) do {} while (0) diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h index 9fb870524314..75bc56109031 100644 --- a/include/linux/journal-head.h +++ b/include/linux/journal-head.h @@ -11,6 +11,8 @@ #ifndef JOURNAL_HEAD_H_INCLUDED #define JOURNAL_HEAD_H_INCLUDED +#include + typedef unsigned int tid_t; /* Unique transaction ID */ typedef struct transaction_s transaction_t; /* Compound transaction type */ @@ -23,6 +25,11 @@ struct journal_head { */ struct buffer_head *b_bh; + /* + * Protect the buffer head state + */ + spinlock_t b_state_lock; + /* * Reference count - see description in journal.c * [jbd_lock_bh_journal_head()] @@ -30,7 +37,7 @@ struct journal_head { int b_jcount; /* - * Journalling list for this buffer [jbd_lock_bh_state()] + * Journalling list for this buffer [b_state_lock] * NOTE: We *cannot* combine this with b_modified into a bitfield * as gcc would then (which the C standard allows but which is * very unuseful) make 64-bit accesses to the bitfield and clobber @@ -41,20 +48,20 @@ struct journal_head { /* * This flag signals the buffer has been modified by * the currently running transaction - * [jbd_lock_bh_state()] + * [b_state_lock] */ unsigned b_modified; /* * Copy of the buffer data frozen for writing to the log. - * [jbd_lock_bh_state()] + * [b_state_lock] */ char *b_frozen_data; /* * Pointer to a saved copy of the buffer containing no uncommitted * deallocation references, so that allocations can avoid overwriting - * uncommitted deletes. [jbd_lock_bh_state()] + * uncommitted deletes. [b_state_lock] */ char *b_committed_data; @@ -63,7 +70,7 @@ struct journal_head { * metadata: either the running transaction or the committing * transaction (if there is one). Only applies to buffers on a * transaction's data or metadata journaling list. - * [j_list_lock] [jbd_lock_bh_state()] + * [j_list_lock] [b_state_lock] * Either of these locks is enough for reading, both are needed for * changes. */ @@ -73,13 +80,13 @@ struct journal_head { * Pointer to the running compound transaction which is currently * modifying the buffer's metadata, if there was already a transaction * committing it when the new transaction touched it. - * [t_list_lock] [jbd_lock_bh_state()] + * [t_list_lock] [b_state_lock] */ transaction_t *b_next_transaction; /* * Doubly-linked list of buffers on a transaction's data, metadata or - * forget queue. [t_list_lock] [jbd_lock_bh_state()] + * forget queue. [t_list_lock] [b_state_lock] */ struct journal_head *b_tnext, *b_tprev; diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index d68e9e536814..182c9fe9c0e9 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1746,15 +1746,16 @@ TRACE_EVENT(ext4_load_inode, TRACE_EVENT(ext4_journal_start, TP_PROTO(struct super_block *sb, int blocks, int rsv_blocks, - unsigned long IP), + int revoke_creds, unsigned long IP), - TP_ARGS(sb, blocks, rsv_blocks, IP), + TP_ARGS(sb, blocks, rsv_blocks, revoke_creds, IP), TP_STRUCT__entry( __field( dev_t, dev ) __field(unsigned long, ip ) __field( int, blocks ) __field( int, rsv_blocks ) + __field( int, revoke_creds ) ), TP_fast_assign( @@ -1762,11 +1763,13 @@ TRACE_EVENT(ext4_journal_start, __entry->ip = IP; __entry->blocks = blocks; __entry->rsv_blocks = rsv_blocks; + __entry->revoke_creds = revoke_creds; ), - TP_printk("dev %d,%d blocks, %d rsv_blocks, %d caller %pS", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->blocks, __entry->rsv_blocks, (void *)__entry->ip) + TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d, " + "caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->blocks, __entry->rsv_blocks, __entry->revoke_creds, + (void *)__entry->ip) ); TRACE_EVENT(ext4_journal_start_reserved, diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index 2310b259329f..d16a32867f3a 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -133,7 +133,7 @@ TRACE_EVENT(jbd2_submit_inode_data, (unsigned long) __entry->ino) ); -TRACE_EVENT(jbd2_handle_start, +DECLARE_EVENT_CLASS(jbd2_handle_start_class, TP_PROTO(dev_t dev, unsigned long tid, unsigned int type, unsigned int line_no, int requested_blocks), @@ -161,6 +161,20 @@ TRACE_EVENT(jbd2_handle_start, __entry->type, __entry->line_no, __entry->requested_blocks) ); +DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_start, + TP_PROTO(dev_t dev, unsigned long tid, unsigned int type, + unsigned int line_no, int requested_blocks), + + TP_ARGS(dev, tid, type, line_no, requested_blocks) +); + +DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_restart, + TP_PROTO(dev_t dev, unsigned long tid, unsigned int type, + unsigned int line_no, int requested_blocks), + + TP_ARGS(dev, tid, type, line_no, requested_blocks) +); + TRACE_EVENT(jbd2_handle_extend, TP_PROTO(dev_t dev, unsigned long tid, unsigned int type, unsigned int line_no, int buffer_credits,