diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4 index c631253cf85c..78604db56279 100644 --- a/Documentation/ABI/testing/sysfs-fs-ext4 +++ b/Documentation/ABI/testing/sysfs-fs-ext4 @@ -109,3 +109,10 @@ Description: write operation (since a 4k random write might turn into a much larger write due to the zeroout operation). + +What: /sys/fs/ext4//journal_task +Date: February 2019 +Contact: "Theodore Ts'o" +Description: + This file is read-only and shows the pid of journal thread in + current pid-namespace or 0 if task is unreachable. diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 031e5a82d556..06f77ca7f36e 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -97,7 +97,7 @@ config EXT4_FS_SECURITY extended attributes for file security labels, say N. config EXT4_DEBUG - bool "EXT4 debugging support" + bool "Ext4 debugging support" depends on EXT4_FS help Enables run-time debugging support for the ext4 filesystem. diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5012ddb6daf9..82ffdacdc7fa 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -425,6 +425,9 @@ struct flex_groups { /* Flags that are appropriate for non-directories/regular files. */ #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) +/* The only flags that should be swapped */ +#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) + /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) { @@ -1661,6 +1664,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 +extern void ext4_update_dynamic_rev(struct super_block *sb); + #define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ @@ -1669,6 +1674,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_compat |= \ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ } \ @@ -1686,6 +1692,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ } \ @@ -1703,6 +1710,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_incompat |= \ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ } \ @@ -2666,7 +2674,6 @@ do { \ #endif -extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); extern int ext4_update_rocompat_feature(handle_t *handle, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 79d986dbf5af..0f89f5190cd7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2956,14 +2956,17 @@ again: if (err < 0) goto out; - } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) { + } else if (sbi->s_cluster_ratio > 1 && end >= ex_end && + partial.state == initial) { /* - * If there's an extent to the right its first cluster - * contains the immediate right boundary of the - * truncated/punched region. Set partial_cluster to - * its negative value so it won't be freed if shared - * with the current extent. The end < ee_block case - * is handled in ext4_ext_rm_leaf(). + * If we're punching, there's an extent to the right. + * If the partial cluster hasn't been set, set it to + * that extent's first cluster and its state to nofree + * so it won't be freed should it contain blocks to be + * removed. If it's already set (tofree/nofree), we're + * retrying and keep the original partial cluster info + * so a cluster marked tofree as a result of earlier + * extent removal is not lost. */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, @@ -4048,18 +4051,8 @@ out: } else allocated = ret; map->m_flags |= EXT4_MAP_NEW; - /* - * if we allocated more blocks than requested - * we need to make sure we unmap the extra block - * allocated. The actual needed block will get - * unmapped later when we find the buffer_head marked - * new. - */ - if (allocated > map->m_len) { - clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len, - allocated - map->m_len); + if (allocated > map->m_len) allocated = map->m_len; - } map->m_len = allocated; map_out: diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index e22dcfab308b..46b24da33a28 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -231,6 +231,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) break; case DX_HASH_HALF_MD4_UNSIGNED: str2hashbuf = str2hashbuf_unsigned; + /* fall through */ case DX_HASH_HALF_MD4: p = name; while (len > 0) { @@ -244,6 +245,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) break; case DX_HASH_TEA_UNSIGNED: str2hashbuf = str2hashbuf_unsigned; + /* fall through */ case DX_HASH_TEA: p = name; while (len > 0) { diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index bf7fa1507e81..c2225f0d31b5 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1183,18 +1183,21 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } + /* fall through */ case EXT4_IND_BLOCK: nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } + /* fall through */ case EXT4_DIND_BLOCK: nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } + /* fall through */ case EXT4_TIND_BLOCK: ; } @@ -1433,6 +1436,7 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } + /* fall through */ case EXT4_IND_BLOCK: if (++n >= n2) return 0; @@ -1441,6 +1445,7 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } + /* fall through */ case EXT4_DIND_BLOCK: if (++n >= n2) return 0; @@ -1449,6 +1454,7 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } + /* fall through */ case EXT4_TIND_BLOCK: ; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4356ef6d728e..b54b261ded36 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -391,7 +391,7 @@ void ext4_da_update_reserve_space(struct inode *inode, * inode's preallocations. */ if ((ei->i_reserved_data_blocks == 0) && - (atomic_read(&inode->i_writecount) == 0)) + !inode_is_open_for_write(inode)) ext4_discard_preallocations(inode); } @@ -678,8 +678,6 @@ found: if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { - clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, - map->m_len); ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { @@ -1194,7 +1192,6 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { - clean_bdev_bh_alias(bh); if (PageUptodate(page)) { clear_buffer_new(bh); set_buffer_uptodate(bh); @@ -2489,10 +2486,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) } BUG_ON(map->m_len == 0); - if (map->m_flags & EXT4_MAP_NEW) { - clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, - map->m_len); - } return 0; } @@ -2835,12 +2828,12 @@ retry: goto unplug; } ret = mpage_prepare_extent_to_map(&mpd); + /* Unlock pages we didn't use */ + mpage_release_unused_pages(&mpd, false); /* Submit prepared bio */ ext4_io_submit(&mpd.io_submit); ext4_put_io_end_defer(mpd.io_submit.io_end); mpd.io_submit.io_end = NULL; - /* Unlock pages we didn't use */ - mpage_release_unused_pages(&mpd, false); if (ret < 0) goto unplug; @@ -2908,10 +2901,11 @@ retry: handle = NULL; mpd.do_map = 0; } - /* Submit prepared bio */ - ext4_io_submit(&mpd.io_submit); /* Unlock pages we didn't use */ mpage_release_unused_pages(&mpd, give_up_on_write); + /* Submit prepared bio */ + ext4_io_submit(&mpd.io_submit); + /* * Drop our io_end reference we got from init. We have * to be careful and use deferred io_end finishing if @@ -5349,7 +5343,6 @@ static int ext4_do_update_inode(handle_t *handle, err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (err) goto out_brelse; - ext4_update_dynamic_rev(sb); ext4_set_feature_large_file(sb); ext4_handle_sync(handle); err = ext4_handle_dirty_super(handle, sb); @@ -6000,7 +5993,7 @@ int ext4_expand_extra_isize(struct inode *inode, ext4_write_lock_xattr(inode, &no_expand); - BUFFER_TRACE(iloc.bh, "get_write_access"); + BUFFER_TRACE(iloc->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, iloc->bh); if (error) { brelse(iloc->bh); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index d26bcac291bb..3c4f8bb59f8a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -63,18 +63,20 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) loff_t isize; struct ext4_inode_info *ei1; struct ext4_inode_info *ei2; + unsigned long tmp; ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); swap(inode1->i_version, inode2->i_version); - swap(inode1->i_blocks, inode2->i_blocks); - swap(inode1->i_bytes, inode2->i_bytes); swap(inode1->i_atime, inode2->i_atime); swap(inode1->i_mtime, inode2->i_mtime); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); - swap(ei1->i_flags, ei2->i_flags); + tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP; + ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) | + (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP); + ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP); swap(ei1->i_disksize, ei2->i_disksize); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); @@ -115,28 +117,42 @@ static long swap_inode_boot_loader(struct super_block *sb, int err; struct inode *inode_bl; struct ext4_inode_info *ei_bl; - - if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || - IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || - ext4_has_inline_data(inode)) - return -EINVAL; - - if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || - !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) - return -EPERM; + qsize_t size, size_bl, diff; + blkcnt_t blocks; + unsigned short bytes; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); - filemap_flush(inode->i_mapping); - filemap_flush(inode_bl->i_mapping); - /* Protect orig inodes against a truncate and make sure, * that only 1 swap_inode_boot_loader is running. */ lock_two_nondirectories(inode, inode_bl); + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || + IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || + (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) || + ext4_has_inline_data(inode)) { + err = -EINVAL; + goto journal_err_out; + } + + if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || + !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto journal_err_out; + } + + down_write(&EXT4_I(inode)->i_mmap_sem); + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto err_out; + + err = filemap_write_and_wait(inode_bl->i_mapping); + if (err) + goto err_out; + /* Wait for all existing dio workers */ inode_dio_wait(inode); inode_dio_wait(inode_bl); @@ -147,7 +163,7 @@ static long swap_inode_boot_loader(struct super_block *sb, handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; - goto journal_err_out; + goto err_out; } /* Protect extent tree against block allocations via delalloc */ @@ -170,6 +186,13 @@ static long swap_inode_boot_loader(struct super_block *sb, memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); } + err = dquot_initialize(inode); + if (err) + goto err_out1; + + size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes; + size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes; + diff = size - size_bl; swap_inode_data(inode, inode_bl); inode->i_ctime = inode_bl->i_ctime = current_time(inode); @@ -183,27 +206,51 @@ static long swap_inode_boot_loader(struct super_block *sb, err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { + /* No need to update quota information. */ ext4_warning(inode->i_sb, "couldn't mark inode #%lu dirty (err %d)", inode->i_ino, err); /* Revert all changes: */ swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); - } else { - err = ext4_mark_inode_dirty(handle, inode_bl); - if (err < 0) { - ext4_warning(inode_bl->i_sb, - "couldn't mark inode #%lu dirty (err %d)", - inode_bl->i_ino, err); - /* Revert all changes: */ - swap_inode_data(inode, inode_bl); - ext4_mark_inode_dirty(handle, inode); - ext4_mark_inode_dirty(handle, inode_bl); - } + goto err_out1; } + + blocks = inode_bl->i_blocks; + bytes = inode_bl->i_bytes; + inode_bl->i_blocks = inode->i_blocks; + inode_bl->i_bytes = inode->i_bytes; + err = ext4_mark_inode_dirty(handle, inode_bl); + if (err < 0) { + /* No need to update quota information. */ + ext4_warning(inode_bl->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode_bl->i_ino, err); + goto revert; + } + + /* Bootloader inode should not be counted into quota information. */ + if (diff > 0) + dquot_free_space(inode, diff); + else + err = dquot_alloc_space(inode, -1 * diff); + + if (err < 0) { +revert: + /* Revert all changes: */ + inode_bl->i_blocks = blocks; + inode_bl->i_bytes = bytes; + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + ext4_mark_inode_dirty(handle, inode_bl); + } + +err_out1: ext4_journal_stop(handle); ext4_double_up_write_data_sem(inode, inode_bl); +err_out: + up_write(&EXT4_I(inode)->i_mmap_sem); journal_err_out: unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e2248083cdca..6fb76d408093 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4176,9 +4176,8 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; - if ((size == isize) && - !ext4_fs_is_busy(sbi) && - (atomic_read(&ac->ac_inode->i_writecount) == 0)) { + if ((size == isize) && !ext4_fs_is_busy(sbi) && + !inode_is_open_for_write(ac->ac_inode)) { ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; } @@ -4258,7 +4257,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, (unsigned) ar->lleft, (unsigned) ar->pleft, (unsigned) ar->lright, (unsigned) ar->pright, - atomic_read(&ar->inode->i_writecount) ? "" : "non-"); + inode_is_open_for_write(ar->inode) ? "" : "non-"); return 0; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 6f5305e9a6ac..3e9298e6a705 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -468,10 +468,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, ext4_io_submit(io); continue; } - if (buffer_new(bh)) { + if (buffer_new(bh)) clear_buffer_new(bh); - clean_bdev_bh_alias(bh); - } set_buffer_async_write(bh); nr_to_submit++; } while ((bh = bh->b_this_page) != head); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 48421de803b7..3d9b18505c0c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1960,7 +1960,8 @@ retry: le16_to_cpu(es->s_reserved_gdt_blocks); n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); n_blocks_count = (ext4_fsblk_t)n_group * - EXT4_BLOCKS_PER_GROUP(sb); + EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(es->s_first_data_block); n_group--; /* set to last group number */ } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 60da0a6e4d86..f5b828bf1299 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2249,7 +2249,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); - ext4_update_dynamic_rev(sb); if (sbi->s_journal) ext4_set_feature_journal_needs_recovery(sb); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 5e4e78fc0b3a..616c075da062 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -30,6 +30,7 @@ typedef enum { attr_feature, attr_pointer_ui, attr_pointer_atomic, + attr_journal_task, } attr_id_t; typedef enum { @@ -125,6 +126,14 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi, return count; } +static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf) +{ + if (!sbi->s_journal) + return snprintf(buf, PAGE_SIZE, "\n"); + return snprintf(buf, PAGE_SIZE, "%d\n", + task_pid_vnr(sbi->s_journal->j_task)); +} + #define EXT4_ATTR(_name,_mode,_id) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -188,6 +197,7 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); EXT4_ATTR(first_error_time, 0444, first_error_time); EXT4_ATTR(last_error_time, 0444, last_error_time); +EXT4_ATTR(journal_task, 0444, journal_task); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); @@ -217,6 +227,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(errors_count), ATTR_LIST(first_error_time), ATTR_LIST(last_error_time), + ATTR_LIST(journal_task), NULL, }; @@ -304,6 +315,8 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return print_tstamp(buf, sbi->s_es, s_first_error_time); case attr_last_error_time: return print_tstamp(buf, sbi->s_es, s_last_error_time); + case attr_journal_task: + return journal_task_show(sbi, buf); } return 0; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 86ed9c686249..dc82e7757f67 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -829,6 +829,7 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { ret = PTR_ERR(bh); + bh = NULL; goto out; } @@ -2903,6 +2904,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, if (error == -EIO) EXT4_ERROR_INODE(inode, "block %llu read error", EXT4_I(inode)->i_file_acl); + bh = NULL; goto cleanup; } error = ext4_xattr_check_block(inode, bh); @@ -3059,6 +3061,7 @@ ext4_xattr_block_cache_find(struct inode *inode, if (IS_ERR(bh)) { if (PTR_ERR(bh) == -ENOMEM) return NULL; + bh = NULL; EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long)ce->e_value); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 26f8d7e46462..02e0b79753e7 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -113,7 +113,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) nblocks = jbd2_space_needed(journal); while (jbd2_log_space_left(journal) < nblocks) { write_unlock(&journal->j_state_lock); - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); /* * Test again, another process may have checkpointed while we @@ -276,9 +276,22 @@ restart: "JBD2: %s: Waiting for Godot: block %llu\n", journal->j_devname, (unsigned long long) bh->b_blocknr); + if (batch_count) + __flush_batch(journal, &batch_count); jbd2_log_start_commit(journal, tid); + /* + * jbd2_journal_commit_transaction() may want + * to take the checkpoint_mutex if JBD2_FLUSHED + * is set, jbd2_update_log_tail() called by + * jbd2_journal_commit_transaction() may also take + * checkpoint_mutex. So we need to temporarily + * drop it. + */ + mutex_unlock(&journal->j_checkpoint_mutex); jbd2_log_wait_commit(journal, tid); - goto retry; + mutex_lock_io(&journal->j_checkpoint_mutex); + spin_lock(&journal->j_list_lock); + goto restart; } if (!buffer_dirty(bh)) { if (unlikely(buffer_write_io_error(bh)) && !result) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 2eb55c3361a8..efd0ce9489ae 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -694,9 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) the last tag we set up. */ tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); - - jbd2_descriptor_block_csum_set(journal, descriptor); start_journal_io: + if (descriptor) + jbd2_descriptor_block_csum_set(journal, + descriptor); + for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; /* diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8ef6b6daaa7a..382c030cc78b 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -142,22 +142,6 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) return cpu_to_be32(csum); } -static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) -{ - if (!jbd2_journal_has_csum_v2or3(j)) - return 1; - - return sb->s_checksum == jbd2_superblock_csum(j, sb); -} - -static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) -{ - if (!jbd2_journal_has_csum_v2or3(j)) - return; - - sb->s_checksum = jbd2_superblock_csum(j, sb); -} - /* * Helper function used to manage commit timeouts */ @@ -1356,6 +1340,10 @@ static int journal_reset(journal_t *journal) return jbd2_journal_start_thread(journal); } +/* + * This function expects that the caller will have locked the journal + * buffer head, and will return with it unlocked + */ static int jbd2_write_superblock(journal_t *journal, int write_flags) { struct buffer_head *bh = journal->j_sb_buffer; @@ -1365,7 +1353,6 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) trace_jbd2_write_superblock(journal, write_flags); if (!(journal->j_flags & JBD2_BARRIER)) write_flags &= ~(REQ_FUA | REQ_PREFLUSH); - lock_buffer(bh); if (buffer_write_io_error(bh)) { /* * Oh, dear. A previous attempt to write the journal @@ -1381,7 +1368,8 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); } - jbd2_superblock_csum_set(journal, sb); + if (jbd2_journal_has_csum_v2or3(journal)) + sb->s_checksum = jbd2_superblock_csum(journal, sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; ret = submit_bh(REQ_OP_WRITE, write_flags, bh); @@ -1424,6 +1412,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", tail_block, tail_tid); + lock_buffer(journal->j_sb_buffer); sb->s_sequence = cpu_to_be32(tail_tid); sb->s_start = cpu_to_be32(tail_block); @@ -1454,18 +1443,17 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op) journal_superblock_t *sb = journal->j_superblock; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); - read_lock(&journal->j_state_lock); - /* Is it already empty? */ - if (sb->s_start == 0) { - read_unlock(&journal->j_state_lock); + lock_buffer(journal->j_sb_buffer); + if (sb->s_start == 0) { /* Is it already empty? */ + unlock_buffer(journal->j_sb_buffer); return; } + jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n", journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(0); - read_unlock(&journal->j_state_lock); jbd2_write_superblock(journal, write_op); @@ -1488,9 +1476,8 @@ void jbd2_journal_update_sb_errno(journal_t *journal) journal_superblock_t *sb = journal->j_superblock; int errcode; - read_lock(&journal->j_state_lock); + lock_buffer(journal->j_sb_buffer); errcode = journal->j_errno; - read_unlock(&journal->j_state_lock); if (errcode == -ESHUTDOWN) errcode = 0; jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); @@ -1595,17 +1582,18 @@ static int journal_get_superblock(journal_t *journal) } } - /* Check superblock checksum */ - if (!jbd2_superblock_csum_verify(journal, sb)) { - printk(KERN_ERR "JBD2: journal checksum error\n"); - err = -EFSBADCRC; - goto out; - } + if (jbd2_journal_has_csum_v2or3(journal)) { + /* Check superblock checksum */ + if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { + printk(KERN_ERR "JBD2: journal checksum error\n"); + err = -EFSBADCRC; + goto out; + } - /* Precompute checksum seed for all metadata */ - if (jbd2_journal_has_csum_v2or3(journal)) + /* Precompute checksum seed for all metadata */ journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, sizeof(sb->s_uuid)); + } set_buffer_verified(bh); @@ -1894,28 +1882,27 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb = journal->j_superblock; + /* Load the checksum driver if necessary */ + if ((journal->j_chksum_driver == NULL) && + INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { + journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(journal->j_chksum_driver)) { + printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); + journal->j_chksum_driver = NULL; + return 0; + } + /* Precompute checksum seed for all metadata */ + journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + sizeof(sb->s_uuid)); + } + + lock_buffer(journal->j_sb_buffer); + /* If enabling v3 checksums, update superblock */ if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { sb->s_checksum_type = JBD2_CRC32C_CHKSUM; sb->s_feature_compat &= ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); - - /* Load the checksum driver */ - if (journal->j_chksum_driver == NULL) { - journal->j_chksum_driver = crypto_alloc_shash("crc32c", - 0, 0); - if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD2: Cannot load crc32c " - "driver.\n"); - journal->j_chksum_driver = NULL; - return 0; - } - - /* Precompute checksum seed for all metadata */ - journal->j_csum_seed = jbd2_chksum(journal, ~0, - sb->s_uuid, - sizeof(sb->s_uuid)); - } } /* If enabling v1 checksums, downgrade superblock */ @@ -1927,6 +1914,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb->s_feature_compat |= cpu_to_be32(compat); sb->s_feature_ro_compat |= cpu_to_be32(ro); sb->s_feature_incompat |= cpu_to_be32(incompat); + unlock_buffer(journal->j_sb_buffer); return 1; #undef COMPAT_FEATURE_ON @@ -2067,7 +2055,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) err = jbd2_journal_skip_recovery(journal); if (write) { /* Lock to make assertions happy... */ - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index cc35537232f2..f940d31c2adc 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -63,7 +63,7 @@ void jbd2_journal_free_transaction(transaction_t *transaction) /* * jbd2_get_transaction: obtain a new transaction_t object. * - * Simply allocate and initialise a new transaction. Create it in + * Simply initialise a new transaction. Initialize it in * RUNNING state and add it to the current journal (which should not * have an existing running transaction: we only make a new transaction * once we have started to commit the old one). @@ -75,8 +75,8 @@ void jbd2_journal_free_transaction(transaction_t *transaction) * */ -static transaction_t * -jbd2_get_transaction(journal_t *journal, transaction_t *transaction) +static void jbd2_get_transaction(journal_t *journal, + transaction_t *transaction) { transaction->t_journal = journal; transaction->t_state = T_RUNNING; @@ -100,8 +100,6 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_max_wait = 0; transaction->t_start = jiffies; transaction->t_requested = 0; - - return transaction; } /* @@ -1252,11 +1250,12 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) struct journal_head *jh; char *committed_data = NULL; - JBUFFER_TRACE(jh, "entry"); if (jbd2_write_access_granted(handle, bh, true)) return 0; jh = jbd2_journal_add_journal_head(bh); + JBUFFER_TRACE(jh, "entry"); + /* * Do this first --- it can drop the journal lock, so we want to * make sure that obtaining the committed_data is done @@ -1367,15 +1366,17 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (is_handle_aborted(handle)) return -EROFS; - if (!buffer_jbd(bh)) { - ret = -EUCLEAN; - goto out; - } + if (!buffer_jbd(bh)) + return -EUCLEAN; + /* * We don't grab jh reference here since the buffer must be part * of the running transaction. */ jh = bh2jh(bh); + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); + /* * This and the following assertions are unreliable since we may see jh * in inconsistent state unless we grab bh_state lock. But this is @@ -1409,9 +1410,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) } journal = transaction->t_journal; - jbd_debug(5, "journal_head %p\n", jh); - JBUFFER_TRACE(jh, "entry"); - jbd_lock_bh_state(bh); if (jh->b_modified == 0) { @@ -1597,9 +1595,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) __jbd2_journal_unfile_buffer(jh); if (!buffer_jbd(bh)) { spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - __bforget(bh); - goto drop; + goto not_jbd; } } spin_unlock(&journal->j_list_lock); @@ -1609,14 +1605,21 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) /* However, if the buffer is still owned by a prior * (committing) transaction, we can't drop it yet... */ JBUFFER_TRACE(jh, "belongs to older transaction"); - /* ... but we CAN drop it from the new transaction if we - * have also modified it since the original commit. */ + /* ... but we CAN drop it from the new transaction through + * marking the buffer as freed and set j_next_transaction to + * the new transaction, so that not only the commit code + * knows it should clear dirty bits when it is done with the + * buffer, but also the buffer can be checkpointed only + * after the new transaction commits. */ - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == transaction); + set_buffer_freed(bh); + + if (!jh->b_next_transaction) { spin_lock(&journal->j_list_lock); - jh->b_next_transaction = NULL; + jh->b_next_transaction = transaction; spin_unlock(&journal->j_list_lock); + } else { + J_ASSERT(jh->b_next_transaction == transaction); /* * only drop a reference if this transaction modified @@ -1625,9 +1628,40 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (was_modified) drop_reserve = 1; } + } else { + /* + * Finally, if the buffer is not belongs to any + * transaction, we can just drop it now if it has no + * checkpoint. + */ + spin_lock(&journal->j_list_lock); + if (!jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "belongs to none transaction"); + spin_unlock(&journal->j_list_lock); + goto not_jbd; + } + + /* + * Otherwise, if the buffer has been written to disk, + * it is safe to remove the checkpoint and drop it. + */ + if (!buffer_dirty(bh)) { + __jbd2_journal_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + goto not_jbd; + } + + /* + * The buffer is still not written to disk, we should + * attach this buffer to current transaction so that the + * buffer can be checkpointed only after the current + * transaction commits. + */ + clear_buffer_dirty(bh); + __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); + spin_unlock(&journal->j_list_lock); } -not_jbd: jbd_unlock_bh_state(bh); __brelse(bh); drop: @@ -1636,6 +1670,11 @@ drop: handle->h_buffer_credits++; } return err; + +not_jbd: + jbd_unlock_bh_state(bh); + __bforget(bh); + goto drop; } /**