From 75a037f3604ceb781ae23167e0cdfbe5d71533d7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 31 Jul 2019 13:27:05 -0700 Subject: [PATCH 01/51] f2fs: fix livelock in swapfile writes This patch fixes livelock in the below call path when writing swap pages. [46374.617256] c2 701 __switch_to+0xe4/0x100 [46374.617265] c2 701 __schedule+0x80c/0xbc4 [46374.617273] c2 701 schedule+0x74/0x98 [46374.617281] c2 701 rwsem_down_read_failed+0x190/0x234 [46374.617291] c2 701 down_read+0x58/0x5c [46374.617300] c2 701 f2fs_map_blocks+0x138/0x9a8 [46374.617310] c2 701 get_data_block_dio_write+0x74/0x104 [46374.617320] c2 701 __blockdev_direct_IO+0x1350/0x3930 [46374.617331] c2 701 f2fs_direct_IO+0x55c/0x8bc [46374.617341] c2 701 __swap_writepage+0x1d0/0x3e8 [46374.617351] c2 701 swap_writepage+0x44/0x54 [46374.617360] c2 701 shrink_page_list+0x140/0xe80 [46374.617371] c2 701 shrink_inactive_list+0x510/0x918 [46374.617381] c2 701 shrink_node_memcg+0x2d4/0x804 [46374.617391] c2 701 shrink_node+0x10c/0x2f8 [46374.617400] c2 701 do_try_to_free_pages+0x178/0x38c [46374.617410] c2 701 try_to_free_pages+0x348/0x4b8 [46374.617419] c2 701 __alloc_pages_nodemask+0x7f8/0x1014 [46374.617429] c2 701 pagecache_get_page+0x184/0x2cc [46374.617438] c2 701 f2fs_new_node_page+0x60/0x41c [46374.617449] c2 701 f2fs_new_inode_page+0x50/0x7c [46374.617460] c2 701 f2fs_init_inode_metadata+0x128/0x530 [46374.617472] c2 701 f2fs_add_inline_entry+0x138/0xd64 [46374.617480] c2 701 f2fs_do_add_link+0xf4/0x178 [46374.617488] c2 701 f2fs_create+0x1e4/0x3ac [46374.617497] c2 701 path_openat+0xdc0/0x1308 [46374.617507] c2 701 do_filp_open+0x78/0x124 [46374.617516] c2 701 do_sys_open+0x134/0x248 [46374.617525] c2 701 SyS_openat+0x14/0x20 Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index abbf14e9bd72..f49f243fd54f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1372,7 +1372,7 @@ static 
int get_data_block_dio_write(struct inode *inode, sector_t iblock, return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_DIO, NULL, f2fs_rw_hint_to_seg_type(inode->i_write_hint), - true); + IS_SWAPFILE(inode) ? false : true); } static int get_data_block_dio(struct inode *inode, sector_t iblock, From 8896cbdfed0ca34452252b72d6ee97bcfca9abd2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 12 Jul 2019 16:55:41 +0800 Subject: [PATCH 02/51] f2fs: introduce {page,io}_is_mergeable() for readability Wrap merge condition into function for readability, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f49f243fd54f..0686306ed988 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -481,6 +481,33 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return 0; } +static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, + block_t last_blkaddr, block_t cur_blkaddr) +{ + if (last_blkaddr + 1 != cur_blkaddr) + return false; + return __same_bdev(sbi, cur_blkaddr, bio); +} + +static bool io_type_is_mergeable(struct f2fs_bio_info *io, + struct f2fs_io_info *fio) +{ + if (io->fio.op != fio->op) + return false; + return io->fio.op_flags == fio->op_flags; +} + +static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, + struct f2fs_bio_info *io, + struct f2fs_io_info *fio, + block_t last_blkaddr, + block_t cur_blkaddr) +{ + if (!page_is_mergeable(sbi, bio, last_blkaddr, cur_blkaddr)) + return false; + return io_type_is_mergeable(io, fio); +} + int f2fs_merge_page_bio(struct f2fs_io_info *fio) { struct bio *bio = *fio->bio; @@ -494,8 +521,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) trace_f2fs_submit_page_bio(page, fio); f2fs_trace_ios(fio, 0); - if (bio && (*fio->last_block + 1 != fio->new_blkaddr || - !__same_bdev(fio->sbi, fio->new_blkaddr, bio))) { 
+ if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, + fio->new_blkaddr)) { __submit_bio(fio->sbi, bio, fio->type); bio = NULL; } @@ -568,9 +595,8 @@ next: inc_page_count(sbi, WB_DATA_TYPE(bio_page)); - if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || - !__same_bdev(sbi, fio->new_blkaddr, io->bio))) + if (io->bio && !io_is_mergeable(sbi, io->bio, io, fio, + io->last_block_in_bio, fio->new_blkaddr)) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { @@ -1642,8 +1668,8 @@ zero_out: * This page will go to BIO. Do we need to send this * BIO off first? */ - if (bio && (*last_block_in_bio != block_nr - 1 || - !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { + if (bio && !page_is_mergeable(F2FS_I_SB(inode), bio, + *last_block_in_bio, block_nr)) { submit_and_realloc: __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; From c72db71ed61ff51c2b8189ac9889dd18f22eb612 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 12 Jul 2019 16:55:42 +0800 Subject: [PATCH 03/51] f2fs: fix panic of IO alignment feature Since 07173c3ec276 ("block: enable multipage bvecs"), one bio vector can store multi pages, so that we can not calculate max IO size of bio as PAGE_SIZE * bio->bi_max_vecs. However IO alignment feature of f2fs always has that assumption, so finally, it may cause panic during IO submission as below stack. kernel BUG at fs/f2fs/data.c:317! 
RIP: 0010:__submit_merged_bio+0x8b0/0x8c0 Call Trace: f2fs_submit_page_write+0x3cd/0xdd0 do_write_page+0x15d/0x360 f2fs_outplace_write_data+0xd7/0x210 f2fs_do_write_data_page+0x43b/0xf30 __write_data_page+0xcf6/0x1140 f2fs_write_cache_pages+0x3ba/0xb40 f2fs_write_data_pages+0x3dd/0x8b0 do_writepages+0xbb/0x1e0 __writeback_single_inode+0xb6/0x800 writeback_sb_inodes+0x441/0x910 wb_writeback+0x261/0x650 wb_workfn+0x1f9/0x7a0 process_one_work+0x503/0x970 worker_thread+0x7d/0x820 kthread+0x1ad/0x210 ret_from_fork+0x35/0x40 This patch adds one extra condition to check left space in bio while trying merging page to bio, to avoid panic. This bug was reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204043 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 10 ++++++++++ fs/f2fs/super.c | 2 +- include/linux/f2fs_fs.h | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0686306ed988..5bce20005add 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -503,6 +503,16 @@ static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, block_t last_blkaddr, block_t cur_blkaddr) { + if (F2FS_IO_ALIGNED(sbi) && (fio->type == DATA || fio->type == NODE)) { + unsigned int filled_blocks = + F2FS_BYTES_TO_BLK(bio->bi_iter.bi_size); + unsigned int io_size = F2FS_IO_SIZE(sbi); + unsigned int left_vecs = bio->bi_max_vecs - bio->bi_vcnt; + + /* IOs in bio is aligned and left space of vectors is not enough */ + if (!(filled_blocks % io_size) && left_vecs < io_size) + return false; + } if (!page_is_mergeable(sbi, bio, last_blkaddr, cur_blkaddr)) return false; return io_type_is_mergeable(io, fio); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 78a1b873e48a..720f2e6d6f0a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3202,7 +3202,7 @@ try_onemore: if (err) goto free_bio_info; - if (F2FS_IO_SIZE(sbi) > 1) { + if (F2FS_IO_ALIGNED(sbi)) { sbi->write_io_dummy = mempool_create_page_pool(2 * 
(F2FS_IO_SIZE(sbi) - 1), 0); if (!sbi->write_io_dummy) { diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 65559900d4d7..52af9ac164b4 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -41,6 +41,7 @@ #define F2FS_IO_SIZE_BYTES(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 12)) /* B */ #define F2FS_IO_SIZE_BITS(sbi) (F2FS_OPTION(sbi).write_io_size_bits) /* power of 2 */ #define F2FS_IO_SIZE_MASK(sbi) (F2FS_IO_SIZE(sbi) - 1) +#define F2FS_IO_ALIGNED(sbi) (F2FS_IO_SIZE(sbi) > 1) /* This flag is used by node and meta inodes, and by recovery */ #define GFP_F2FS_ZERO (GFP_NOFS | __GFP_ZERO) From 1f78adfab379e53b0bd725b869061d9ba8055943 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 12 Jul 2019 16:57:00 +0800 Subject: [PATCH 04/51] f2fs: disallow switching io_bits option during remount If IO alignment feature is turned on after remount, we didn't initialize mempool of it, it turns out we will encounter panic during IO submission due to access NULL mempool pointer. This feature should be set only at mount time, so simply deny configuring during remount. 
This fixes bug reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204135 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 720f2e6d6f0a..3ae29d8fb18c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1522,6 +1522,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); + bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool checkpoint_changed; #ifdef CONFIG_QUOTA int i, j; @@ -1601,6 +1602,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) { + err = -EINVAL; + f2fs_warn(sbi, "switch io_bits option is not allowed"); + goto restore_opts; + } + if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { err = -EINVAL; f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); From a8933b6b68f775b5774e7b075447fae13f4d01fe Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 18 Jul 2019 16:39:59 +0800 Subject: [PATCH 05/51] f2fs: fix to drop meta/node pages during umount As reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204193 A null pointer dereference bug is triggered in f2fs under kernel-5.1.3. 
kasan_report.cold+0x5/0x32 f2fs_write_end_io+0x215/0x650 bio_endio+0x26e/0x320 blk_update_request+0x209/0x5d0 blk_mq_end_request+0x2e/0x230 lo_complete_rq+0x12c/0x190 blk_done_softirq+0x14a/0x1a0 __do_softirq+0x119/0x3e5 irq_exit+0x94/0xe0 call_function_single_interrupt+0xf/0x20 During umount, we will access NULL sbi->node_inode pointer in f2fs_write_end_io(): f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && page->index != nid_of_node(page)); The reason is if disable_checkpoint mount option is on, meta dirty pages can remain during umount, and then be flushed by iput() of meta_inode, however node_inode has been iput()ed before meta_inode's iput(). Since checkpoint is disabled, all meta/node datas are useless and should be dropped in next mount, so in umount, let's adjust drop_inode() to give a hint to iput_final() to drop all those dirty datas correctly. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3ae29d8fb18c..2d2c91c7eadd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -873,7 +873,21 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) static int f2fs_drop_inode(struct inode *inode) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int ret; + + /* + * during filesystem shutdown, if checkpoint is disabled, + * drop useless meta/node dirty pages. + */ + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) { + trace_f2fs_drop_inode(inode, 1); + return 1; + } + } + /* * This is to avoid a deadlock condition like below. 
* writeback_single_inode(inode) From 0f1898f93cdcb9275b7ab9c9931c5c21a8fd3d61 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 19 Jul 2019 11:51:11 +0800 Subject: [PATCH 06/51] f2fs: fix to avoid tagging SBI_QUOTA_NEED_REPAIR incorrectly On a quota disabled image, with fault injection, SBI_QUOTA_NEED_REPAIR will be set incorrectly in error path of f2fs_evict_inode(), fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index a33d7a849b2d..d1998ddf14fd 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -693,7 +693,8 @@ retry: if (err) { f2fs_update_inode_page(inode); - set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + if (dquot_initialize_needed(inode)) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); } sb_end_intwrite(inode->i_sb); no_delete: From 04f9287ab395a5a279db44fb39de69b23640abb9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 19 Jul 2019 15:18:44 +0800 Subject: [PATCH 07/51] f2fs: fix to avoid discard command leak ============================================================================= BUG discard_cmd (Tainted: G B OE ): Objects remaining in discard_cmd on __kmem_cache_shutdown() ----------------------------------------------------------------------------- INFO: Slab 0xffffe1ac481d22c0 objects=36 used=2 fp=0xffff936b4748bf50 flags=0x2ffff0000000100 Call Trace: dump_stack+0x63/0x87 slab_err+0xa1/0xb0 __kmem_cache_shutdown+0x183/0x390 shutdown_cache+0x14/0x110 kmem_cache_destroy+0x195/0x1c0 f2fs_destroy_segment_manager_caches+0x21/0x40 [f2fs] exit_f2fs_fs+0x35/0x641 [f2fs] SyS_delete_module+0x155/0x230 ? 
vtime_user_exit+0x29/0x70 do_syscall_64+0x6e/0x160 entry_SYSCALL64_slow_path+0x25/0x25 INFO: Object 0xffff936b4748b000 @offset=0 INFO: Object 0xffff936b4748b070 @offset=112 kmem_cache_destroy discard_cmd: Slab cache still has objects Call Trace: dump_stack+0x63/0x87 kmem_cache_destroy+0x1b4/0x1c0 f2fs_destroy_segment_manager_caches+0x21/0x40 [f2fs] exit_f2fs_fs+0x35/0x641 [f2fs] SyS_delete_module+0x155/0x230 do_syscall_64+0x6e/0x160 entry_SYSCALL64_slow_path+0x25/0x25 Recovery can cache discard commands, so in error path of fill_super(), we need give a chance to handle them, otherwise it will lead to leak of discard_cmd slab cache. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a661ac32e829..a1ece0caad78 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2084,6 +2084,13 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) f2fs_stop_discard_thread(sbi); + /* + * Recovery can cache discard commands, so in error path of + * fill_super(), it needs to give a chance to handle them. + */ + if (unlikely(atomic_read(&dcc->discard_cmd_cnt))) + f2fs_issue_discard_timeout(sbi); + kvfree(dcc); SM_I(sbi)->dcc_info = NULL; } From 7975f3498dc0403d8177c0775b9514158ec66681 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 22 Jul 2019 18:03:50 +0800 Subject: [PATCH 08/51] f2fs: support fiemap() for directory inode Adjust f2fs_fiemap() to support fiemap() on directory inode. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/inline.c | 8 +++++++- fs/f2fs/namei.c | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5bce20005add..73ed4ff9d01c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1539,7 +1539,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, goto out; } - if (f2fs_has_inline_data(inode)) { + if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) { ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len); if (ret != -EAGAIN) goto out; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3613efca8c00..8c0712154fb1 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -704,7 +704,13 @@ int f2fs_inline_data_fiemap(struct inode *inode, if (IS_ERR(ipage)) return PTR_ERR(ipage); - if (!f2fs_has_inline_data(inode)) { + if ((S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !f2fs_has_inline_data(inode)) { + err = -EAGAIN; + goto out; + } + + if (S_ISDIR(inode->i_mode) && !f2fs_has_inline_dentry(inode)) { err = -EAGAIN; goto out; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index c5b99042e6f2..612561c4f7bd 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1250,6 +1250,7 @@ const struct inode_operations f2fs_dir_inode_operations = { #ifdef CONFIG_F2FS_FS_XATTR .listxattr = f2fs_listxattr, #endif + .fiemap = f2fs_fiemap, }; const struct inode_operations f2fs_symlink_inode_operations = { From 955ebcd3a910b00de94be8797b20b8cfb2ee0fd8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 22 Jul 2019 17:57:05 +0800 Subject: [PATCH 09/51] f2fs: fix to spread f2fs_is_checkpoint_ready() We missed to call f2fs_is_checkpoint_ready() in several places, it may allow space allocation even when free space was exhausted during checkpoint is disabled, fix to add them. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 11 +++++++++++ fs/f2fs/namei.c | 4 ++++ fs/f2fs/xattr.c | 5 +++++ 3 files changed, 20 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3e58a6f697dd..1e27b4bc852d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -56,6 +56,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) err = -EIO; goto err; } + err = f2fs_is_checkpoint_ready(sbi); + if (err) + goto err; sb_start_pagefault(inode->i_sb); @@ -1567,6 +1570,9 @@ static long f2fs_fallocate(struct file *file, int mode, if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; + ret = f2fs_is_checkpoint_ready(F2FS_I_SB(inode)); + if (ret) + return ret; /* f2fs only support ->fallocate for regular file */ if (!S_ISREG(inode->i_mode)) @@ -3062,8 +3068,13 @@ static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg) long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + int ret; + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) return -EIO; + ret = f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(filp))); + if (ret) + return ret; switch (cmd) { case F2FS_IOC_GETFLAGS: diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 612561c4f7bd..7560c7ed38b1 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -801,9 +801,13 @@ out: static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + int ret; if (unlikely(f2fs_cp_error(sbi))) return -EIO; + ret = f2fs_is_checkpoint_ready(sbi); + if (ret) + return ret; if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) { int err = fscrypt_get_encryption_info(dir); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index b32c45621679..3c92f4122044 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -21,6 +21,7 @@ #include #include "f2fs.h" #include "xattr.h" +#include "segment.h" static int f2fs_xattr_generic_get(const struct xattr_handler *handler, struct dentry *unused, struct inode 
*inode, @@ -729,6 +730,10 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; + err = f2fs_is_checkpoint_ready(sbi); + if (err) + return err; + err = dquot_initialize(inode); if (err) return err; From a25c2cdcb61ab3d8d99623a9e72cf1747979a1f9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 22 Jul 2019 17:57:06 +0800 Subject: [PATCH 10/51] f2fs: fix to detect cp error in f2fs_setxattr() It needs to return -EIO if filesystem has been shutdown, fix the miss case in f2fs_setxattr(). Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3c92f4122044..f85c810e33ca 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -730,6 +730,8 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; err = f2fs_is_checkpoint_ready(sbi); if (err) return err; From fe973b065bce0e61414c33251afae501a757f8c6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 25 Jul 2019 17:33:37 +0800 Subject: [PATCH 11/51] f2fs: fix to handle quota_{on,off} correctly With quota_ino feature on, generic/232 reports an inconsistence issue on the image. The root cause is that the testcase tries to: - use quotactl to shutdown journalled quota based on sysfile; - and then use quotactl to enable/turn on quota based on specific file (aquota.user or aquota.group). Eventually, quota sysfile will be out-of-update due to following specific file creation. Change as below to fix this issue: - deny enabling quota based on specific file if quota sysfile exists. - set SBI_QUOTA_NEED_REPAIR once sysfile based quota shutdowns via ioctl. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2d2c91c7eadd..9167deb0c417 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2000,6 +2000,12 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, struct inode *inode; int err; + /* if quota sysfile exists, deny enabling quota with specific file */ + if (f2fs_sb_has_quota_ino(F2FS_SB(sb))) { + f2fs_err(F2FS_SB(sb), "quota sysfile already exists"); + return -EBUSY; + } + err = f2fs_quota_sync(sb, type); if (err) return err; @@ -2019,7 +2025,7 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, return 0; } -static int f2fs_quota_off(struct super_block *sb, int type) +static int __f2fs_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; int err; @@ -2045,13 +2051,30 @@ out_put: return err; } +static int f2fs_quota_off(struct super_block *sb, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err; + + err = __f2fs_quota_off(sb, type); + + /* + * quotactl can shutdown journalled quota, result in inconsistence + * between quota record and fs data by following updates, tag the + * flag to let fsck be aware of it. 
 + */ + if (is_journalled_quota(sbi)) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + return err; +} + void f2fs_quota_off_umount(struct super_block *sb) { int type; int err; for (type = 0; type < MAXQUOTAS; type++) { - err = f2fs_quota_off(sb, type); + err = __f2fs_quota_off(sb, type); if (err) { int ret = dquot_quota_off(sb, type); From 038d06984f5c50a101c1cf47bc6419064042716d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 25 Jul 2019 22:39:11 +0800 Subject: [PATCH 12/51] f2fs: disallow direct IO in atomic write Atomic write needs page cache to cache data of transaction, direct IO should never be allowed in atomic write, detect and deny it when open atomic write file. Signed-off-by: Gao Xiang Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1e27b4bc852d..a5080bea6b0b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1825,6 +1825,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!S_ISREG(inode->i_mode)) return -EINVAL; + if (filp->f_flags & O_DIRECT) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; From 280fd422958187ac5f069c08d84dd65f7f87c2e6 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Fri, 26 Jul 2019 11:45:12 +0800 Subject: [PATCH 13/51] fs: f2fs: Remove unnecessary checks of SM_I(sbi) in update_general_status() In fill_super() and put_super(), f2fs_destroy_stats() is called prior to f2fs_destroy_segment_manager(), so if current sbi can still be visited in global stat list, SM_I(sbi) should not have been released yet. For this reason, SM_I(sbi) does not need to be checked in update_general_status(). Thanks to Chao Yu for advice.
 Signed-off-by: Jia-Ju Bai Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 7706049d23bf..9b0bedd82581 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -67,7 +67,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->nr_rd_data = get_pages(sbi, F2FS_RD_DATA); si->nr_rd_node = get_pages(sbi, F2FS_RD_NODE); si->nr_rd_meta = get_pages(sbi, F2FS_RD_META); - if (SM_I(sbi) && SM_I(sbi)->fcc_info) { + if (SM_I(sbi)->fcc_info) { si->nr_flushed = atomic_read(&SM_I(sbi)->fcc_info->issued_flush); si->nr_flushing = @@ -75,7 +75,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->flush_list_empty = llist_empty(&SM_I(sbi)->fcc_info->issue_list); } - if (SM_I(sbi) && SM_I(sbi)->dcc_info) { + if (SM_I(sbi)->dcc_info) { si->nr_discarded = atomic_read(&SM_I(sbi)->dcc_info->issued_discard); si->nr_discarding = From 0921835c9544c1dc3d2093df6932f1b207eec487 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 26 Jul 2019 15:43:17 +0800 Subject: [PATCH 14/51] f2fs: fix to avoid call kvfree under spinlock vfree() doesn't want to be called from interrupt context, so move it out of spin_lock_irqsave() coverage.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 17382da7f0bd..4fa9a618b31a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1640,6 +1640,7 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) { unsigned long flags; + unsigned char *nat_bits; /* * In order to re-enable nat_bits we need to call fsck.f2fs by @@ -1650,10 +1651,12 @@ static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) if (lock) spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); - kvfree(NM_I(sbi)->nat_bits); + nat_bits = NM_I(sbi)->nat_bits; NM_I(sbi)->nat_bits = NULL; if (lock) spin_unlock_irqrestore(&sbi->cp_lock, flags); + + kvfree(nat_bits); } static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, From 71e90b4654a9298f9e2375cc733d57b8bf92ce73 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 23 Jul 2019 16:05:27 -0700 Subject: [PATCH 15/51] fs: Reserve flag for casefolding In preparation for including the casefold feature within f2fs, elevate the EXT4_CASEFOLD_FL flag to FS_CASEFOLD_FL. 
 Signed-off-by: Daniel Rosenberg Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/uapi/linux/fs.h | 1 + tools/include/uapi/linux/fs.h | 1 + 2 files changed, 2 insertions(+) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 59c71fa8c553..2a616aa3f686 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -311,6 +311,7 @@ struct fscrypt_key { #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ #define FS_INLINE_DATA_FL 0x10000000 /* Reserved for ext4 */ #define FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define FS_CASEFOLD_FL 0x40000000 /* Folder is case insensitive */ #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ #define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index 59c71fa8c553..2a616aa3f686 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -311,6 +311,7 @@ struct fscrypt_key { #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ #define FS_INLINE_DATA_FL 0x10000000 /* Reserved for ext4 */ #define FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define FS_CASEFOLD_FL 0x40000000 /* Folder is case insensitive */ #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ #define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ From 5aba54302a46fdd589040b928d5d010e5ace1234 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 23 Jul 2019 16:05:28 -0700 Subject: [PATCH 16/51] f2fs: include charset encoding information in the superblock Add charset encoding to f2fs to support casefolding. It is modeled after the same feature introduced in commit c83ad55eaa91 ("ext4: include charset encoding information in the superblock") Currently this is not compatible with encryption, similar to the current ext4 implementation. This will change in the future.
>From the ext4 patch: """ The s_encoding field stores a magic number indicating the encoding format and version used globally by file and directory names in the filesystem. The s_encoding_flags defines policies for using the charset encoding, like how to handle invalid sequences. The magic number is mapped to the exact charset table, but the mapping is specific to ext4. Since we don't have any commitment to support old encodings, the only encoding I am supporting right now is utf8-12.1.0. The current implementation prevents the user from enabling encoding and per-directory encryption on the same filesystem at the same time. The incompatibility between these features lies in how we do efficient directory searches when we cannot be sure the encryption of the user provided fname will match the actual hash stored in the disk without decrypting every directory entry, because of normalization cases. My quickest solution is to simply block the concurrent use of these features for now, and enable it later, once we have a better solution. """ Signed-off-by: Daniel Rosenberg Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 7 ++ Documentation/filesystems/f2fs.txt | 3 + fs/f2fs/f2fs.h | 6 ++ fs/f2fs/super.c | 95 +++++++++++++++++++++++++ fs/f2fs/sysfs.c | 23 ++++++ include/linux/f2fs_fs.h | 9 ++- 6 files changed, 142 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index dca326e0ee3e..7ab2b1b5e255 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -251,3 +251,10 @@ Description: If checkpoint=disable, it displays the number of blocks that are unusable. If checkpoint=enable it displays the enumber of blocks that would be unusable if checkpoint=disable were to be set. 
+ +What: /sys/fs/f2fs//encoding +Date July 2019 +Contact: "Daniel Rosenberg" +Description: + Displays name and version of the encoding set for the filesystem. + If no encoding is set, displays (none) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 496fa28b2492..5fa38ab373ca 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -413,6 +413,9 @@ Files in /sys/fs/f2fs/ that would be unusable if checkpoint=disable were to be set. +encoding This shows the encoding used for casefolding. + If casefolding is not enabled, returns (none) + ================================================================================ USAGE ================================================================================ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4fa9a618b31a..dd69e1559839 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -153,6 +153,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_LOST_FOUND 0x0200 #define F2FS_FEATURE_VERITY 0x0400 /* reserved */ #define F2FS_FEATURE_SB_CHKSUM 0x0800 +#define F2FS_FEATURE_CASEFOLD 0x1000 #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) @@ -1169,6 +1170,10 @@ struct f2fs_sb_info { int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ struct mutex writepages; /* mutex for writepages() */ +#ifdef CONFIG_UNICODE + struct unicode_map *s_encoding; + __u16 s_encoding_flags; +#endif #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ @@ -3565,6 +3570,7 @@ F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); +F2FS_FEATURE_FUNCS(casefold, CASEFOLD); #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 
9167deb0c417..8bebee8e0186 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -222,6 +223,36 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...) va_end(args); } +#ifdef CONFIG_UNICODE +static const struct f2fs_sb_encodings { + __u16 magic; + char *name; + char *version; +} f2fs_sb_encoding_map[] = { + {F2FS_ENC_UTF8_12_1, "utf8", "12.1.0"}, +}; + +static int f2fs_sb_read_encoding(const struct f2fs_super_block *sb, + const struct f2fs_sb_encodings **encoding, + __u16 *flags) +{ + __u16 magic = le16_to_cpu(sb->s_encoding); + int i; + + for (i = 0; i < ARRAY_SIZE(f2fs_sb_encoding_map); i++) + if (magic == f2fs_sb_encoding_map[i].magic) + break; + + if (i >= ARRAY_SIZE(f2fs_sb_encoding_map)) + return -EINVAL; + + *encoding = &f2fs_sb_encoding_map[i]; + *flags = le16_to_cpu(sb->s_encoding_flags); + + return 0; +} +#endif + static inline void limit_reserve_root(struct f2fs_sb_info *sbi) { block_t limit = min((sbi->user_block_count << 1) / 1000, @@ -798,6 +829,13 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } #endif +#ifndef CONFIG_UNICODE + if (f2fs_sb_has_casefold(sbi)) { + f2fs_err(sbi, + "Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE"); + return -EINVAL; + } +#endif if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO", @@ -1103,6 +1141,9 @@ static void f2fs_put_super(struct super_block *sb) destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); +#ifdef CONFIG_UNICODE + utf8_unload(sbi->s_encoding); +#endif kvfree(sbi); } @@ -3075,6 +3116,52 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) return 0; } +static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) +{ +#ifdef CONFIG_UNICODE + if (f2fs_sb_has_casefold(sbi) && !sbi->s_encoding) { + const struct f2fs_sb_encodings *encoding_info; + struct 
unicode_map *encoding; + __u16 encoding_flags; + + if (f2fs_sb_has_encrypt(sbi)) { + f2fs_err(sbi, + "Can't mount with encoding and encryption"); + return -EINVAL; + } + + if (f2fs_sb_read_encoding(sbi->raw_super, &encoding_info, + &encoding_flags)) { + f2fs_err(sbi, + "Encoding requested by superblock is unknown"); + return -EINVAL; + } + + encoding = utf8_load(encoding_info->version); + if (IS_ERR(encoding)) { + f2fs_err(sbi, + "can't mount with superblock charset: %s-%s " + "not supported by the kernel. flags: 0x%x.", + encoding_info->name, encoding_info->version, + encoding_flags); + return PTR_ERR(encoding); + } + f2fs_info(sbi, "Using encoding defined by superblock: " + "%s-%s with flags 0x%hx", encoding_info->name, + encoding_info->version?:"\b", encoding_flags); + + sbi->s_encoding = encoding; + sbi->s_encoding_flags = encoding_flags; + } +#else + if (f2fs_sb_has_casefold(sbi)) { + f2fs_err(sbi, "Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE"); + return -EINVAL; + } +#endif + return 0; +} + static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) { struct f2fs_sm_info *sm_i = SM_I(sbi); @@ -3171,6 +3258,10 @@ try_onemore: le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; + err = f2fs_setup_casefold(sbi); + if (err) + goto free_options; + #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; sb->s_qcop = &f2fs_quotactl_ops; @@ -3521,6 +3612,10 @@ free_percpu: free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); + +#ifdef CONFIG_UNICODE + utf8_unload(sbi->s_encoding); +#endif free_options: #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 3aeacd0aacfd..f9fcca695db9 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -81,6 +82,19 @@ static ssize_t unusable_show(struct f2fs_attr *a, (unsigned long long)unusable); } +static ssize_t 
encoding_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ +#ifdef CONFIG_UNICODE + if (f2fs_sb_has_casefold(sbi)) + return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n", + sbi->s_encoding->charset, + (sbi->s_encoding->version >> 16) & 0xff, + (sbi->s_encoding->version >> 8) & 0xff, + sbi->s_encoding->version & 0xff); +#endif + return snprintf(buf, PAGE_SIZE, "(none)"); +} static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) @@ -134,6 +148,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_sb_chksum(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "sb_checksum"); + if (f2fs_sb_has_casefold(sbi)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "casefold"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -365,6 +382,7 @@ enum feat_id { FEAT_INODE_CRTIME, FEAT_LOST_FOUND, FEAT_SB_CHECKSUM, + FEAT_CASEFOLD, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -382,6 +400,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_INODE_CRTIME: case FEAT_LOST_FOUND: case FEAT_SB_CHECKSUM: + case FEAT_CASEFOLD: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -455,6 +474,7 @@ F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); F2FS_GENERAL_RO_ATTR(current_reserved_blocks); F2FS_GENERAL_RO_ATTR(unusable); +F2FS_GENERAL_RO_ATTR(encoding); #ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); @@ -471,6 +491,7 @@ F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND); F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); +F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -515,6 +536,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(features), 
ATTR_LIST(reserved_blocks), ATTR_LIST(current_reserved_blocks), + ATTR_LIST(encoding), NULL, }; ATTRIBUTE_GROUPS(f2fs); @@ -535,6 +557,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(inode_crtime), ATTR_LIST(lost_found), ATTR_LIST(sb_checksum), + ATTR_LIST(casefold), NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 52af9ac164b4..284738996028 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -36,6 +36,11 @@ #define F2FS_MAX_QUOTAS 3 +#define F2FS_ENC_UTF8_12_1 1 +#define F2FS_ENC_STRICT_MODE_FL (1 << 0) +#define f2fs_has_strict_mode(sbi) \ + (sbi->s_encoding_flags & F2FS_ENC_STRICT_MODE_FL) + #define F2FS_IO_SIZE(sbi) (1 << F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */ #define F2FS_IO_SIZE_KB(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 2)) /* KB */ #define F2FS_IO_SIZE_BYTES(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 12)) /* B */ @@ -110,7 +115,9 @@ struct f2fs_super_block { struct f2fs_device devs[MAX_DEVICES]; /* device list */ __le32 qf_ino[F2FS_MAX_QUOTAS]; /* quota inode numbers */ __u8 hot_ext_count; /* # of hot file extension */ - __u8 reserved[310]; /* valid reserved region */ + __le16 s_encoding; /* Filename charset encoding */ + __le16 s_encoding_flags; /* Filename charset encoding flags */ + __u8 reserved[306]; /* valid reserved region */ __le32 crc; /* checksum of superblock */ } __packed; From 2c2eb7a300cd7c6945dafb077801dca95d7a6c25 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 23 Jul 2019 16:05:29 -0700 Subject: [PATCH 17/51] f2fs: Support case-insensitive file name lookups Modeled after commit b886ee3e778e ("ext4: Support case-insensitive file name lookups") """ This patch implements the actual support for case-insensitive file name lookups in f2fs, based on the feature bit and the encoding stored in the superblock. 
A filesystem that has the casefold feature set is able to configure directories with the +F (F2FS_CASEFOLD_FL) attribute, enabling lookups to succeed in that directory in a case-insensitive fashion, i.e., match a directory entry even if the name used by userspace is not a byte per byte match with the disk name, but is an equivalent case-insensitive version of the Unicode string. This operation is called a case-insensitive file name lookup. The feature is configured as an inode attribute applied to directories and inherited by their children. This attribute can only be enabled on empty directories for filesystems that support the encoding feature, thus preventing collision of file names that only differ by case. * dcache handling: For a +F directory, F2FS only stores the first equivalent name dentry used in the dcache. This is done to prevent unintentional duplication of dentries in the dcache, while also allowing the VFS code to quickly find the right entry in the cache regardless of which equivalent string was used in a previous lookup, without having to resort to ->lookup(). d_hash() of casefolded directories is implemented as the hash of the casefolded string, such that we always have a well-known bucket for all the equivalencies of the same string. d_compare() uses the utf8_strncasecmp() infrastructure, which handles the comparison of equivalent, same case, names as well. For now, negative lookups are not inserted in the dcache, since they would need to be invalidated anyway, because we can't trust missing file dentries. This is bad for performance but requires some leveraging of the vfs layer to fix. We can live without that for now, and so does everyone else. * on-disk data: Despite using a specific version of the name as the internal representation within the dcache, the name stored and fetched from the disk is a byte-per-byte match with what the user requested, making this implementation 'name-preserving', i.e. no actual information is lost when writing to storage. 
DX is supported by modifying the hashes used in +F directories to make them case/encoding-aware. The new disk hashes are calculated as the hash of the full casefolded string, instead of the string directly. This allows us to efficiently search for file names in the htree without requiring the user to provide an exact name. * Dealing with invalid sequences: By default, when an invalid UTF-8 sequence is identified, f2fs will treat it as an opaque byte sequence, ignoring the encoding and reverting to the old behavior for that unique file. This means that case-insensitive file name lookup will not work only for that file. An optional bit can be set in the superblock telling the filesystem code and userspace tools to enforce the encoding. When that optional bit is set, any attempt to create a file name using an invalid UTF-8 sequence will fail and return an error to userspace. * Normalization algorithm: The UTF-8 algorithms used to compare strings in f2fs are implemented in fs/unicode, and are based on a previous version developed by SGI. It implements the Canonical decomposition (NFD) algorithm described by the Unicode specification 12.1, or higher, combined with the elimination of ignorable code points (NFDi) and full case-folding (CF) as documented in fs/unicode/utf8_norm.c. NFD seems to be the best normalization method for F2FS because: - It has a lower cost than NFC/NFKC (which requires decomposing to NFD as an intermediary step) - It doesn't eliminate important semantic meaning like compatibility decompositions. Although: - This implementation is not completely linguistically accurate, because different languages have conflicting rules, which would require the specialization of the filesystem to a given locale, which brings all sorts of problems for removable media and for users who use more than one language. 
""" Signed-off-by: Daniel Rosenberg Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 125 +++++++++++++++++++++++++++++++++++++++++++---- fs/f2fs/f2fs.h | 18 +++++-- fs/f2fs/file.c | 14 +++++- fs/f2fs/hash.c | 37 +++++++++++++- fs/f2fs/inline.c | 4 +- fs/f2fs/inode.c | 4 +- fs/f2fs/namei.c | 21 ++++++++ fs/f2fs/super.c | 1 + 8 files changed, 204 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 85a1528f319f..dac07d17cdbd 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" #include "acl.h" @@ -81,7 +82,8 @@ static unsigned long dir_block_index(unsigned int level, return bidx; } -static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, +static struct f2fs_dir_entry *find_in_block(struct inode *dir, + struct page *dentry_page, struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, @@ -93,7 +95,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); - make_dentry_ptr_block(NULL, &d, dentry_blk); + make_dentry_ptr_block(dir, &d, dentry_blk); de = f2fs_find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; @@ -101,6 +103,39 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, return de; } +#ifdef CONFIG_UNICODE +/* + * Test whether a case-insensitive directory entry matches the filename + * being searched for. + * + * Returns: 0 if the directory entry matches, more than 0 if it + * doesn't match or less than zero on error. 
+ */ +int f2fs_ci_compare(const struct inode *parent, const struct qstr *name, + const struct qstr *entry) +{ + const struct f2fs_sb_info *sbi = F2FS_SB(parent->i_sb); + const struct unicode_map *um = sbi->s_encoding; + int ret; + + ret = utf8_strncasecmp(um, name, entry); + if (ret < 0) { + /* Handle invalid character sequence as either an error + * or as an opaque byte sequence. + */ + if (f2fs_has_strict_mode(sbi)) + return -EINVAL; + + if (name->len != entry->len) + return 1; + + return !!memcmp(name->name, entry->name, name->len); + } + + return ret; +} +#endif + struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct f2fs_dentry_ptr *d) @@ -108,6 +143,9 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; +#ifdef CONFIG_UNICODE + struct qstr entry; +#endif if (max_slots) *max_slots = 0; @@ -119,16 +157,28 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, } de = &d->dentry[bit_pos]; +#ifdef CONFIG_UNICODE + entry.name = d->filename[bit_pos]; + entry.len = de->name_len; +#endif if (unlikely(!de->name_len)) { bit_pos++; continue; } + if (de->hash_code == namehash) { +#ifdef CONFIG_UNICODE + if (F2FS_SB(d->inode->i_sb)->s_encoding && + IS_CASEFOLDED(d->inode) && + !f2fs_ci_compare(d->inode, + fname->usr_fname, &entry)) + goto found; - if (de->hash_code == namehash && - fscrypt_match_name(fname, d->filename[bit_pos], - le16_to_cpu(de->name_len))) - goto found; +#endif + if (fscrypt_match_name(fname, d->filename[bit_pos], + le16_to_cpu(de->name_len))) + goto found; + } if (max_slots && max_len > *max_slots) *max_slots = max_len; @@ -157,7 +207,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, struct f2fs_dir_entry *de = NULL; bool room = false; int max_slots; - f2fs_hash_t namehash = f2fs_dentry_hash(&name, fname); + f2fs_hash_t namehash = f2fs_dentry_hash(dir, 
&name, fname); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -179,8 +229,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, } } - de = find_in_block(dentry_page, fname, namehash, &max_slots, - res_page); + de = find_in_block(dir, dentry_page, fname, namehash, + &max_slots, res_page); if (de) break; @@ -250,6 +300,14 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, struct fscrypt_name fname; int err; +#ifdef CONFIG_UNICODE + if (f2fs_has_strict_mode(F2FS_I_SB(dir)) && IS_CASEFOLDED(dir) && + utf8_validate(F2FS_I_SB(dir)->s_encoding, child)) { + *res_page = ERR_PTR(-EINVAL); + return NULL; + } +#endif + err = fscrypt_setup_filename(dir, child, 1, &fname); if (err) { if (err == -ENOENT) @@ -504,7 +562,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, level = 0; slots = GET_DENTRY_SLOTS(new_name->len); - dentry_hash = f2fs_dentry_hash(new_name, NULL); + dentry_hash = f2fs_dentry_hash(dir, new_name, NULL); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -943,3 +1001,50 @@ const struct file_operations f2fs_dir_operations = { .compat_ioctl = f2fs_compat_ioctl, #endif }; + +#ifdef CONFIG_UNICODE +static int f2fs_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) +{ + struct qstr qstr = {.name = str, .len = len }; + + if (!IS_CASEFOLDED(dentry->d_parent->d_inode)) { + if (len != name->len) + return -1; + return memcmp(str, name, len); + } + + return f2fs_ci_compare(dentry->d_parent->d_inode, name, &qstr); +} + +static int f2fs_d_hash(const struct dentry *dentry, struct qstr *str) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + const struct unicode_map *um = sbi->s_encoding; + unsigned char *norm; + int len, ret = 0; + + if (!IS_CASEFOLDED(dentry->d_inode)) + return 0; + + norm = f2fs_kmalloc(sbi, PATH_MAX, GFP_ATOMIC); + if (!norm) + return -ENOMEM; + + len = 
utf8_casefold(um, str, norm, PATH_MAX); + if (len < 0) { + if (f2fs_has_strict_mode(sbi)) + ret = -EINVAL; + goto out; + } + str->hash = full_name_hash(dentry, norm, len); +out: + kvfree(norm); + return ret; +} + +const struct dentry_operations f2fs_dentry_ops = { + .d_hash = f2fs_d_hash, + .d_compare = f2fs_d_compare, +}; +#endif diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dd69e1559839..3f57ab6c9137 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2367,13 +2367,16 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) #define F2FS_INDEX_FL 0x00001000 /* hash-indexed directory */ #define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define F2FS_CASEFOLD_FL 0x40000000 /* Casefolded file */ /* Flags that should be inherited by new inodes from their parent. */ #define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ - F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL) + F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ + F2FS_CASEFOLD_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ -#define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL)) +#define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ + F2FS_CASEFOLD_FL)) /* Flags that are appropriate for non-directories/regular files. 
*/ #define F2FS_OTHER_FLMASK (F2FS_NODUMP_FL | F2FS_NOATIME_FL) @@ -2933,6 +2936,10 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); +extern int f2fs_ci_compare(const struct inode *parent, + const struct qstr *name, + const struct qstr *entry); + /* * dir.c */ @@ -2996,8 +3003,8 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, - struct fscrypt_name *fname); +f2fs_hash_t f2fs_dentry_hash(const struct inode *dir, + const struct qstr *name_info, struct fscrypt_name *fname); /* * node.c @@ -3440,6 +3447,9 @@ static inline void f2fs_destroy_root_stats(void) { } #endif extern const struct file_operations f2fs_dir_operations; +#ifdef CONFIG_UNICODE +extern const struct dentry_operations f2fs_dentry_ops; +#endif extern const struct file_operations f2fs_file_operations; extern const struct inode_operations f2fs_file_inode_operations; extern const struct address_space_operations f2fs_dblock_aops; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a5080bea6b0b..103636c52206 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1664,6 +1664,13 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) if (IS_NOQUOTA(inode)) return -EPERM; + if ((iflags ^ fi->i_flags) & F2FS_CASEFOLD_FL) { + if (!f2fs_sb_has_casefold(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + if (!f2fs_empty_dir(inode)) + return -ENOTEMPTY; + } + fi->i_flags = iflags | (fi->i_flags & ~mask); if (fi->i_flags & F2FS_PROJINHERIT_FL) @@ -1698,6 +1705,7 @@ static const struct { { F2FS_INDEX_FL, FS_INDEX_FL }, { F2FS_DIRSYNC_FL, FS_DIRSYNC_FL }, { F2FS_PROJINHERIT_FL, FS_PROJINHERIT_FL }, + { F2FS_CASEFOLD_FL, FS_CASEFOLD_FL }, }; #define F2FS_GETTABLE_FS_FL ( \ @@ -1711,7 +1719,8 @@ static const struct { FS_PROJINHERIT_FL | \ FS_ENCRYPT_FL | \ FS_INLINE_DATA_FL | \ - FS_NOCOW_FL) + FS_NOCOW_FL | \ + FS_CASEFOLD_FL) 
#define F2FS_SETTABLE_FS_FL ( \ FS_SYNC_FL | \ @@ -1720,7 +1729,8 @@ static const struct { FS_NODUMP_FL | \ FS_NOATIME_FL | \ FS_DIRSYNC_FL | \ - FS_PROJINHERIT_FL) + FS_PROJINHERIT_FL | \ + FS_CASEFOLD_FL) /* Convert f2fs on-disk i_flags to FS_IOC_{GET,SET}FLAGS flags */ static inline u32 f2fs_iflags_to_fsflags(u32 iflags) diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index cc82f142f811..5bc4dcd8fc03 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "f2fs.h" @@ -67,7 +68,7 @@ static void str2hashbuf(const unsigned char *msg, size_t len, *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, +static f2fs_hash_t __f2fs_dentry_hash(const struct qstr *name_info, struct fscrypt_name *fname) { __u32 hash; @@ -103,3 +104,37 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT); return f2fs_hash; } + +f2fs_hash_t f2fs_dentry_hash(const struct inode *dir, + const struct qstr *name_info, struct fscrypt_name *fname) +{ +#ifdef CONFIG_UNICODE + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + const struct unicode_map *um = sbi->s_encoding; + int r, dlen; + unsigned char *buff; + struct qstr folded; + + if (!name_info->len || !IS_CASEFOLDED(dir)) + goto opaque_seq; + + buff = f2fs_kzalloc(sbi, sizeof(char) * PATH_MAX, GFP_KERNEL); + if (!buff) + return -ENOMEM; + + dlen = utf8_casefold(um, name_info, buff, PATH_MAX); + if (dlen < 0) { + kvfree(buff); + goto opaque_seq; + } + folded.name = buff; + folded.len = dlen; + r = __f2fs_dentry_hash(&folded, fname); + + kvfree(buff); + return r; + +opaque_seq: +#endif + return __f2fs_dentry_hash(name_info, fname); +} diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 8c0712154fb1..78d6ebe165cd 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -320,7 +320,7 @@ struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, return NULL; } - namehash = f2fs_dentry_hash(&name, fname); + 
namehash = f2fs_dentry_hash(dir, &name, fname); inline_dentry = inline_data_addr(dir, ipage); @@ -580,7 +580,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_wait_on_page_writeback(ipage, NODE, true, true); - name_hash = f2fs_dentry_hash(new_name, NULL); + name_hash = f2fs_dentry_hash(dir, new_name, NULL); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d1998ddf14fd..5d78f2db7a67 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -46,9 +46,11 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; if (file_is_encrypt(inode)) new_fl |= S_ENCRYPTED; + if (flags & F2FS_CASEFOLD_FL) + new_fl |= S_CASEFOLD; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC| - S_ENCRYPTED); + S_ENCRYPTED|S_CASEFOLD); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 7560c7ed38b1..9a28c5d9b3e9 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -489,6 +489,17 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, goto out_iput; } out_splice: +#ifdef CONFIG_UNICODE + if (!inode && IS_CASEFOLDED(dir)) { + /* Eventually we want to call d_add_ci(dentry, NULL) + * for negative dentries in the encoding case as + * well. For now, prevent the negative dentry + * from being cached. + */ + trace_f2fs_lookup_end(dir, dentry, ino, err); + return NULL; + } +#endif new = d_splice_alias(inode, dentry); err = PTR_ERR_OR_ZERO(new); trace_f2fs_lookup_end(dir, dentry, ino, err); @@ -537,6 +548,16 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } f2fs_delete_entry(de, page, dir, inode); +#ifdef CONFIG_UNICODE + /* VFS negative dentries are incompatible with Encoding and + * Case-insensitiveness. 
Eventually we'll want avoid + * invalidating the dentries here, alongside with returning the + * negative dentries at f2fs_lookup(), when it is better + * supported by the VFS for the CI case. + */ + if (IS_CASEFOLDED(dir)) + d_invalidate(dentry); +#endif f2fs_unlock_op(sbi); if (IS_DIRSYNC(dir)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8bebee8e0186..3435c8bccf3e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3152,6 +3152,7 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) sbi->s_encoding = encoding; sbi->s_encoding_flags = encoding_flags; + sbi->sb->s_d_op = &f2fs_dentry_ops; } #else if (f2fs_sb_has_casefold(sbi)) { From 3ee0c5d3b4e8bb0990a800449509c087f270d4a6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 15 Aug 2019 19:45:34 +0800 Subject: [PATCH 18/51] f2fs: use wrapped IS_SWAPFILE() Just cleanup, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3f57ab6c9137..09ad4116d635 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3703,7 +3703,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, block_unaligned_IO(inode, iocb, iter)) return true; if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED) && - !(inode->i_flags & S_SWAPFILE)) + !IS_SWAPFILE(inode)) return true; return false; From fd114ab22dd14ded1783b7f9b0aaddc14c098fe0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 15 Aug 2019 19:45:36 +0800 Subject: [PATCH 19/51] f2fs: fix to use more generic EOPNOTSUPP EOPNOTSUPP is widely used as error number indicating operation is not supported in syscall, and ENOTSUPP was defined and only used for NFSv3 protocol, so use EOPNOTSUPP instead. 
Fixes: 0a2aa8fbb969 ("f2fs: refactor __exchange_data_block for speed up") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 103636c52206..2d392887b92d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1033,7 +1033,7 @@ next_dnode: if (test_opt(sbi, LFS)) { f2fs_put_dnode(&dn); - return -ENOTSUPP; + return -EOPNOTSUPP; } /* do not invalidate this block address */ From 33ac18a15c880d253565c08e8cec3ee4c8a76657 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 15 Aug 2019 19:45:35 +0800 Subject: [PATCH 20/51] f2fs: use wrapped f2fs_cp_error() Just cleanup, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 5d78f2db7a67..88af85e0db62 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -706,7 +706,7 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - if (likely(!is_set_ckpt_flags(sbi, CP_ERROR_FLAG) && + if (likely(!f2fs_cp_error(sbi) && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); else From d3a1a0e1bf5d4d2107d6b485c5d33e65acaa7cf6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 29 Jul 2019 23:02:29 +0800 Subject: [PATCH 21/51] f2fs: fix to migrate blocks correctly during defragment During defragment, we missed to trigger fragmented blocks migration for below condition: In defragment region: - total number of valid blocks is smaller than 512; - the tail part of the region are all holes; In addition, return zero to user via range->len if there are no fragmented blocks. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2d392887b92d..16f39c0c457d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2383,8 +2383,10 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, map.m_lblk += map.m_len; } - if (!fragmented) + if (!fragmented) { + total = 0; goto out; + } sec_num = DIV_ROUND_UP(total, BLKS_PER_SEC(sbi)); @@ -2414,7 +2416,7 @@ do_map: if (!(map.m_flags & F2FS_MAP_FLAGS)) { map.m_lblk = next_pgofs; - continue; + goto check; } set_inode_flag(inode, FI_DO_DEFRAG); @@ -2438,8 +2440,8 @@ do_map: } map.m_lblk = idx; - - if (idx < pg_end && cnt < blk_per_seg) +check: + if (map.m_lblk < pg_end && cnt < blk_per_seg) goto do_map; clear_inode_flag(inode, FI_DO_DEFRAG); From 0b86f78920919cf36239f4e187ebad7f5d78b5f8 Mon Sep 17 00:00:00 2001 From: Lihong Kou Date: Mon, 5 Aug 2019 15:27:24 +0800 Subject: [PATCH 22/51] f2fs: remove duplicate code in f2fs_file_write_iter We will do the same check in generic_write_checks. if (iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT) return -EINVAL; just remove the same check in f2fs_file_write_iter. 
Signed-off-by: Lihong Kou Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 16f39c0c457d..b1f38f2795bc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3160,11 +3160,6 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) { - ret = -EINVAL; - goto out; - } - if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; From 27cae0bcc0510ba009f01806b4af13940ad96f7d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 5 Aug 2019 18:27:25 +0800 Subject: [PATCH 23/51] f2fs: fix wrong available node count calculation In mkfs, we have counted quota file's node number in cp.valid_node_count, so we have to avoid wrong substraction of quota node number in .available_nid/.avail_node_count calculation. f2fs_write_check_point_pack() { .. set_cp(valid_node_count, 1 + c.quota_inum + c.lpf_inum); Fixes: 292c196a3695 ("f2fs: reserve nid resource for quota sysfile") Fixes: 7b63f72f73af ("f2fs: fix to do sanity check on valid node/block count") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- fs/f2fs/super.c | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a18b2a895771..d9ba1db2d01e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2964,7 +2964,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - - sbi->nquota_files - F2FS_RESERVED_NODE_NUM; + F2FS_RESERVED_NODE_NUM; nm_i->nid_cnt[FREE_NID] = 0; nm_i->nid_cnt[PREALLOC_NID] = 0; nm_i->nat_cnt = 0; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3435c8bccf3e..46d10a94721d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1269,8 +1269,7 @@ static int 
f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) else buf->f_bavail = 0; - avail_node_count = sbi->total_node_count - sbi->nquota_files - - F2FS_RESERVED_NODE_NUM; + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; if (avail_node_count > user_block_count) { buf->f_files = user_block_count; @@ -2700,8 +2699,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) } valid_node_count = le32_to_cpu(ckpt->valid_node_count); - avail_node_count = sbi->total_node_count - sbi->nquota_files - - F2FS_RESERVED_NODE_NUM; + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; if (valid_node_count > avail_node_count) { f2fs_err(sbi, "Wrong valid_node_count: %u, avail_node_count: %u", valid_node_count, avail_node_count); From 290c30d4454ccf85de372af499e32e86f5d94188 Mon Sep 17 00:00:00 2001 From: Lihong Kou Date: Mon, 5 Aug 2019 19:13:52 +0800 Subject: [PATCH 24/51] f2fs: cleanup the code in build_sit_entries. We do not need to set the SBI_NEED_FSCK flag in the error paths, if we return error here, we will not update the checkpoint flag, so the code is useless, just remove it. 
Signed-off-by: Lihong Kou Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a1ece0caad78..6aec63f0523b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4168,7 +4168,6 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (start >= MAIN_SEGS(sbi)) { f2fs_err(sbi, "Wrong journal entry on segno %u", start); - set_sbi_flag(sbi, SBI_NEED_FSCK); err = -EFSCORRUPTED; break; } @@ -4208,7 +4207,6 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (!err && total_node_blocks != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", total_node_blocks, valid_node_count(sbi)); - set_sbi_flag(sbi, SBI_NEED_FSCK); err = -EFSCORRUPTED; } From a37d0862d17411edb67677a580a6f505ec2225f6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 2 Aug 2019 18:15:48 +0800 Subject: [PATCH 25/51] Revert "f2fs: avoid out-of-range memory access" As Pavel Machek reported: "We normally use -EUCLEAN to signal filesystem corruption. Plus, it is good idea to report it to the syslog and mark filesystem as "needing fsck" if filesystem can do that." Still we need improve the original patch with: - use unlikely keyword - add message print - return EUCLEAN However, after rethink this patch, I don't think we should add such condition check here as below reasons: - We have already checked the field in f2fs_sanity_check_ckpt(), - If there is fs corrupt or security vulnerability, there is nothing to guarantee the field is integrated after the check, unless we do the check before each of its use, however no filesystem does that. - We only have similar check for bitmap, which was added due to there is bitmap corruption happened on f2fs' runtime in product. - There are so many key fields in SB/CP/NAT did have such check after f2fs_sanity_check_{sb,cp,..}. So I propose to revert this unneeded check. This reverts commit 56f3ce675103e3fb9e631cfb4131fc768bc23e9a. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6aec63f0523b..67e43b1c22e4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3454,11 +3454,6 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) seg_i = CURSEG_I(sbi, i); segno = le32_to_cpu(ckpt->cur_data_segno[i]); blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]); - if (blk_off > ENTRIES_IN_SUM) { - f2fs_bug_on(sbi, 1); - f2fs_put_page(page, 1); - return -EFAULT; - } seg_i->next_segno = segno; reset_curseg(sbi, i, 0); seg_i->alloc_type = ckpt->alloc_type[i]; From aabc172b986fd797065a61625c22a27a61f3f43d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 8 Aug 2019 10:02:53 +0800 Subject: [PATCH 26/51] f2fs: Fix build error while CONFIG_NLS=m If CONFIG_F2FS_FS=y but CONFIG_NLS=m, building fails: fs/f2fs/file.o: In function `f2fs_ioctl': file.c:(.text+0xb86f): undefined reference to `utf16s_to_utf8s' file.c:(.text+0xe651): undefined reference to `utf8s_to_utf16s' Select CONFIG_NLS to fix this. Reported-by: Hulk Robot Fixes: 61a3da4d5ef8 ("f2fs: support FS_IOC_{GET,SET}FSLABEL") Signed-off-by: YueHaibing Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 110a38ca5d53..95f1b99fa900 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -2,6 +2,7 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK + select NLS select CRYPTO select CRYPTO_CRC32 select F2FS_FS_XATTR if FS_ENCRYPTION From 899fee36fac07e49bb969e3f214e572eecb14f00 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 16 Aug 2019 11:03:34 +0800 Subject: [PATCH 27/51] f2fs: fix to avoid data corruption by forbidding SSR overwrite There is one case can cause data corruption. 
- write 4k to fileA - fsync fileA, 4k data is writebacked to lbaA - write 4k to fileA - kworker flushs 4k to lbaB; dnode contain lbaB didn't be persisted yet - write 4k to fileB - kworker flush 4k to lbaA due to SSR - SPOR -> dnode with lbaA will be recovered, however lbaA contains fileB's data One solution is tracking all fsynced file's block history, and disallow SSR overwrite on newly invalidated block on that file. However, during recovery, no matter the dnode is flushed or fsynced, all previous dnodes until last fsynced one in node chain can be recovered, that means we need to record all block change in flushed dnode, which will cause heavy cost, so let's just use simple fix by forbidding SSR overwrite directly. Fixes: 5b6c6be2d878 ("f2fs: use SSR for warm node as well") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 67e43b1c22e4..3f47379cd7db 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2163,9 +2163,11 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) if (!f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; - /* don't overwrite by SSR to keep node chain */ - if (IS_NODESEG(se->type) && - !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + /* + * SSR should never reuse block which is checkpointed + * or newly invalidated. + */ + if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) se->ckpt_valid_blocks++; } From 4507847c86bfc64e9bdce941a0f707560d3df98a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Jul 2019 17:06:11 +0800 Subject: [PATCH 28/51] f2fs: support FS_IOC_{GET,SET}FSLABEL Support two generic fs ioctls FS_IOC_{GET,SET}FSLABEL, letting f2fs pass generic/492 testcase. 
Fixes were made by Eric where: - f2fs: fix buffer overruns in FS_IOC_{GET, SET}FSLABEL utf16s_to_utf8s() and utf8s_to_utf16s() take the number of characters, not the number of bytes. - f2fs: fix copying too many bytes in FS_IOC_SETFSLABEL Userspace provides a null-terminated string, so don't assume that the full FSLABEL_MAX bytes can always be copied. - f2fs: add missing authorization check in FS_IOC_SETFSLABEL FS_IOC_SETFSLABEL modifies the filesystem superblock, so it shouldn't be allowed to regular users. Require CAP_SYS_ADMIN, like xfs and btrfs do. Signed-off-by: Chao Yu Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/file.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 09ad4116d635..d2b718e33f88 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -418,6 +418,9 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15) #define F2FS_IOC_RESIZE_FS _IOW(F2FS_IOCTL_MAGIC, 16, __u64) +#define F2FS_IOC_GET_VOLUME_NAME FS_IOC_GETFSLABEL +#define F2FS_IOC_SET_VOLUME_NAME FS_IOC_SETFSLABEL + #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b1f38f2795bc..344e0bd638e5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -3081,6 +3082,68 @@ static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg) return ret; } +static int f2fs_get_volume_name(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + char *vbuf; + int count; + int err = 0; + + vbuf = f2fs_kzalloc(sbi, MAX_VOLUME_NAME, 
GFP_KERNEL); + if (!vbuf) + return -ENOMEM; + + down_read(&sbi->sb_lock); + count = utf16s_to_utf8s(sbi->raw_super->volume_name, + ARRAY_SIZE(sbi->raw_super->volume_name), + UTF16_LITTLE_ENDIAN, vbuf, MAX_VOLUME_NAME); + up_read(&sbi->sb_lock); + + if (copy_to_user((char __user *)arg, vbuf, + min(FSLABEL_MAX, count))) + err = -EFAULT; + + kvfree(vbuf); + return err; +} + +static int f2fs_set_volume_name(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + char *vbuf; + int err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vbuf = strndup_user((const char __user *)arg, FSLABEL_MAX); + if (IS_ERR(vbuf)) + return PTR_ERR(vbuf); + + err = mnt_want_write_file(filp); + if (err) + goto out; + + down_write(&sbi->sb_lock); + + memset(sbi->raw_super->volume_name, 0, + sizeof(sbi->raw_super->volume_name)); + utf8s_to_utf16s(vbuf, strlen(vbuf), UTF16_LITTLE_ENDIAN, + sbi->raw_super->volume_name, + ARRAY_SIZE(sbi->raw_super->volume_name)); + + err = f2fs_commit_super(sbi, false); + + up_write(&sbi->sb_lock); + + mnt_drop_write_file(filp); +out: + kfree(vbuf); + return err; +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret; @@ -3144,6 +3207,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_precache_extents(filp, arg); case F2FS_IOC_RESIZE_FS: return f2fs_ioc_resize_fs(filp, arg); + case F2FS_IOC_GET_VOLUME_NAME: + return f2fs_get_volume_name(filp, arg); + case F2FS_IOC_SET_VOLUME_NAME: + return f2fs_set_volume_name(filp, arg); default: return -ENOTTY; } @@ -3253,6 +3320,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_SET_PIN_FILE: case F2FS_IOC_PRECACHE_EXTENTS: case F2FS_IOC_RESIZE_FS: + case F2FS_IOC_GET_VOLUME_NAME: + case F2FS_IOC_SET_VOLUME_NAME: break; default: return -ENOIOCTLCMD; From 2fde3dd14efe8ceb21a92c2749394a5df61cb7df Mon Sep 17 00:00:00 2001 
From: Chao Yu Date: Fri, 26 Jul 2019 15:41:20 +0800 Subject: [PATCH 29/51] f2fs: allocate memory in batch in build_sit_info() build_sit_info() allocates all bitmaps for each segment one by one, which is quite inefficient; this patch changes it to allocate one large contiguous memory region at a time and divide it among the bitmaps of each segment. For large images, this can be expected to improve mount speed. Signed-off-by: Chen Gong Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 51 +++++++++++++++++++++-------------------------- fs/f2fs/segment.h | 1 + 2 files changed, 24 insertions(+), 28 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3f47379cd7db..426d4a529f5e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3945,7 +3945,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; - char *src_bitmap; + char *src_bitmap, *bitmap; unsigned int bitmap_size; /* allocate memory for SIT information */ @@ -3968,27 +3968,31 @@ static int build_sit_info(struct f2fs_sb_info *sbi) if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; +#ifdef CONFIG_F2FS_CHECK_FS + bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * 4; +#else + bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * 3; +#endif + sit_i->bitmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); + if (!sit_i->bitmap) + return -ENOMEM; + + bitmap = sit_i->bitmap; + for (start = 0; start < MAIN_SEGS(sbi); start++) { - sit_i->sentries[start].cur_valid_map - = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); - sit_i->sentries[start].ckpt_valid_map - = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); - if (!sit_i->sentries[start].cur_valid_map || - !sit_i->sentries[start].ckpt_valid_map) - return -ENOMEM; + sit_i->sentries[start].cur_valid_map = bitmap; + bitmap += SIT_VBLOCK_MAP_SIZE; + + sit_i->sentries[start].ckpt_valid_map = bitmap; + bitmap += 
SIT_VBLOCK_MAP_SIZE; #ifdef CONFIG_F2FS_CHECK_FS - sit_i->sentries[start].cur_valid_map_mir - = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); - if (!sit_i->sentries[start].cur_valid_map_mir) - return -ENOMEM; + sit_i->sentries[start].cur_valid_map_mir = bitmap; + bitmap += SIT_VBLOCK_MAP_SIZE; #endif - sit_i->sentries[start].discard_map - = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, - GFP_KERNEL); - if (!sit_i->sentries[start].discard_map) - return -ENOMEM; + sit_i->sentries[start].discard_map = bitmap; + bitmap += SIT_VBLOCK_MAP_SIZE; } sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -4494,21 +4498,12 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi) static void destroy_sit_info(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int start; if (!sit_i) return; - if (sit_i->sentries) { - for (start = 0; start < MAIN_SEGS(sbi); start++) { - kvfree(sit_i->sentries[start].cur_valid_map); -#ifdef CONFIG_F2FS_CHECK_FS - kvfree(sit_i->sentries[start].cur_valid_map_mir); -#endif - kvfree(sit_i->sentries[start].ckpt_valid_map); - kvfree(sit_i->sentries[start].discard_map); - } - } + if (sit_i->sentries) + kvfree(sit_i->bitmap); kvfree(sit_i->tmp_map); kvfree(sit_i->sentries); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b74602813a05..ec4d568fd58c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -226,6 +226,7 @@ struct sit_info { block_t sit_base_addr; /* start block address of SIT area */ block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ + char *bitmap; /* all bitmaps pointer */ char *sit_bitmap; /* SIT bitmap pointer */ #ifdef CONFIG_F2FS_CHECK_FS char *sit_bitmap_mir; /* SIT bitmap mirror */ From bbf9f7d90f21e05e31b7cdd95b32f64dd2819dfe Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Wed, 7 Aug 2019 19:10:32 +0530 Subject: [PATCH 30/51] f2fs: Fix indefinite loop in f2fs_gc() Policy - Foreground GC, LFS and greedy 
GC mode. Under this policy, f2fs_gc() loops forever to GC as it doesn't have enough free segments to proceed and thus it keeps calling gc_more for the same victim segment. This can happen if the selected victim segment could not be GC'd due to failed blkaddr validity check i.e. is_alive() returns false for the blocks set in current validity map. Fix this by keeping track of such invalid segments and skip those segments for selection in get_victim_by_default() to avoid endless GC loop under such error scenarios. Currently, add this logic under CONFIG_F2FS_CHECK_FS to be able to root cause the issue in debug version. Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu [Jaegeuk Kim: fix wrong bitmap size] Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 25 ++++++++++++++++++++++++- fs/f2fs/segment.c | 27 +++++++++++++++++++-------- fs/f2fs/segment.h | 3 +++ 3 files changed, 46 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8974672db78f..e88f98ddf396 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -382,6 +382,16 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, nsearched++; } +#ifdef CONFIG_F2FS_CHECK_FS + /* + * skip selecting the invalid segno (that is failed due to block + * validity check failure during GC) to avoid endless GC loop in + * such cases. 
+ */ + if (test_bit(segno, sm->invalid_segmap)) + goto next; +#endif + secno = GET_SEC_FROM_SEG(sbi, segno); if (sec_usage_check(sbi, secno)) @@ -627,8 +637,21 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); - if (source_blkaddr != blkaddr) + if (source_blkaddr != blkaddr) { +#ifdef CONFIG_F2FS_CHECK_FS + unsigned int segno = GET_SEGNO(sbi, blkaddr); + unsigned long offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + + if (unlikely(check_valid_map(sbi, segno, offset))) { + if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) { + f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u\n", + blkaddr, source_blkaddr, segno); + f2fs_bug_on(sbi, 1); + } + } +#endif return false; + } return true; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 426d4a529f5e..cc230fc829e1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -817,9 +817,13 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; - if (get_valid_blocks(sbi, segno, true) == 0) + if (get_valid_blocks(sbi, segno, true) == 0) { clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); +#ifdef CONFIG_F2FS_CHECK_FS + clear_bit(segno, SIT_I(sbi)->invalid_segmap); +#endif + } } } @@ -3946,7 +3950,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) struct sit_info *sit_i; unsigned int sit_segs, start; char *src_bitmap, *bitmap; - unsigned int bitmap_size; + unsigned int bitmap_size, main_bitmap_size, sit_bitmap_size; /* allocate memory for SIT information */ sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL); @@ -3962,8 +3966,8 @@ static int build_sit_info(struct f2fs_sb_info *sbi) if (!sit_i->sentries) return -ENOMEM; - bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, bitmap_size, + 
main_bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); + sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, main_bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -4012,17 +4016,23 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1; /* setup SIT bitmap from ckeckpoint pack */ - bitmap_size = __bitmap_size(sbi, SIT_BITMAP); + sit_bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); - sit_i->sit_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + sit_i->sit_bitmap = kmemdup(src_bitmap, sit_bitmap_size, GFP_KERNEL); if (!sit_i->sit_bitmap) return -ENOMEM; #ifdef CONFIG_F2FS_CHECK_FS - sit_i->sit_bitmap_mir = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + sit_i->sit_bitmap_mir = kmemdup(src_bitmap, + sit_bitmap_size, GFP_KERNEL); if (!sit_i->sit_bitmap_mir) return -ENOMEM; + + sit_i->invalid_segmap = f2fs_kvzalloc(sbi, + main_bitmap_size, GFP_KERNEL); + if (!sit_i->invalid_segmap) + return -ENOMEM; #endif /* init SIT information */ @@ -4031,7 +4041,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; sit_i->written_valid_blocks = 0; - sit_i->bitmap_size = bitmap_size; + sit_i->bitmap_size = sit_bitmap_size; sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); @@ -4514,6 +4524,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) kvfree(sit_i->sit_bitmap); #ifdef CONFIG_F2FS_CHECK_FS kvfree(sit_i->sit_bitmap_mir); + kvfree(sit_i->invalid_segmap); #endif kvfree(sit_i); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ec4d568fd58c..b219009c3e20 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -230,6 +230,9 @@ struct sit_info { char *sit_bitmap; /* SIT bitmap pointer */ #ifdef CONFIG_F2FS_CHECK_FS char *sit_bitmap_mir; /* 
SIT bitmap mirror */ + + /* bitmap of segments to be ignored by GC in case of errors */ + unsigned long *invalid_segmap; #endif unsigned int bitmap_size; /* SIT bitmap size */ From fe76a166a1e3dd2cc906e5468fda9834029d6258 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 21 Aug 2019 23:13:34 +0800 Subject: [PATCH 31/51] f2fs: introduce f2fs_match_name() for cleanup This patch introduces f2fs_match_name() for cleanup. BTW, it avoids to fallback to normal comparison once it doesn't match casefolded name. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index dac07d17cdbd..501999af581d 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -136,6 +136,34 @@ int f2fs_ci_compare(const struct inode *parent, const struct qstr *name, } #endif +static inline bool f2fs_match_name(struct f2fs_dentry_ptr *d, + struct f2fs_dir_entry *de, + struct fscrypt_name *fname, + unsigned long bit_pos, + f2fs_hash_t namehash) +{ +#ifdef CONFIG_UNICODE + struct inode *parent = d->inode; + struct f2fs_sb_info *sbi = F2FS_I_SB(parent); + struct qstr entry; +#endif + + if (de->hash_code != namehash) + return false; + +#ifdef CONFIG_UNICODE + entry.name = d->filename[bit_pos]; + entry.len = de->name_len; + + if (sbi->s_encoding && IS_CASEFOLDED(parent)) + return !f2fs_ci_compare(parent, fname->usr_fname, &entry); +#endif + if (fscrypt_match_name(fname, d->filename[bit_pos], + le16_to_cpu(de->name_len))) + return true; + return false; +} + struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct f2fs_dentry_ptr *d) @@ -143,9 +171,6 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; -#ifdef CONFIG_UNICODE - struct qstr entry; -#endif if (max_slots) *max_slots = 0; @@ -157,28 
+182,14 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, } de = &d->dentry[bit_pos]; -#ifdef CONFIG_UNICODE - entry.name = d->filename[bit_pos]; - entry.len = de->name_len; -#endif if (unlikely(!de->name_len)) { bit_pos++; continue; } - if (de->hash_code == namehash) { -#ifdef CONFIG_UNICODE - if (F2FS_SB(d->inode->i_sb)->s_encoding && - IS_CASEFOLDED(d->inode) && - !f2fs_ci_compare(d->inode, - fname->usr_fname, &entry)) - goto found; -#endif - if (fscrypt_match_name(fname, d->filename[bit_pos], - le16_to_cpu(de->name_len))) - goto found; - } + if (f2fs_match_name(d, de, fname, bit_pos, namehash)) + goto found; if (max_slots && max_len > *max_slots) *max_slots = max_len; From 950d47f233522ad8ad4b914ae9ddc03db2811edb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 21 Aug 2019 23:13:35 +0800 Subject: [PATCH 32/51] f2fs: optimize case-insensitive lookups This patch ports below casefold enhancement patch from ext4 to f2fs commit 3ae72562ad91 ("ext4: optimize case-insensitive lookups") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 57 ++++++++++++++++++++++++++++++++++++++++++++------ fs/f2fs/f2fs.h | 3 ++- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 501999af581d..7afbf8f5ab08 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -112,13 +112,17 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir, * doesn't match or less than zero on error. */ int f2fs_ci_compare(const struct inode *parent, const struct qstr *name, - const struct qstr *entry) + const struct qstr *entry, bool quick) { const struct f2fs_sb_info *sbi = F2FS_SB(parent->i_sb); const struct unicode_map *um = sbi->s_encoding; int ret; - ret = utf8_strncasecmp(um, name, entry); + if (quick) + ret = utf8_strncasecmp_folded(um, name, entry); + else + ret = utf8_strncasecmp(um, name, entry); + if (ret < 0) { /* Handle invalid character sequence as either an error * or as an opaque byte sequence. 
@@ -134,11 +138,36 @@ int f2fs_ci_compare(const struct inode *parent, const struct qstr *name, return ret; } + +static void f2fs_fname_setup_ci_filename(struct inode *dir, + const struct qstr *iname, + struct fscrypt_str *cf_name) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + + if (!IS_CASEFOLDED(dir)) { + cf_name->name = NULL; + return; + } + + cf_name->name = f2fs_kmalloc(sbi, F2FS_NAME_LEN, GFP_NOFS); + if (!cf_name->name) + return; + + cf_name->len = utf8_casefold(sbi->s_encoding, + iname, cf_name->name, + F2FS_NAME_LEN); + if ((int)cf_name->len <= 0) { + kvfree(cf_name->name); + cf_name->name = NULL; + } +} #endif static inline bool f2fs_match_name(struct f2fs_dentry_ptr *d, struct f2fs_dir_entry *de, struct fscrypt_name *fname, + struct fscrypt_str *cf_str, unsigned long bit_pos, f2fs_hash_t namehash) { @@ -155,8 +184,15 @@ static inline bool f2fs_match_name(struct f2fs_dentry_ptr *d, entry.name = d->filename[bit_pos]; entry.len = de->name_len; - if (sbi->s_encoding && IS_CASEFOLDED(parent)) - return !f2fs_ci_compare(parent, fname->usr_fname, &entry); + if (sbi->s_encoding && IS_CASEFOLDED(parent)) { + if (cf_str->name) { + struct qstr cf = {.name = cf_str->name, + .len = cf_str->len}; + return !f2fs_ci_compare(parent, &cf, &entry, true); + } + return !f2fs_ci_compare(parent, fname->usr_fname, &entry, + false); + } #endif if (fscrypt_match_name(fname, d->filename[bit_pos], le16_to_cpu(de->name_len))) @@ -169,9 +205,14 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, struct f2fs_dentry_ptr *d) { struct f2fs_dir_entry *de; + struct fscrypt_str cf_str = { .name = NULL, .len = 0 }; unsigned long bit_pos = 0; int max_len = 0; +#ifdef CONFIG_UNICODE + f2fs_fname_setup_ci_filename(d->inode, fname->usr_fname, &cf_str); +#endif + if (max_slots) *max_slots = 0; while (bit_pos < d->max) { @@ -188,7 +229,7 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, continue; } - if (f2fs_match_name(d, de, fname, bit_pos, 
namehash)) + if (f2fs_match_name(d, de, fname, &cf_str, bit_pos, namehash)) goto found; if (max_slots && max_len > *max_slots) @@ -202,6 +243,10 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, found: if (max_slots && max_len > *max_slots) *max_slots = max_len; + +#ifdef CONFIG_UNICODE + kvfree(cf_str.name); +#endif return de; } @@ -1025,7 +1070,7 @@ static int f2fs_d_compare(const struct dentry *dentry, unsigned int len, return memcmp(str, name, len); } - return f2fs_ci_compare(dentry->d_parent->d_inode, name, &qstr); + return f2fs_ci_compare(dentry->d_parent->d_inode, name, &qstr, false); } static int f2fs_d_hash(const struct dentry *dentry, struct qstr *str) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d2b718e33f88..260c6b2dca97 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2941,7 +2941,8 @@ struct dentry *f2fs_get_parent(struct dentry *child); extern int f2fs_ci_compare(const struct inode *parent, const struct qstr *name, - const struct qstr *entry); + const struct qstr *entry, + bool quick); /* * dir.c From 052a82d85a3b3eee6a386be2ba3b82278cf277ce Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 22 Aug 2019 20:17:56 +0800 Subject: [PATCH 33/51] f2fs: fix to writeout dirty inode during node flush As Eric reported: On xfstest generic/204 on f2fs, I'm getting a kernel BUG. 
allocate_segment_by_default+0x9d/0x100 [f2fs] f2fs_allocate_data_block+0x3c0/0x5c0 [f2fs] do_write_page+0x62/0x110 [f2fs] f2fs_do_write_node_page+0x2b/0xa0 [f2fs] __write_node_page+0x2ec/0x590 [f2fs] f2fs_sync_node_pages+0x756/0x7e0 [f2fs] block_operations+0x25b/0x350 [f2fs] f2fs_write_checkpoint+0x104/0x1150 [f2fs] f2fs_sync_fs+0xa2/0x120 [f2fs] f2fs_balance_fs_bg+0x33c/0x390 [f2fs] f2fs_write_node_pages+0x4c/0x1f0 [f2fs] do_writepages+0x1c/0x70 __writeback_single_inode+0x45/0x320 writeback_sb_inodes+0x273/0x5c0 wb_writeback+0xff/0x2e0 wb_workfn+0xa1/0x370 process_one_work+0x138/0x350 worker_thread+0x4d/0x3d0 kthread+0x109/0x140 The root cause of this issue is, in a very small partition, e.g. in generic/204 testcase of fstest suit, filesystem's free space is 50MB, so at most we can write 12800 inline inode with command: `echo XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX > $SCRATCH_MNT/$i`, then filesystem will have: - 12800 dirty inline data page - 12800 dirty inode page - and 12800 dirty imeta (dirty inode) When we flush node-inode's page cache, we can also flush inline data with each inode page, however it will run out-of-free-space in device, then once it triggers checkpoint, there is no room for huge number of imeta, at this time, GC is useless, as there is no dirty segment at all. In order to fix this, we try to recognize inode page during node_inode's page flushing, and update inode page from dirty inode, so that later another imeta (dirty inode) flush can be avoided. Reported-and-tested-by: Eric Biggers Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d9ba1db2d01e..e5044eec8097 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1762,6 +1762,47 @@ out: return ret ? 
-EIO: 0; } +static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool clean; + + if (inode->i_ino != ino) + return 0; + + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) + return 0; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + clean = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + + if (clean) + return 0; + + inode = igrab(inode); + if (!inode) + return 0; + return 1; +} + +static bool flush_dirty_inode(struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + struct inode *inode; + nid_t ino = ino_of_node(page); + + inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL); + if (!inode) + return false; + + f2fs_update_inode(inode, page); + unlock_page(page); + + iput(inode); + return true; +} + int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type) @@ -1785,6 +1826,7 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; bool submitted = false; + bool may_dirty = true; /* give a priority to WB_SYNC threads */ if (atomic_read(&sbi->wb_sync_req[NODE]) && @@ -1832,6 +1874,13 @@ continue_unlock: goto lock_node; } + /* flush dirty inode */ + if (IS_INODE(page) && may_dirty) { + may_dirty = false; + if (flush_dirty_inode(page)) + goto lock_node; + } + f2fs_wait_on_page_writeback(page, NODE, true, true); if (!clear_page_dirty_for_io(page)) From 9ea2f0be6ceaebae1518a5f897cff2645830dd95 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 23 Aug 2019 17:58:34 +0800 Subject: [PATCH 34/51] f2fs: fix wrong error injection path in inc_valid_block_count() If FAULT_BLOCK type error injection is on, in inc_valid_block_count() we may decrease sbi->alloc_valid_block_count percpu stat count incorrectly, fix it. 
Fixes: 36b877af7992 ("f2fs: Keep alloc_valid_block_count in sync") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 260c6b2dca97..a89ad8cab821 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1771,7 +1771,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, if (time_to_inject(sbi, FAULT_BLOCK)) { f2fs_show_injection_info(FAULT_BLOCK); release = *count; - goto enospc; + goto release_quota; } /* @@ -1816,6 +1816,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, enospc: percpu_counter_sub(&sbi->alloc_valid_block_count, release); +release_quota: dquot_release_reservation_block(inode, release); return -ENOSPC; } From b757f6edbeddd0c43135edfdee18103bd73f0991 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 23 Aug 2019 17:58:35 +0800 Subject: [PATCH 35/51] f2fs: clean up __bio_alloc()'s parameter Just cleanup, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 73ed4ff9d01c..0b3728f58d17 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -259,26 +259,25 @@ static bool __same_bdev(struct f2fs_sb_info *sbi, /* * Low-level block read/write IO operations. 
*/ -static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, - struct writeback_control *wbc, - int npages, bool is_read, - enum page_type type, enum temp_type temp) +static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) { + struct f2fs_sb_info *sbi = fio->sbi; struct bio *bio; bio = f2fs_bio_alloc(sbi, npages, true); - f2fs_target_device(sbi, blk_addr, bio); - if (is_read) { + f2fs_target_device(sbi, fio->new_blkaddr, bio); + if (is_read_io(fio->op)) { bio->bi_end_io = f2fs_read_end_io; bio->bi_private = NULL; } else { bio->bi_end_io = f2fs_write_end_io; bio->bi_private = sbi; - bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, type, temp); + bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, + fio->type, fio->temp); } - if (wbc) - wbc_init_bio(wbc, bio); + if (fio->io_wbc) + wbc_init_bio(fio->io_wbc, bio); return bio; } @@ -461,8 +460,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(fio->sbi, fio->new_blkaddr, fio->io_wbc, - 1, is_read_io(fio->op), fio->type, fio->temp); + bio = __bio_alloc(fio, 1); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); @@ -538,8 +536,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) } alloc_new: if (!bio) { - bio = __bio_alloc(fio->sbi, fio->new_blkaddr, fio->io_wbc, - BIO_MAX_PAGES, false, fio->type, fio->temp); + bio = __bio_alloc(fio, BIO_MAX_PAGES); bio_set_op_attrs(bio, fio->op, fio->op_flags); } @@ -616,9 +613,7 @@ alloc_new: fio->retry = true; goto skip; } - io->bio = __bio_alloc(sbi, fio->new_blkaddr, fio->io_wbc, - BIO_MAX_PAGES, false, - fio->type, fio->temp); + io->bio = __bio_alloc(fio, BIO_MAX_PAGES); io->fio = *fio; } From 00e09c0bccc71825ca9a659eb145ed7c4dc95588 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 23 Aug 2019 17:58:36 +0800 Subject: [PATCH 36/51] f2fs: enhance f2fs_is_checkpoint_ready()'s readability This patch changes the semantics of f2fs_is_checkpoint_ready()'s return value 
as: return true when checkpoint is ready, other return false, it can improve readability of below conditions. f2fs_submit_page_write() ... if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || !f2fs_is_checkpoint_ready(sbi)) __submit_merged_bio(io); f2fs_balance_fs() ... if (!f2fs_is_checkpoint_ready(sbi)) return; Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 ++++--- fs/f2fs/file.c | 18 ++++++++---------- fs/f2fs/inode.c | 2 +- fs/f2fs/namei.c | 36 ++++++++++++++---------------------- fs/f2fs/segment.c | 2 +- fs/f2fs/segment.h | 8 ++++---- fs/f2fs/xattr.c | 5 ++--- 7 files changed, 34 insertions(+), 44 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0b3728f58d17..ab8c8f2fff70 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -634,7 +634,7 @@ skip: goto next; out: if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || - f2fs_is_checkpoint_ready(sbi)) + !f2fs_is_checkpoint_ready(sbi)) __submit_merged_bio(io); up_write(&io->io_rwsem); } @@ -2570,9 +2570,10 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, trace_f2fs_write_begin(inode, pos, len, flags); - err = f2fs_is_checkpoint_ready(sbi); - if (err) + if (!f2fs_is_checkpoint_ready(sbi)) { + err = -ENOSPC; goto fail; + } if ((f2fs_is_atomic_file(inode) && !f2fs_available_free_memory(sbi, INMEM_PAGES)) || diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 344e0bd638e5..6528216ab832 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -57,9 +57,11 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) err = -EIO; goto err; } - err = f2fs_is_checkpoint_ready(sbi); - if (err) + + if (!f2fs_is_checkpoint_ready(sbi)) { + err = -ENOSPC; goto err; + } sb_start_pagefault(inode->i_sb); @@ -1571,9 +1573,8 @@ static long f2fs_fallocate(struct file *file, int mode, if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; - ret = f2fs_is_checkpoint_ready(F2FS_I_SB(inode)); - if (ret) - return ret; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode))) + return 
-ENOSPC; /* f2fs only support ->fallocate for regular file */ if (!S_ISREG(inode->i_mode)) @@ -3146,13 +3147,10 @@ out: long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - int ret; - if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) return -EIO; - ret = f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(filp))); - if (ret) - return ret; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(filp)))) + return -ENOSPC; switch (cmd) { case F2FS_IOC_GETFLAGS: diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 88af85e0db62..87214414936b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -616,7 +616,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; - if (f2fs_is_checkpoint_ready(sbi)) + if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; /* diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 9a28c5d9b3e9..4faf06e8bf89 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -272,9 +272,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - err = f2fs_is_checkpoint_ready(sbi); - if (err) - return err; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; err = dquot_initialize(dir); if (err) @@ -321,9 +320,8 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - err = f2fs_is_checkpoint_ready(sbi); - if (err) - return err; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; err = fscrypt_prepare_link(old_dentry, dir, dentry); if (err) @@ -592,9 +590,8 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - err = f2fs_is_checkpoint_ready(sbi); - if (err) - return err; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, &disk_link); @@ -724,9 +721,8 @@ static int f2fs_mknod(struct 
inode *dir, struct dentry *dentry, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - err = f2fs_is_checkpoint_ready(sbi); - if (err) - return err; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; err = dquot_initialize(dir); if (err) @@ -822,13 +818,11 @@ out: static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - int ret; if (unlikely(f2fs_cp_error(sbi))) return -EIO; - ret = f2fs_is_checkpoint_ready(sbi); - if (ret) - return ret; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) { int err = fscrypt_get_encryption_info(dir); @@ -865,9 +859,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - err = f2fs_is_checkpoint_ready(sbi); - if (err) - return err; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && (!projid_eq(F2FS_I(new_dir)->i_projid, @@ -1060,9 +1053,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - err = f2fs_is_checkpoint_ready(sbi); - if (err) - return err; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && !projid_eq(F2FS_I(new_dir)->i_projid, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cc230fc829e1..18584d4c078a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -501,7 +501,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi); - if (f2fs_is_checkpoint_ready(sbi)) + if (!f2fs_is_checkpoint_ready(sbi)) return; /* diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b219009c3e20..325781a1ae4d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -586,13 +586,13 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, reserved_sections(sbi) 
+ needed); } -static inline int f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) +static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) { if (likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) - return 0; + return true; if (likely(!has_not_enough_free_secs(sbi, 0, 0))) - return 0; - return -ENOSPC; + return true; + return false; } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index f85c810e33ca..181900af2576 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -732,9 +732,8 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - err = f2fs_is_checkpoint_ready(sbi); - if (err) - return err; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; err = dquot_initialize(inode); if (err) From 100c06554ef3020f7836bb34dfec4cc2a7cbf30a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 28 Aug 2019 19:58:26 -0700 Subject: [PATCH 37/51] f2fs: fix flushing node pages when checkpoint is disabled This patch fixes skipping node page writes when checkpoint is disabled. In this period, we can't rely on checkpoint to flush node pages. 
Fixes: fd8c8caf7e7c ("f2fs: let checkpoint flush dnode page of regular") Fixes: 4354994f097d ("f2fs: checkpoint disabling") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e5044eec8097..8b66bc4c004b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1524,7 +1524,8 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; - if (wbc->sync_mode == WB_SYNC_NONE && + if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + wbc->sync_mode == WB_SYNC_NONE && IS_DNODE(page) && is_cold_node(page)) goto redirty_out; @@ -1909,7 +1910,8 @@ continue_unlock: } if (step < 2) { - if (wbc->sync_mode == WB_SYNC_NONE && step == 1) + if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + wbc->sync_mode == WB_SYNC_NONE && step == 1) goto out; step++; goto next_step; From d0995b53130eaa6a458780179c50489b933ea31a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 27 Aug 2019 18:17:55 +0800 Subject: [PATCH 38/51] f2fs: add missing documents of reserve_root/resuid/resgid Add missing documents. Fixes: 7e65be49ed94f ("f2fs: add reserved blocks for root user") Fixes: 7c2e59632b846 ("f2fs: add resgid and resuid to reserve root blocks") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 5fa38ab373ca..7e1991328473 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -157,6 +157,11 @@ noinline_data Disable the inline data feature, inline data feature is enabled by default. data_flush Enable data flushing before checkpoint in order to persist data of regular and symlink. 
+reserve_root=%d Support configuring reserved space which is used for + allocation from a privileged user with specified uid or + gid, unit: 4KB, the default limit is 0.2% of user blocks. +resuid=%d The user ID which may use the reserved blocks. +resgid=%d The group ID which may use the reserved blocks. fault_injection=%d Enable fault injection in all supported types with specified injection rate. fault_type=%d Support configuring fault injection type, should be From e8c82c11c93d586d03d80305959527bcac383555 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 3 Sep 2019 10:06:25 +0800 Subject: [PATCH 39/51] f2fs: fix error path of f2fs_convert_inline_page() In error path of f2fs_convert_inline_page(), we missed to truncate newly reserved block in .i_addrs[0] once we failed in get_node_info(), fix it. Fixes: 7735730d39d7 ("f2fs: fix to propagate error from __get_meta_page()") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 78d6ebe165cd..16ebdd4d1f2c 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -131,6 +131,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) err = f2fs_get_node_info(fio.sbi, dn->nid, &ni); if (err) { + f2fs_truncate_data_blocks_range(dn, 1); f2fs_put_dnode(dn); return err; } From cfb9a34d147b8d002d4330ddb53a3eb79565d17c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 3 Sep 2019 10:06:26 +0800 Subject: [PATCH 40/51] f2fs: convert inline_data in prior to i_size_write In below call path, we change i_size before inline conversion, however, if we failed to convert inline inode, the inode may have wrong i_size which is larger than max inline size, result inline inode corruption. - f2fs_setattr - truncate_setsize - f2fs_convert_inline_inode This patch reorders truncate_setsize() and f2fs_convert_inline_inode() to guarantee inline_data has valid i_size. 
Fixes: 0cab80ee0c9e ("f2fs: fix to convert inline inode in ->setattr") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6528216ab832..10927a0b8df3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -814,14 +814,24 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - bool to_smaller = (attr->ia_size <= i_size_read(inode)); + loff_t old_size = i_size_read(inode); + + if (attr->ia_size > MAX_INLINE_DATA(inode)) { + /* + * should convert inline inode before i_size_write to + * keep smaller than inline_data size with inline flag. + */ + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); - if (to_smaller) + if (attr->ia_size <= old_size) err = f2fs_truncate(inode); /* * do not trim all blocks after i_size if target size is @@ -829,21 +839,11 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) */ up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (err) return err; - if (!to_smaller) { - /* should convert inline inode here */ - if (!f2fs_may_inline_data(inode)) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } - inode->i_mtime = inode->i_ctime = current_time(inode); - } - down_write(&F2FS_I(inode)->i_sem); + inode->i_mtime = inode->i_ctime = current_time(inode); F2FS_I(inode)->last_disk_size = i_size_read(inode); up_write(&F2FS_I(inode)->i_sem); } From 957fa47823dfe449c5a15a944e4e7a299a6601db Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Mon, 29 Jul 2019 10:50:26 +0530 Subject: [PATCH 41/51] f2fs: Fix indefinite loop in f2fs_gc() Policy - foreground GC, LFS mode and greedy GC mode. 
Under this policy, f2fs_gc() loops forever to GC as it doesn't have enough free segments to proceed and thus it keeps calling gc_more for the same victim segment. This can happen if the selected victim segment could not be GC'd due to failed blkaddr validity check i.e. is_alive() returns false for the blocks set in current validity map. Fix this by not resetting the sbi->cur_victim_sec to NULL_SEGNO, when the segment selected could not be GC'd. This helps to select another segment for GC and thus helps to proceed forward with GC. [Note] This can happen due to is_alive as well as atomic_file which skips GC. Signed-off-by: Sahitya Tummala Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e88f98ddf396..5877bd729689 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1326,7 +1326,7 @@ gc_more: round++; } - if (gc_type == FG_GC) + if (gc_type == FG_GC && seg_freed) sbi->cur_victim_sec = NULL_SEGNO; if (sync) From 743b620cb0516f6b6cbc45b48df00fe6d14d00ba Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 9 Sep 2019 13:10:59 +0100 Subject: [PATCH 42/51] f2fs: avoid infinite GC loop due to stale atomic files If committing atomic pages is failed when doing f2fs_do_sync_file(), we can get committed pages but atomic_file being still set like: - inmem: 0, atomic IO: 4 (Max. 10), volatile IO: 0 (Max.
0) If GC selects this block, we can get an infinite loop like this: f2fs_submit_page_bio: dev = (253,7), ino = 2, page_index = 0x2359a8, oldaddr = 0x2359a8, newaddr = 0x2359a8, rw = READ(), type = COLD_DATA f2fs_submit_read_bio: dev = (253,7)/(253,7), rw = READ(), DATA, sector = 18533696, size = 4096 f2fs_get_victim: dev = (253,7), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 4355, cost = 1, ofs_unit = 1, pre_victim_secno = 4355, prefree = 0, free = 234 f2fs_iget: dev = (253,7), ino = 6247, pino = 5845, i_mode = 0x81b0, i_size = 319488, i_nlink = 1, i_blocks = 624, i_advise = 0x2c f2fs_submit_page_bio: dev = (253,7), ino = 2, page_index = 0x2359a8, oldaddr = 0x2359a8, newaddr = 0x2359a8, rw = READ(), type = COLD_DATA f2fs_submit_read_bio: dev = (253,7)/(253,7), rw = READ(), DATA, sector = 18533696, size = 4096 f2fs_get_victim: dev = (253,7), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 4355, cost = 1, ofs_unit = 1, pre_victim_secno = 4355, prefree = 0, free = 234 f2fs_iget: dev = (253,7), ino = 6247, pino = 5845, i_mode = 0x81b0, i_size = 319488, i_nlink = 1, i_blocks = 624, i_advise = 0x2c At that moment, we can observe: [Before] Try to move 5084219 blocks (BG: 384508) - data blocks : 4962373 (274483) - node blocks : 121846 (110025) Skipped : atomic write 4534686 (10) [After] Try to move 5088973 blocks (BG: 384508) - data blocks : 4967127 (274483) - node blocks : 121846 (110025) Skipped : atomic write 4539440 (10) So, refactor atomic_write flow like this: 1. start_atomic_write - add inmem_list and set atomic_file 2. write() - register it in inmem_pages 3. commit_atomic_write - if no error, f2fs_drop_inmem_pages() - f2fs_commit_inmem_pages() failed : __revoke_inmem_pages() was done - f2fs_do_sync_file failed : abort_atomic_write later 4. abort_atomic_write - f2fs_drop_inmem_pages 5.
f2fs_drop_inmem_pages - clear atomic_file - remove inmem_list Based on this change, when GC fails to move block in atomic_file, f2fs_drop_inmem_pages_all() can call f2fs_drop_inmem_pages(). Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 15 ++++++++++----- fs/f2fs/segment.c | 29 ++++++++--------------------- 2 files changed, 18 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 10927a0b8df3..fab6e4cf8f06 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1829,6 +1829,8 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) static int f2fs_ioc_start_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int ret; if (!inode_owner_or_capable(inode)) @@ -1871,6 +1873,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; } + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (list_empty(&fi->inmem_ilist)) + list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + + /* add inode in inmem_list first and set atomic_file */ set_inode_flag(inode, FI_ATOMIC_FILE); clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -1912,11 +1920,8 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) goto err_out; ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); - if (!ret) { - clear_inode_flag(inode, FI_ATOMIC_FILE); - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC] = 0; - stat_dec_atomic_write(inode); - } + if (!ret) + f2fs_drop_inmem_pages(inode); } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 18584d4c078a..204524943bc6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -185,8 +185,6 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) void f2fs_register_inmem_page(struct inode *inode, struct page *page) 
{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); struct inmem_pages *new; f2fs_trace_pid(page); @@ -200,15 +198,11 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) INIT_LIST_HEAD(&new->list); /* increase reference count with clean state */ - mutex_lock(&fi->inmem_lock); get_page(page); - list_add_tail(&new->list, &fi->inmem_pages); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(&fi->inmem_ilist)) - list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + mutex_lock(&F2FS_I(inode)->inmem_lock); + list_add_tail(&new->list, &F2FS_I(inode)->inmem_pages); inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); - mutex_unlock(&fi->inmem_lock); + mutex_unlock(&F2FS_I(inode)->inmem_lock); trace_f2fs_register_inmem_page(page, INMEM); } @@ -330,19 +324,17 @@ void f2fs_drop_inmem_pages(struct inode *inode) mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false, true); - - if (list_empty(&fi->inmem_pages)) { - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - } mutex_unlock(&fi->inmem_lock); } clear_inode_flag(inode, FI_ATOMIC_FILE); fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; stat_dec_atomic_write(inode); + + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); } void f2fs_drop_inmem_page(struct inode *inode, struct page *page) @@ -471,11 +463,6 @@ int f2fs_commit_inmem_pages(struct inode *inode) mutex_lock(&fi->inmem_lock); err = __f2fs_commit_inmem_pages(inode); - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_COMMIT); 
From 98194030554cd9b10568a9b58f5a135c7e7cba85 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 10 Sep 2019 09:14:16 +0800 Subject: [PATCH 43/51] f2fs: fix to avoid accessing uninitialized field of inode page in is_alive() If inode is newly created, inode page may not synchronize with inode cache, so fields like .i_inline or .i_extra_isize could be wrong, in below call path, we may access such wrong fields, result in failing to migrate valid target block. Thread A Thread B - f2fs_create - f2fs_add_link - f2fs_add_dentry - f2fs_init_inode_metadata - f2fs_add_inline_entry - f2fs_new_inode_page - f2fs_put_page : inode page wasn't updated with inode cache - gc_data_segment - is_alive - f2fs_get_node_page - datablock_addr - offset_in_addr : access uninitialized fields Fixes: 7a2af766af15 ("f2fs: enhance on-disk inode structure scalability") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 5 +++++ fs/f2fs/inline.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7afbf8f5ab08..4033778bcbbf 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -682,6 +682,11 @@ add_dentry: if (inode) { f2fs_i_pino_write(inode, dir->i_ino); + + /* synchronize inode page's data from inode cache */ + if (is_inode_flag_set(inode, FI_NEW_INODE)) + f2fs_update_inode(inode, page); + f2fs_put_page(page, 1); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 16ebdd4d1f2c..896db0416f0e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -589,6 +589,11 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, /* we don't need to mark_inode_dirty now */ if (inode) { f2fs_i_pino_write(inode, dir->i_ino); + + /* synchronize inode page's data from inode cache */ + if (is_inode_flag_set(inode, FI_NEW_INODE)) + f2fs_update_inode(inode, page); + f2fs_put_page(page, 1); } From cb8434f16479b6c00540e23d02d6e078ebe69a52 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 11 Sep 2019 11:45:17 -0500 Subject: [PATCH 
44/51] f2fs: fix inode rwsem regression This is similar to 942491c9e6d6 ("xfs: fix AIM7 regression") Apparently our current rwsem code doesn't like doing the trylock, then lock for real scheme. So change our read/write methods to just do the trylock for the RWF_NOWAIT case. We don't need a check for IOCB_NOWAIT and !direct-IO because it is checked in generic_write_checks(). Fixes: b91050a80cec ("f2fs: add nowait aio support") Signed-off-by: Goldwyn Rodrigues Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fab6e4cf8f06..aea82f2b9240 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3230,11 +3230,12 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } - if (!inode_trylock(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) { + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) { ret = -EAGAIN; goto out; } + } else { inode_lock(inode); } From bd7253bc5eee11877d963453dc5812e99d26c317 Mon Sep 17 00:00:00 2001 From: Lockywolf Date: Sun, 25 Aug 2019 17:28:38 +0800 Subject: [PATCH 45/51] f2fs: Add a small clarification to CONFIG_FS_F2FS_FS_SECURITY Signed-off-by: Lockywolf Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 95f1b99fa900..652fd2e2b23d 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -61,7 +61,9 @@ config F2FS_FS_SECURITY Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO Linux. This option enables an extended attribute handler for file security labels in the f2fs filesystem, so that it requires enabling - the extended attribute support in advance. + the extended attribute support in advance. In particular you need this + option if you use the setcap command to assign initial process capabi- + lities to executables (the security.* extended attributes). 
If you are not using a security module, say N. From 1166c1f2f69117ad254189ca781287afa6e550b6 Mon Sep 17 00:00:00 2001 From: Surbhi Palande Date: Fri, 23 Aug 2019 15:40:45 -0700 Subject: [PATCH 46/51] f2fs: check all the data segments against all node ones As a part of the sanity checking while mounting, distinct segment number assignment to data and node segments is verified. Fixing a small bug in this verification between node and data segments. We need to check all the data segments with all the node segments. Fixes: 042be0f849e5f ("f2fs: fix to do sanity check with current segment number") Signed-off-by: Surbhi Palande Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 46d10a94721d..04788a6f9bcf 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2738,10 +2738,10 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) } } for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { - for (j = i; j < NR_CURSEG_DATA_TYPE; j++) { + for (j = 0; j < NR_CURSEG_DATA_TYPE; j++) { if (le32_to_cpu(ckpt->cur_node_segno[i]) == le32_to_cpu(ckpt->cur_data_segno[j])) { - f2fs_err(sbi, "Data segment (%u) and Data segment (%u) has the same segno: %u", + f2fs_err(sbi, "Node segment (%u) and Data segment (%u) has the same segno: %u", i, j, le32_to_cpu(ckpt->cur_node_segno[i])); return 1; From 86f35dc39ef9cdc5d33548e2d4ddac815a39e542 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 28 Aug 2019 17:33:35 +0800 Subject: [PATCH 47/51] f2fs: fix extent corruption during directIO in LFS mode In LFS mode, por_fsstress testcase reports a bug as below: [ASSERT] (fsck_chk_inode_blk: 931) --> ino: 0x12fe has wrong ext: [pgofs:142, blk:215424, len:16] Since commit f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode"), we start to allow OPU mode for direct IO, however, we missed to update extent cache in __allocate_data_block(), finally, it causes extent field
being inconsistent with physical block address, fix it. Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ab8c8f2fff70..7e9fafd44cbc 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1018,7 +1018,7 @@ alloc: if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); - f2fs_set_data_blkaddr(dn); + f2fs_update_data_blkaddr(dn, dn->data_blkaddr); /* * i_size will be updated by direct_IO. Otherwise, we'll get stale From 05e360061cbdcbfa93f8fcace2e7b53b2baed191 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 28 Aug 2019 17:33:36 +0800 Subject: [PATCH 48/51] f2fs: fix to handle error path correctly in f2fs_map_blocks In f2fs_map_blocks(), we should bail out once __allocate_data_block() failed. Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7e9fafd44cbc..a3e2ce5a6b22 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1195,10 +1195,10 @@ next_block: if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO && map->m_may_create) { err = __allocate_data_block(&dn, map->m_seg_type); - if (!err) { - blkaddr = dn.data_blkaddr; - set_inode_flag(inode, FI_APPEND_WRITE); - } + if (err) + goto sync_out; + blkaddr = dn.data_blkaddr; + set_inode_flag(inode, FI_APPEND_WRITE); } } else { if (create) { From 9720ee80aa183c6463b3e8c68f51eb979583d0ea Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 28 Aug 2019 17:33:37 +0800 Subject: [PATCH 49/51] f2fs: fix to fallback to buffered IO in IO aligned mode In LFS mode, we allow OPU for direct IO, however, we didn't consider IO alignment feature, so direct IO can trigger 
unaligned IO, let's just fallback to buffered IO to keep correct IO alignment semantics in all places. Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a89ad8cab821..11c5a6d9f849 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3704,9 +3704,12 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, */ if (f2fs_sb_has_blkzoned(sbi)) return true; - if (test_opt(sbi, LFS) && (rw == WRITE) && - block_unaligned_IO(inode, iocb, iter)) - return true; + if (test_opt(sbi, LFS) && (rw == WRITE)) { + if (block_unaligned_IO(inode, iocb, iter)) + return true; + if (F2FS_IO_ALIGNED(sbi)) + return true; + } if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED) && !IS_SWAPFILE(inode)) return true; From 8223ecc456d079ef9b7a1fed237134cf62e9e870 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 28 Aug 2019 17:33:38 +0800 Subject: [PATCH 50/51] f2fs: fix to add missing F2FS_IO_ALIGNED() condition In f2fs_allocate_data_block(), we will reset fio.retry for IO alignment feature instead of IO serialization feature. In addition, spread F2FS_IO_ALIGNED() to check IO alignment feature status explicitly. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +++++- fs/f2fs/segment.c | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a3e2ce5a6b22..adc64d514b79 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -294,6 +294,9 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (test_opt(sbi, LFS) && current->plug) blk_finish_plug(current->plug); + if (F2FS_IO_ALIGNED(sbi)) + goto submit_io; + start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; start %= F2FS_IO_SIZE(sbi); @@ -607,7 +610,8 @@ next: __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { - if ((fio->type == DATA || fio->type == NODE) && + if (F2FS_IO_ALIGNED(sbi) && + (fio->type == DATA || fio->type == NODE) && fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { dec_page_count(sbi, WB_DATA_TYPE(bio_page)); fio->retry = true; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 204524943bc6..808709581481 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3116,12 +3116,14 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, f2fs_inode_chksum_set(sbi, page); } + if (F2FS_IO_ALIGNED(sbi)) + fio->retry = false; + if (add_list) { struct f2fs_bio_info *io; INIT_LIST_HEAD(&fio->list); fio->in_list = true; - fio->retry = false; io = sbi->write_io[fio->type] + fio->temp; spin_lock(&io->io_lock); list_add_tail(&fio->list, &io->io_list); From fbbf779989d2ef9a51daaa4e53c0b2ecc8c55c4e Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Tue, 17 Sep 2019 10:19:23 +0530 Subject: [PATCH 51/51] f2fs: add a condition to detect overflow in f2fs_ioc_gc_range() end = range.start + range.len; If the range.start/range.len is a very large value, then end can overflow in this operation. It results into a crash in get_valid_blocks() when accessing the invalid range.start segno. This issue is reported in ioctl fuzz testing. 
Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index aea82f2b9240..e4b78fb3fc79 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2264,9 +2264,9 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) return -EROFS; end = range.start + range.len; - if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) { + if (end < range.start || range.start < MAIN_BLKADDR(sbi) || + end >= MAX_BLKADDR(sbi)) return -EINVAL; - } ret = mnt_want_write_file(filp); if (ret)