From e08ac99fa2a25626f573cfa377ef3ddedf2cfe8f Mon Sep 17 00:00:00 2001 From: Artem Blagodarenko Date: Wed, 21 Jun 2017 21:09:57 -0400 Subject: [PATCH 01/47] ext4: add largedir feature This INCOMPAT_LARGEDIR feature allows larger directories to be created in ldiskfs, both with directory sizes over 2GB and a maximum htree depth of 3 instead of the current limit of 2. These features are needed in order to exceed the current limit of approximately 10M entries in a single directory. This patch was originally written by Yang Sheng to support the Lustre server. [ Bumped the credits needed to update an indexed directory -- tytso ] Signed-off-by: Liang Zhen Signed-off-by: Yang Sheng Signed-off-by: Artem Blagodarenko Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger --- fs/ext4/ext4.h | 23 ++++++-- fs/ext4/ext4_jbd2.h | 9 +++- fs/ext4/inode.c | 4 +- fs/ext4/namei.c | 124 ++++++++++++++++++++++++++++++-------------- 4 files changed, 113 insertions(+), 47 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 32191548abed..f17a4e7075be 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1800,7 +1800,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ EXT4_FEATURE_INCOMPAT_ENCRYPT | \ - EXT4_FEATURE_INCOMPAT_CSUM_SEED) + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -2126,6 +2127,16 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) */ #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) +/* htree levels for ext4 */ +#define EXT4_HTREE_LEVEL_COMPAT 2 +#define EXT4_HTREE_LEVEL 3 + +static inline int ext4_dir_htree_level(struct super_block *sb) +{ + return ext4_has_feature_largedir(sb) ? + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; +} + /* * Timeout and state flag for lazy initialization inode thread. 
*/ @@ -2756,13 +2767,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); } -static inline loff_t ext4_isize(struct ext4_inode *raw_inode) +static inline loff_t ext4_isize(struct super_block *sb, + struct ext4_inode *raw_inode) { - if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) + if (ext4_has_feature_largedir(sb) || + S_ISREG(le16_to_cpu(raw_inode->i_mode))) return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | le32_to_cpu(raw_inode->i_size_lo); - else - return (loff_t) le32_to_cpu(raw_inode->i_size_lo); + + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); } static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index f97611171023..5e61e464d71c 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -77,7 +77,14 @@ #define EXT4_RESERVE_TRANS_BLOCKS 12U -#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 +/* + * Number of credits needed if we need to insert an entry into a + * directory. For each new index block, we need 4 blocks (old index + * block, new index block, bitmap block, bg summary). For normal + * htree directories there are 2 levels; if the largedir feature + * enabled it's 3 levels. 
+ */ +#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U #ifdef CONFIG_QUOTA /* Amount of blocks needed for quota update - we know that the structure was diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5cf82d03968c..47604d1352fc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4712,7 +4712,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; - inode->i_size = ext4_isize(raw_inode); + inode->i_size = ext4_isize(sb, raw_inode); if ((size = i_size_read(inode)) < 0) { EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); ret = -EFSCORRUPTED; @@ -5037,7 +5037,7 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - if (ei->i_disksize != ext4_isize(raw_inode)) { + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { ext4_isize_set(raw_inode, ei->i_disksize); need_datasync = 1; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 404256caf9cf..423e1f761768 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -513,7 +513,7 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) { - return le32_to_cpu(entry->block) & 0x00ffffff; + return le32_to_cpu(entry->block) & 0x0fffffff; } static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) @@ -739,6 +739,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); u32 hash; + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); frame->bh = ext4_read_dirblock(dir, 0, INDEX); if (IS_ERR(frame->bh)) return (struct dx_frame *) frame->bh; @@ -768,9 +769,15 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, } indirect = root->info.indirect_levels; - if (indirect > 1) { - ext4_warning_inode(dir, "Unimplemented hash 
depth: %#06x", - root->info.indirect_levels); + if (indirect >= ext4_dir_htree_level(dir->i_sb)) { + ext4_warning(dir->i_sb, + "Directory (ino: %lu) htree depth %#06x exceed" + "supported value", dir->i_ino, + ext4_dir_htree_level(dir->i_sb)); + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { + ext4_warning(dir->i_sb, "Enable large directory " + "feature to access it"); + } goto fail; } @@ -859,12 +866,19 @@ fail: static void dx_release(struct dx_frame *frames) { + struct dx_root_info *info; + int i; + if (frames[0].bh == NULL) return; - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) - brelse(frames[1].bh); - brelse(frames[0].bh); + info = &((struct dx_root *)frames[0].bh->b_data)->info; + for (i = 0; i <= info->indirect_levels; i++) { + if (frames[i].bh == NULL) + break; + brelse(frames[i].bh); + frames[i].bh = NULL; + } } /* @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, { struct dx_hash_info hinfo; struct ext4_dir_entry_2 *de; - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct inode *dir; ext4_lblk_t block; int count = 0; @@ -1485,7 +1499,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_dir_entry_2 **res_dir) { struct super_block * sb = dir->i_sb; - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct buffer_head *bh; ext4_lblk_t block; int retval; @@ -1889,7 +1903,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, */ dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); - dir->i_version++; + inode_inc_iversion(dir); ext4_mark_inode_dirty(handle, dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirent_node(handle, dir, bh); @@ -1908,7 +1922,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, { struct buffer_head *bh2; struct dx_root *root; - struct dx_frame 
frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries; struct ext4_dir_entry_2 *de, *de2; struct ext4_dir_entry_tail *t; @@ -2127,13 +2141,16 @@ out: static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries, *at; struct buffer_head *bh; struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; + int restart; int err; +again: + restart = 0; frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return PTR_ERR(frame); @@ -2155,24 +2172,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, if (err != -ENOSPC) goto cleanup; + err = 0; /* Block full, should compress but for now just split */ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", dx_get_count(entries), dx_get_limit(entries))); /* Need to split index? */ if (dx_get_count(entries) == dx_get_limit(entries)) { ext4_lblk_t newblock; - unsigned icount = dx_get_count(entries); - int levels = frame - frames; + int levels = frame - frames + 1; + unsigned int icount; + int add_level = 1; struct dx_entry *entries2; struct dx_node *node2; struct buffer_head *bh2; - if (levels && (dx_get_count(frames->entries) == - dx_get_limit(frames->entries))) { - ext4_warning_inode(dir, "Directory index full!"); + while (frame > frames) { + if (dx_get_count((frame - 1)->entries) < + dx_get_limit((frame - 1)->entries)) { + add_level = 0; + break; + } + frame--; /* split higher index block */ + at = frame->at; + entries = frame->entries; + restart = 1; + } + if (add_level && levels == ext4_dir_htree_level(sb)) { + ext4_warning(sb, "Directory (ino: %lu) index full, " + "reach max htree level :%d", + dir->i_ino, levels); + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { + ext4_warning(sb, "Large directory feature is " + "not enabled on this " + "filesystem"); + } err = 
-ENOSPC; goto cleanup; } + icount = dx_get_count(entries); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { err = PTR_ERR(bh2); @@ -2187,7 +2224,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, err = ext4_journal_get_write_access(handle, frame->bh); if (err) goto journal_error; - if (levels) { + if (!add_level) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", @@ -2195,7 +2232,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ err = ext4_journal_get_write_access(handle, - frames[0].bh); + (frame - 1)->bh); if (err) goto journal_error; @@ -2211,17 +2248,25 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, frame->entries = entries = entries2; swap(frame->bh, bh2); } - dx_insert_block(frames + 0, hash2, newblock); - dxtrace(dx_show_index("node", frames[1].entries)); + dx_insert_block((frame - 1), hash2, newblock); + dxtrace(dx_show_index("node", frame->entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_handle_dirty_dx_node(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); + err = ext4_handle_dirty_dx_node(handle, dir, + (frame - 1)->bh); + if (err) + goto journal_error; + if (restart) { + err = ext4_handle_dirty_dx_node(handle, dir, + frame->bh); + goto journal_error; + } } else { - dxtrace(printk(KERN_DEBUG - "Creating second level index...\n")); + struct dx_root *dxroot; memcpy((char *) entries2, (char *) entries, icount * sizeof(struct dx_entry)); dx_set_limit(entries2, dx_node_limit(dir)); @@ -2229,22 +2274,18 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, /* Set up root */ dx_set_count(entries, 1); dx_set_block(entries + 0, newblock); - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; - 
- /* Add new access path frame */ - frame = frames + 1; - frame->at = at = at - entries + entries2; - frame->entries = entries = entries2; - frame->bh = bh2; - err = ext4_journal_get_write_access(handle, - frame->bh); + dxroot = (struct dx_root *)frames[0].bh->b_data; + dxroot->info.indirect_levels += 1; + dxtrace(printk(KERN_DEBUG + "Creating %d level index...\n", + info->indirect_levels)); + err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; - } - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); - if (err) { - ext4_std_error(inode->i_sb, err); - goto cleanup; + err = ext4_handle_dirty_dx_node(handle, dir, bh2); + brelse(bh2); + restart = 1; + goto journal_error; } } de = do_split(handle, dir, &bh, frame, &fname->hinfo); @@ -2256,10 +2297,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, goto cleanup; journal_error: - ext4_std_error(dir->i_sb, err); + ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ cleanup: brelse(bh); dx_release(frames); + /* @restart is true means htree-path has been changed, we need to + * repeat dx_probe() to find out valid htree-path + */ + if (restart && err == 0) + goto again; return err; } @@ -2296,7 +2342,7 @@ int ext4_generic_delete_entry(handle_t *handle, blocksize); else de->inode = 0; - dir->i_version++; + inode_inc_iversion(dir); return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); From e50e5129f384ae282adebfb561189cdb19b81cee Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Wed, 21 Jun 2017 21:10:32 -0400 Subject: [PATCH 02/47] ext4: xattr-in-inode support Large xattr support is implemented for EXT4_FEATURE_INCOMPAT_EA_INODE. If the size of an xattr value is larger than will fit in a single external block, then the xattr value will be saved into the body of an external xattr inode. This also helps support a larger number of xattrs, since only the headers will be stored in the in-inode space or the single external block. 
The inode is referenced from the xattr header via "e_value_inum", which was formerly "e_value_block", but that field was never used. The e_value_size still contains the xattr size so that listing xattrs does not need to look up the inode if the data is not accessed. struct ext4_xattr_entry { __u8 e_name_len; /* length of name */ __u8 e_name_index; /* attribute name index */ __le16 e_value_offs; /* offset in disk block of value */ __le32 e_value_inum; /* inode in which value is stored */ __le32 e_value_size; /* size of attribute value */ __le32 e_hash; /* hash value of name and value */ char e_name[0]; /* attribute name */ }; The xattr inode is marked with the EXT4_EA_INODE_FL flag and also holds a back-reference to the owning inode in its i_mtime field, allowing the ext4/e2fsck to verify the correct inode is accessed. [ Applied fix by Dan Carpenter to avoid freeing an ERR_PTR. ] Lustre-Jira: https://jira.hpdd.intel.com/browse/LU-80 Lustre-bugzilla: https://bugzilla.lustre.org/show_bug.cgi?id=4424 Signed-off-by: Kalpak Shah Signed-off-by: James Simmons Signed-off-by: Andreas Dilger Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o Signed-off-by: Dan Carpenter --- fs/ext4/ext4.h | 12 + fs/ext4/ialloc.c | 1 - fs/ext4/inline.c | 2 +- fs/ext4/inode.c | 49 ++++- fs/ext4/xattr.c | 563 +++++++++++++++++++++++++++++++++++++++++++---- fs/ext4/xattr.h | 33 ++- 6 files changed, 604 insertions(+), 56 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f17a4e7075be..41e26ad86fc3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1797,6 +1797,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ EXT4_FEATURE_INCOMPAT_ENCRYPT | \ @@ -2230,6 +2231,12 @@ struct mmpd_data { */ #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL +/* + * Maximum size of xattr attributes for 
FEATURE_INCOMPAT_EA_INODE 1Mb + * This limit is arbitrary, but is reasonable for the xattr API. + */ +#define EXT4_XATTR_MAX_LARGE_EA_SIZE (1024 * 1024) + /* * Function prototypes */ @@ -2242,6 +2249,10 @@ struct mmpd_data { # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, +struct ext4_xattr_ino_array { + unsigned int xia_count; /* # of used item in the array */ + unsigned int xia_inodes[0]; +}; /* bitmap.c */ extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, @@ -2489,6 +2500,7 @@ extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); +extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 98ac2f1f23b3..e2eb3cc06820 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -294,7 +294,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) * as writing the quota to disk may need the lock as well. */ dquot_initialize(inode); - ext4_xattr_delete_inode(handle, inode); dquot_free_inode(inode); dquot_drop(inode); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 8d141c0c8ff9..28c5c3abddb3 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -61,7 +61,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode, /* Compute min_offs. 
*/ for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { - if (!entry->e_value_block && entry->e_value_size) { + if (!entry->e_value_inum && entry->e_value_size) { size_t offs = le16_to_cpu(entry->e_value_offs); if (offs < min_offs) min_offs = offs; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 47604d1352fc..986efd9511ac 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -139,8 +139,6 @@ static void ext4_invalidatepage(struct page *page, unsigned int offset, unsigned int length); static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents); /* * Test whether an inode is a fast symlink. @@ -189,6 +187,8 @@ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; + int extra_credits = 3; + struct ext4_xattr_ino_array *lea_ino_array = NULL; trace_ext4_evict_inode(inode); @@ -238,8 +238,8 @@ void ext4_evict_inode(struct inode *inode) * protection against it */ sb_start_intwrite(inode->i_sb); - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, - ext4_blocks_for_truncate(inode)+3); + + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -251,9 +251,36 @@ void ext4_evict_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); goto no_delete; } - if (IS_SYNC(inode)) ext4_handle_sync(handle); + + /* + * Delete xattr inode before deleting the main inode. 
+ */ + err = ext4_xattr_delete_inode(handle, inode, &lea_ino_array); + if (err) { + ext4_warning(inode->i_sb, + "couldn't delete inode's xattr (err %d)", err); + goto stop_handle; + } + + if (!IS_NOQUOTA(inode)) + extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb); + + if (!ext4_handle_has_enough_credits(handle, + ext4_blocks_for_truncate(inode) + extra_credits)) { + err = ext4_journal_extend(handle, + ext4_blocks_for_truncate(inode) + extra_credits); + if (err > 0) + err = ext4_journal_restart(handle, + ext4_blocks_for_truncate(inode) + extra_credits); + if (err != 0) { + ext4_warning(inode->i_sb, + "couldn't extend journal (err %d)", err); + goto stop_handle; + } + } + inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { @@ -277,10 +304,10 @@ void ext4_evict_inode(struct inode *inode) * enough credits left in the handle to remove the inode from * the orphan list and set the dtime field. */ - if (!ext4_handle_has_enough_credits(handle, 3)) { - err = ext4_journal_extend(handle, 3); + if (!ext4_handle_has_enough_credits(handle, extra_credits)) { + err = ext4_journal_extend(handle, extra_credits); if (err > 0) - err = ext4_journal_restart(handle, 3); + err = ext4_journal_restart(handle, extra_credits); if (err != 0) { ext4_warning(inode->i_sb, "couldn't extend journal (err %d)", err); @@ -315,8 +342,12 @@ void ext4_evict_inode(struct inode *inode) ext4_clear_inode(inode); else ext4_free_inode(handle, inode); + ext4_journal_stop(handle); sb_end_intwrite(inode->i_sb); + + if (lea_ino_array != NULL) + ext4_xattr_inode_array_free(inode, lea_ino_array); return; no_delete: ext4_clear_inode(inode); /* We must guarantee clearing of inode... 
*/ @@ -5504,7 +5535,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks, * * Also account for superblock, inode, quota and xattr blocks */ -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, +int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 5d3c2536641c..7dd80d16f98e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -177,9 +177,8 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, /* Check the values */ while (!IS_LAST_ENTRY(entry)) { - if (entry->e_value_block != 0) - return -EFSCORRUPTED; - if (entry->e_value_size != 0) { + if (entry->e_value_size != 0 && + entry->e_value_inum == 0) { u16 offs = le16_to_cpu(entry->e_value_offs); u32 size = le32_to_cpu(entry->e_value_size); void *value; @@ -269,6 +268,99 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, return cmp ? -ENODATA : 0; } +/* + * Read the EA value from an inode. + */ +static int +ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t *size) +{ + unsigned long block = 0; + struct buffer_head *bh = NULL; + int blocksize; + size_t csize, ret_size = 0; + + if (*size == 0) + return 0; + + blocksize = ea_inode->i_sb->s_blocksize; + + while (ret_size < *size) { + csize = (*size - ret_size) > blocksize ? blocksize : + *size - ret_size; + bh = ext4_bread(NULL, ea_inode, block, 0); + if (IS_ERR(bh)) { + *size = ret_size; + return PTR_ERR(bh); + } + memcpy(buf, bh->b_data, csize); + brelse(bh); + + buf += csize; + block += 1; + ret_size += csize; + } + + *size = ret_size; + + return 0; +} + +struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, int *err) +{ + struct inode *ea_inode = NULL; + + ea_inode = ext4_iget(parent->i_sb, ea_ino); + if (IS_ERR(ea_inode) || is_bad_inode(ea_inode)) { + int rc = IS_ERR(ea_inode) ? 
PTR_ERR(ea_inode) : 0; + ext4_error(parent->i_sb, "error while reading EA inode %lu " + "/ %d %d", ea_ino, rc, is_bad_inode(ea_inode)); + *err = rc != 0 ? rc : -EIO; + return NULL; + } + + if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != parent->i_ino || + ea_inode->i_generation != parent->i_generation) { + ext4_error(parent->i_sb, "Backpointer from EA inode %lu " + "to parent invalid.", ea_ino); + *err = -EINVAL; + goto error; + } + + if (!(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL)) { + ext4_error(parent->i_sb, "EA inode %lu does not have " + "EXT4_EA_INODE_FL flag set.\n", ea_ino); + *err = -EINVAL; + goto error; + } + + *err = 0; + return ea_inode; + +error: + iput(ea_inode); + return NULL; +} + +/* + * Read the value from the EA inode. + */ +static int +ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer, + size_t *size) +{ + struct inode *ea_inode = NULL; + int err; + + ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err); + if (err) + return err; + + err = ext4_xattr_inode_read(ea_inode, buffer, size); + iput(ea_inode); + + return err; +} + static int ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) @@ -308,8 +400,16 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, error = -ERANGE; if (size > buffer_size) goto cleanup; - memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), - size); + if (entry->e_value_inum) { + error = ext4_xattr_inode_get(inode, + le32_to_cpu(entry->e_value_inum), + buffer, &size); + if (error) + goto cleanup; + } else { + memcpy(buffer, bh->b_data + + le16_to_cpu(entry->e_value_offs), size); + } } error = size; @@ -350,8 +450,16 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, error = -ERANGE; if (size > buffer_size) goto cleanup; - memcpy(buffer, (void *)IFIRST(header) + - le16_to_cpu(entry->e_value_offs), size); + if (entry->e_value_inum) { + error = ext4_xattr_inode_get(inode, + 
le32_to_cpu(entry->e_value_inum), + buffer, &size); + if (error) + goto cleanup; + } else { + memcpy(buffer, (void *)IFIRST(header) + + le16_to_cpu(entry->e_value_offs), size); + } } error = size; @@ -620,7 +728,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (last->e_value_size) { + if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; @@ -631,16 +739,171 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, return (*min_offs - ((void *)last - base) - sizeof(__u32)); } -static int -ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) +/* + * Write the value of the EA in an inode. + */ +static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, + const void *buf, int bufsize) +{ + struct buffer_head *bh = NULL; + unsigned long block = 0; + unsigned blocksize = ea_inode->i_sb->s_blocksize; + unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; + int csize, wsize = 0; + int ret = 0; + int retries = 0; + +retry: + while (ret >= 0 && ret < max_blocks) { + struct ext4_map_blocks map; + map.m_lblk = block += ret; + map.m_len = max_blocks -= ret; + + ret = ext4_map_blocks(handle, ea_inode, &map, + EXT4_GET_BLOCKS_CREATE); + if (ret <= 0) { + ext4_mark_inode_dirty(handle, ea_inode); + if (ret == -ENOSPC && + ext4_should_retry_alloc(ea_inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + break; + } + } + + if (ret < 0) + return ret; + + block = 0; + while (wsize < bufsize) { + if (bh != NULL) + brelse(bh); + csize = (bufsize - wsize) > blocksize ? 
blocksize : + bufsize - wsize; + bh = ext4_getblk(handle, ea_inode, block, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + ret = ext4_journal_get_write_access(handle, bh); + if (ret) + goto out; + + memcpy(bh->b_data, buf, csize); + set_buffer_uptodate(bh); + ext4_handle_dirty_metadata(handle, ea_inode, bh); + + buf += csize; + wsize += csize; + block += 1; + } + + inode_lock(ea_inode); + i_size_write(ea_inode, wsize); + ext4_update_i_disksize(ea_inode, wsize); + inode_unlock(ea_inode); + + ext4_mark_inode_dirty(handle, ea_inode); + +out: + brelse(bh); + + return ret; +} + +/* + * Create an inode to store the value of a large EA. + */ +static struct inode *ext4_xattr_inode_create(handle_t *handle, + struct inode *inode) +{ + struct inode *ea_inode = NULL; + + /* + * Let the next inode be the goal, so we try and allocate the EA inode + * in the same group, or nearby one. + */ + ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, + S_IFREG | 0600, NULL, inode->i_ino + 1, NULL); + if (!IS_ERR(ea_inode)) { + ea_inode->i_op = &ext4_file_inode_operations; + ea_inode->i_fop = &ext4_file_operations; + ext4_set_aops(ea_inode); + ea_inode->i_generation = inode->i_generation; + EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL; + + /* + * A back-pointer from EA inode to parent inode will be useful + * for e2fsck. + */ + EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino); + unlock_new_inode(ea_inode); + } + + return ea_inode; +} + +/* + * Unlink the inode storing the value of the EA. + */ +int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino) +{ + struct inode *ea_inode = NULL; + int err; + + ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err); + if (err) + return err; + + clear_nlink(ea_inode); + iput(ea_inode); + + return 0; +} + +/* + * Add value of the EA in an inode. 
+ */ +static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode, + unsigned long *ea_ino, const void *value, + size_t value_len) +{ + struct inode *ea_inode; + int err; + + /* Create an inode for the EA value */ + ea_inode = ext4_xattr_inode_create(handle, inode); + if (IS_ERR(ea_inode)) + return PTR_ERR(ea_inode); + + err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); + if (err) + clear_nlink(ea_inode); + else + *ea_ino = ea_inode->i_ino; + + iput(ea_inode); + + return err; +} + +static int ext4_xattr_set_entry(struct ext4_xattr_info *i, + struct ext4_xattr_search *s, + handle_t *handle, struct inode *inode) { struct ext4_xattr_entry *last; size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); + int in_inode = i->in_inode; + int rc; + + if (ext4_has_feature_ea_inode(inode->i_sb) && + (EXT4_XATTR_SIZE(i->value_len) > + EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) + in_inode = 1; /* Compute min_offs and last. */ last = s->first; for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (last->e_value_size) { + if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) min_offs = offs; @@ -648,15 +911,20 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) } free = min_offs - ((void *)last - s->base) - sizeof(__u32); if (!s->not_found) { - if (s->here->e_value_size) { + if (!in_inode && + !s->here->e_value_inum && s->here->e_value_size) { size_t size = le32_to_cpu(s->here->e_value_size); free += EXT4_XATTR_SIZE(size); } free += EXT4_XATTR_LEN(name_len); } if (i->value) { - if (free < EXT4_XATTR_LEN(name_len) + - EXT4_XATTR_SIZE(i->value_len)) + size_t value_len = EXT4_XATTR_SIZE(i->value_len); + + if (in_inode) + value_len = 0; + + if (free < EXT4_XATTR_LEN(name_len) + value_len) return -ENOSPC; } @@ -670,7 +938,8 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) s->here->e_name_len = 
name_len; memcpy(s->here->e_name, i->name, name_len); } else { - if (s->here->e_value_size) { + if (!s->here->e_value_inum && s->here->e_value_size && + s->here->e_value_offs > 0) { void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(s->here->e_value_offs); void *val = s->base + offs; @@ -704,12 +973,18 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) last = s->first; while (!IS_LAST_ENTRY(last)) { size_t o = le16_to_cpu(last->e_value_offs); - if (last->e_value_size && o < offs) + if (!last->e_value_inum && + last->e_value_size && o < offs) last->e_value_offs = cpu_to_le16(o + size); last = EXT4_XATTR_NEXT(last); } } + if (s->here->e_value_inum) { + ext4_xattr_inode_unlink(inode, + le32_to_cpu(s->here->e_value_inum)); + s->here->e_value_inum = 0; + } if (!i->value) { /* Remove the old name. */ size_t size = EXT4_XATTR_LEN(name_len); @@ -722,11 +997,20 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) if (i->value) { /* Insert the new value. 
*/ - s->here->e_value_size = cpu_to_le32(i->value_len); - if (i->value_len) { + if (in_inode) { + unsigned long ea_ino = + le32_to_cpu(s->here->e_value_inum); + rc = ext4_xattr_inode_set(handle, inode, &ea_ino, + i->value, i->value_len); + if (rc) + goto out; + s->here->e_value_inum = cpu_to_le32(ea_ino); + s->here->e_value_offs = 0; + } else if (i->value_len) { size_t size = EXT4_XATTR_SIZE(i->value_len); void *val = s->base + min_offs - size; s->here->e_value_offs = cpu_to_le16(min_offs - size); + s->here->e_value_inum = 0; if (i->value == EXT4_ZERO_XATTR_VALUE) { memset(val, 0, size); } else { @@ -736,8 +1020,11 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) memcpy(val, i->value, i->value_len); } } + s->here->e_value_size = cpu_to_le32(i->value_len); } - return 0; + +out: + return rc; } struct ext4_xattr_block_find { @@ -801,8 +1088,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, #define header(x) ((struct ext4_xattr_header *)(x)) - if (i->value && i->value_len > sb->s_blocksize) - return -ENOSPC; if (s->base) { BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bs->bh); @@ -821,7 +1106,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, mb_cache_entry_delete_block(ext4_mb_cache, hash, bs->bh->b_blocknr); ea_bdebug(bs->bh, "modifying in-place"); - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode); if (!error) { if (!IS_LAST_ENTRY(s->first)) ext4_xattr_rehash(header(s->base), @@ -870,7 +1155,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, s->end = s->base + sb->s_blocksize; } - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode); if (error == -EFSCORRUPTED) goto bad_block; if (error) @@ -1070,7 +1355,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; - error = ext4_xattr_set_entry(i, s); + error = 
ext4_xattr_set_entry(i, s, handle, inode); if (error) { if (error == -ENOSPC && ext4_has_inline_data(inode)) { @@ -1082,7 +1367,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, error = ext4_xattr_ibody_find(inode, i, is); if (error) return error; - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode); } if (error) return error; @@ -1098,7 +1383,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, return 0; } -static int ext4_xattr_ibody_set(struct inode *inode, +static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { @@ -1108,7 +1393,7 @@ static int ext4_xattr_ibody_set(struct inode *inode, if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode); if (error) return error; header = IHDR(inode, ext4_raw_inode(&is->iloc)); @@ -1155,7 +1440,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, .name = name, .value = value, .value_len = value_len, - + .in_inode = 0, }; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, @@ -1204,7 +1489,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, } if (!value) { if (!is.s.not_found) - error = ext4_xattr_ibody_set(inode, &i, &is); + error = ext4_xattr_ibody_set(handle, inode, &i, &is); else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { @@ -1215,7 +1500,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) goto cleanup; - error = ext4_xattr_ibody_set(inode, &i, &is); + error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); @@ -1226,11 +1511,20 @@ ext4_xattr_set_handle(handle_t *handle, 
struct inode *inode, int name_index, goto cleanup; } error = ext4_xattr_block_set(handle, inode, &i, &bs); + if (ext4_has_feature_ea_inode(inode->i_sb) && + error == -ENOSPC) { + /* xattr not fit to block, store at external + * inode */ + i.in_inode = 1; + error = ext4_xattr_ibody_set(handle, inode, + &i, &is); + } if (error) goto cleanup; if (!is.s.not_found) { i.value = NULL; - error = ext4_xattr_ibody_set(inode, &i, &is); + error = ext4_xattr_ibody_set(handle, inode, &i, + &is); } } } @@ -1269,12 +1563,26 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, const void *value, size_t value_len, int flags) { handle_t *handle; + struct super_block *sb = inode->i_sb; int error, retries = 0; int credits = ext4_jbd2_credits_xattr(inode); error = dquot_initialize(inode); if (error) return error; + + if ((value_len >= EXT4_XATTR_MIN_LARGE_EA_SIZE(sb->s_blocksize)) && + ext4_has_feature_ea_inode(sb)) { + int nrblocks = (value_len + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + + /* For new inode */ + credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3; + + /* For data blocks of EA inode */ + credits += ext4_meta_trans_blocks(inode, nrblocks, 0); + } + retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { @@ -1286,7 +1594,7 @@ retry: value, value_len, flags); error2 = ext4_journal_stop(handle); if (error == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) + ext4_should_retry_alloc(sb, &retries)) goto retry; if (error == 0) error = error2; @@ -1311,7 +1619,7 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, /* Adjust the value offsets of the entries */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (last->e_value_size) { + if (!last->e_value_inum && last->e_value_size) { new_offs = le16_to_cpu(last->e_value_offs) + value_offs_shift; last->e_value_offs = cpu_to_le16(new_offs); @@ -1372,7 +1680,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode 
*inode, goto out; /* Remove the chosen entry from the inode */ - error = ext4_xattr_ibody_set(inode, &i, is); + error = ext4_xattr_ibody_set(handle, inode, &i, is); if (error) goto out; @@ -1572,21 +1880,135 @@ cleanup: } +#define EIA_INCR 16 /* must be 2^n */ +#define EIA_MASK (EIA_INCR - 1) +/* Add the large xattr @ino into @lea_ino_array for later deletion. + * If @lea_ino_array is new or full it will be grown and the old + * contents copied over. + */ +static int +ext4_expand_ino_array(struct ext4_xattr_ino_array **lea_ino_array, __u32 ino) +{ + if (*lea_ino_array == NULL) { + /* + * Start with 15 inodes, so it fits into a power-of-two size. + * If *lea_ino_array is NULL, this is essentially offsetof() + */ + (*lea_ino_array) = + kmalloc(offsetof(struct ext4_xattr_ino_array, + xia_inodes[EIA_MASK]), + GFP_NOFS); + if (*lea_ino_array == NULL) + return -ENOMEM; + (*lea_ino_array)->xia_count = 0; + } else if (((*lea_ino_array)->xia_count & EIA_MASK) == EIA_MASK) { + /* expand the array once all 15 + n * 16 slots are full */ + struct ext4_xattr_ino_array *new_array = NULL; + int count = (*lea_ino_array)->xia_count; + + /* if new_array is NULL, this is essentially offsetof() */ + new_array = kmalloc( + offsetof(struct ext4_xattr_ino_array, + xia_inodes[count + EIA_INCR]), + GFP_NOFS); + if (new_array == NULL) + return -ENOMEM; + memcpy(new_array, *lea_ino_array, + offsetof(struct ext4_xattr_ino_array, + xia_inodes[count])); + kfree(*lea_ino_array); + *lea_ino_array = new_array; + } + (*lea_ino_array)->xia_inodes[(*lea_ino_array)->xia_count++] = ino; + return 0; +} + +/** + * Add xattr inode to orphan list + */ +static int +ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, + int credits, struct ext4_xattr_ino_array *lea_ino_array) +{ + struct inode *ea_inode = NULL; + int idx = 0, error = 0; + + if (lea_ino_array == NULL) + return 0; + + for (; idx < lea_ino_array->xia_count; ++idx) { + if (!ext4_handle_has_enough_credits(handle, credits)) { + error 
= ext4_journal_extend(handle, credits); + if (error > 0) + error = ext4_journal_restart(handle, credits); + + if (error != 0) { + ext4_warning(inode->i_sb, + "couldn't extend journal " + "(err %d)", error); + return error; + } + } + ea_inode = ext4_xattr_inode_iget(inode, + lea_ino_array->xia_inodes[idx], &error); + if (error) + continue; + ext4_orphan_add(handle, ea_inode); + /* the inode's i_count will be released by caller */ + } + + return 0; +} /* * ext4_xattr_delete_inode() * - * Free extended attribute resources associated with this inode. This + * Free extended attribute resources associated with this inode. Traverse + * all entries and unlink any xattr inodes associated with this inode. This * is called immediately before an inode is freed. We have exclusive - * access to the inode. + * access to the inode. If an orphan inode is deleted it will also delete any + * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget() + * to ensure they belong to the parent inode and were not deleted already. 
*/ -void -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) +int +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, + struct ext4_xattr_ino_array **lea_ino_array) { struct buffer_head *bh = NULL; + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + struct ext4_xattr_entry *entry; + int credits = 3, error = 0; - if (!EXT4_I(inode)->i_file_acl) + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + goto delete_external_ea; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) goto cleanup; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + if (ext4_expand_ino_array(lea_ino_array, + entry->e_value_inum) != 0) { + brelse(iloc.bh); + goto cleanup; + } + entry->e_value_inum = 0; + } + brelse(iloc.bh); + +delete_external_ea: + if (!EXT4_I(inode)->i_file_acl) { + /* add xattr inode to orphan list */ + ext4_xattr_inode_orphan_add(handle, inode, credits, + *lea_ino_array); + goto cleanup; + } bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); if (!bh) { EXT4_ERROR_INODE(inode, "block %llu read error", @@ -1599,11 +2021,69 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) EXT4_I(inode)->i_file_acl); goto cleanup; } + + for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + if (ext4_expand_ino_array(lea_ino_array, + entry->e_value_inum) != 0) + goto cleanup; + entry->e_value_inum = 0; + } + + /* add xattr inode to orphan list */ + error = ext4_xattr_inode_orphan_add(handle, inode, credits, + *lea_ino_array); + if (error != 0) + goto cleanup; + + if (!IS_NOQUOTA(inode)) + credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb); + + if (!ext4_handle_has_enough_credits(handle, credits)) { + error = ext4_journal_extend(handle, credits); + if (error > 0) + error = 
ext4_journal_restart(handle, credits); + if (error != 0) { + ext4_warning(inode->i_sb, + "couldn't extend journal (err %d)", error); + goto cleanup; + } + } + ext4_xattr_release_block(handle, inode, bh); EXT4_I(inode)->i_file_acl = 0; cleanup: brelse(bh); + + return error; +} + +void +ext4_xattr_inode_array_free(struct inode *inode, + struct ext4_xattr_ino_array *lea_ino_array) +{ + struct inode *ea_inode = NULL; + int idx = 0; + int err; + + if (lea_ino_array == NULL) + return; + + for (; idx < lea_ino_array->xia_count; ++idx) { + ea_inode = ext4_xattr_inode_iget(inode, + lea_ino_array->xia_inodes[idx], &err); + if (err) + continue; + /* for inode's i_count get from ext4_xattr_delete_inode */ + if (!list_empty(&EXT4_I(ea_inode)->i_orphan)) + iput(ea_inode); + clear_nlink(ea_inode); + iput(ea_inode); + } + kfree(lea_ino_array); } /* @@ -1655,10 +2135,9 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, entry1->e_name_index != entry2->e_name_index || entry1->e_name_len != entry2->e_name_len || entry1->e_value_size != entry2->e_value_size || + entry1->e_value_inum != entry2->e_value_inum || memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) return 1; - if (entry1->e_value_block != 0 || entry2->e_value_block != 0) - return -EFSCORRUPTED; if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), (char *)header2 + le16_to_cpu(entry2->e_value_offs), le32_to_cpu(entry1->e_value_size))) @@ -1730,7 +2209,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, *name++; } - if (entry->e_value_size != 0) { + if (!entry->e_value_inum && entry->e_value_size) { __le32 *value = (__le32 *)((char *)header + le16_to_cpu(entry->e_value_offs)); for (n = (le32_to_cpu(entry->e_value_size) + diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 099c8b670ef5..6e10ff9393d4 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -44,7 +44,7 @@ struct ext4_xattr_entry { __u8 e_name_len; /* length of name */ __u8 e_name_index; /* attribute name index */ 
__le16 e_value_offs; /* offset in disk block of value */ - __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_inum; /* inode in which the value is stored */ __le32 e_value_size; /* size of attribute value */ __le32 e_hash; /* hash value of name and value */ char e_name[0]; /* attribute name */ @@ -69,6 +69,26 @@ struct ext4_xattr_entry { EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) +/* + * Link EA inode back to parent one using i_mtime field. + * Extra integer type conversion added to ignore higher + * bits in i_mtime.tv_sec which might be set by ext4_get() + */ +#define EXT4_XATTR_INODE_SET_PARENT(inode, inum) \ +do { \ + (inode)->i_mtime.tv_sec = inum; \ +} while(0) + +#define EXT4_XATTR_INODE_GET_PARENT(inode) \ +((__u32)(inode)->i_mtime.tv_sec) + +/* + * The minimum size of EA value when you start storing it in an external inode + * size of block - size of header - size of 1 entry - 4 null bytes +*/ +#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \ + ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4) + #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) #define BFIRST(bh) ENTRY(BHDR(bh)+1) @@ -77,10 +97,11 @@ struct ext4_xattr_entry { #define EXT4_ZERO_XATTR_VALUE ((void *)-1) struct ext4_xattr_info { - int name_index; const char *name; const void *value; size_t value_len; + int name_index; + int in_inode; }; struct ext4_xattr_search { @@ -140,7 +161,13 @@ extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); -extern void ext4_xattr_delete_inode(handle_t *, struct inode *); +extern struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, + int *err); +extern int 
ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino); +extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, + struct ext4_xattr_ino_array **array); +extern void ext4_xattr_inode_array_free(struct inode *inode, + struct ext4_xattr_ino_array *array); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); From 33d201e0277b2d496f66b621f63693ced2da4198 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 21:17:10 -0400 Subject: [PATCH 03/47] ext4: fix lockdep warning about recursive inode locking Setting a large xattr value may require writing the attribute contents to an external inode. In this case we may need to lock the xattr inode along with the parent inode. This doesn't pose a deadlock risk because xattr inodes are not directly visible to the user and their access is restricted. Assign a lockdep subclass to xattr inode's lock. ============================================ WARNING: possible recursive locking detected 4.12.0-rc1+ #740 Not tainted -------------------------------------------- python/1822 is trying to acquire lock: (&sb->s_type->i_mutex_key#15){+.+...}, at: [] ext4_xattr_set_entry+0x65a/0x7b0 but task is already holding lock: (&sb->s_type->i_mutex_key#15){+.+...}, at: [] vfs_setxattr+0x57/0xb0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&sb->s_type->i_mutex_key#15); lock(&sb->s_type->i_mutex_key#15); *** DEADLOCK *** May be due to missing lock nesting notation 4 locks held by python/1822: #0: (sb_writers#10){.+.+.+}, at: [] mnt_want_write+0x1f/0x50 #1: (&sb->s_type->i_mutex_key#15){+.+...}, at: [] vfs_setxattr+0x57/0xb0 #2: (jbd2_handle){.+.+..}, at: [] start_this_handle+0xf0/0x420 #3: (&ei->xattr_sem){++++..}, at: [] ext4_xattr_set_handle+0x9a/0x4f0 stack backtrace: CPU: 0 PID: 1822 Comm: python Not tainted 4.12.0-rc1+ #740 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
Bochs 01/01/2011 Call Trace: dump_stack+0x67/0x9e __lock_acquire+0x5f3/0x1750 lock_acquire+0xb5/0x1d0 down_write+0x2c/0x60 ext4_xattr_set_entry+0x65a/0x7b0 ext4_xattr_block_set+0x1b2/0x9b0 ext4_xattr_set_handle+0x322/0x4f0 ext4_xattr_set+0x144/0x1a0 ext4_xattr_user_set+0x34/0x40 __vfs_setxattr+0x66/0x80 __vfs_setxattr_noperm+0x69/0x1c0 vfs_setxattr+0xa2/0xb0 setxattr+0x12e/0x150 path_setxattr+0x87/0xb0 SyS_setxattr+0xf/0x20 entry_SYSCALL_64_fastpath+0x18/0xad Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 ++ fs/ext4/xattr.c | 8 ++++++++ fs/ext4/xattr.h | 6 ++++++ 3 files changed, 16 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 986efd9511ac..fda70fedf56d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4877,6 +4877,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } brelse(iloc.bh); ext4_set_inode_flags(inode); + if (ei->i_flags & EXT4_EA_INODE_FL) + ext4_xattr_inode_set_class(inode); unlock_new_inode(inode); return inode; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7dd80d16f98e..3d19be8f102e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -107,6 +107,13 @@ const struct xattr_handler *ext4_xattr_handlers[] = { #define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \ inode->i_sb->s_fs_info)->s_mb_cache) +#ifdef CONFIG_LOCKDEP +void ext4_xattr_inode_set_class(struct inode *ea_inode) +{ + lockdep_set_subclass(&ea_inode->i_rwsem, 1); +} +#endif + static __le32 ext4_xattr_block_csum(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) @@ -828,6 +835,7 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle, ea_inode->i_op = &ext4_file_inode_operations; ea_inode->i_fop = &ext4_file_operations; ext4_set_aops(ea_inode); + ext4_xattr_inode_set_class(ea_inode); ea_inode->i_generation = inode->i_generation; EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 6e10ff9393d4..e8bef79bdc38 100644 --- 
a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -196,3 +196,9 @@ static inline int ext4_init_security(handle_t *handle, struct inode *inode, return 0; } #endif + +#ifdef CONFIG_LOCKDEP +extern void ext4_xattr_inode_set_class(struct inode *ea_inode); +#else +static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { } +#endif From 0de5983d354adbf1c9fa57eca8b5dd8155132fb1 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 21:19:16 -0400 Subject: [PATCH 04/47] ext4: lock inode before calling ext4_orphan_add() ext4_orphan_add() requires caller to be holding the inode lock. Add missing lock statements. WARNING: CPU: 3 PID: 1806 at fs/ext4/namei.c:2731 ext4_orphan_add+0x4e/0x240 CPU: 3 PID: 1806 Comm: python Not tainted 4.12.0-rc1+ #746 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 task: ffff880135d466c0 task.stack: ffffc900014b0000 RIP: 0010:ext4_orphan_add+0x4e/0x240 RSP: 0018:ffffc900014b3d50 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff8801348fe1f0 RCX: ffffc900014b3c64 RDX: 0000000000000000 RSI: ffff8801348fe1f0 RDI: ffff8801348fe1f0 RBP: ffffc900014b3da0 R08: 0000000000000000 R09: ffffffff80e82025 R10: 0000000000004692 R11: 000000000000468d R12: ffff880137598000 R13: ffff880137217000 R14: ffff880134ac58d0 R15: 0000000000000000 FS: 00007fc50f09e740(0000) GS:ffff88013fd80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000008bc2e0 CR3: 00000001375ac000 CR4: 00000000000006e0 Call Trace: ext4_xattr_inode_orphan_add.constprop.19+0x9d/0xf0 ext4_xattr_delete_inode+0x1c4/0x2f0 ext4_evict_inode+0x15a/0x7f0 evict+0xc0/0x1a0 iput+0x16a/0x270 do_unlinkat+0x172/0x290 SyS_unlink+0x11/0x20 entry_SYSCALL_64_fastpath+0x18/0xad Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3d19be8f102e..02b0462fec62 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ 
-1961,7 +1961,9 @@ ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, lea_ino_array->xia_inodes[idx], &error); if (error) continue; + inode_lock(ea_inode); ext4_orphan_add(handle, ea_inode); + inode_unlock(ea_inode); /* the inode's i_count will be released by caller */ } From 1b917ed8ae0d4ce2ee3d6c56ac6748cd1cd92d4b Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 21:21:39 -0400 Subject: [PATCH 05/47] ext4: do not set posix acls on xattr inodes We don't need acls on xattr inodes because they are not directly accessible from user mode. Besides lockdep complains about recursive locking of xattr_sem as seen below. ============================================= [ INFO: possible recursive locking detected ] 4.11.0-rc8+ #402 Not tainted --------------------------------------------- python/1894 is trying to acquire lock: (&ei->xattr_sem){++++..}, at: [] ext4_xattr_get+0x66/0x270 but task is already holding lock: (&ei->xattr_sem){++++..}, at: [] ext4_xattr_set_handle+0xa0/0x5d0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&ei->xattr_sem); lock(&ei->xattr_sem); *** DEADLOCK *** May be due to missing lock nesting notation 3 locks held by python/1894: #0: (sb_writers#10){.+.+.+}, at: [] mnt_want_write+0x1f/0x50 #1: (&sb->s_type->i_mutex_key#15){+.+...}, at: [] vfs_setxattr+0x57/0xb0 #2: (&ei->xattr_sem){++++..}, at: [] ext4_xattr_set_handle+0xa0/0x5d0 stack backtrace: CPU: 0 PID: 1894 Comm: python Not tainted 4.11.0-rc8+ #402 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x67/0x99 __lock_acquire+0x5f3/0x1830 lock_acquire+0xb5/0x1d0 down_read+0x2f/0x60 ext4_xattr_get+0x66/0x270 ext4_get_acl+0x43/0x1e0 get_acl+0x72/0xf0 posix_acl_create+0x5e/0x170 ext4_init_acl+0x21/0xc0 __ext4_new_inode+0xffd/0x16b0 ext4_xattr_set_entry+0x5ea/0xb70 ext4_xattr_block_set+0x1b5/0x970 ext4_xattr_set_handle+0x351/0x5d0 ext4_xattr_set+0x124/0x180 
ext4_xattr_user_set+0x34/0x40 __vfs_setxattr+0x66/0x80 __vfs_setxattr_noperm+0x69/0x1c0 vfs_setxattr+0xa2/0xb0 setxattr+0x129/0x160 path_setxattr+0x87/0xb0 SyS_setxattr+0xf/0x20 entry_SYSCALL_64_fastpath+0x18/0xad Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 11 ++++++----- fs/ext4/ialloc.c | 14 +++++++++----- fs/ext4/migrate.c | 2 +- fs/ext4/xattr.c | 3 ++- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 41e26ad86fc3..bc80082a2375 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2411,16 +2411,17 @@ extern int ext4fs_dirhash(const char *name, int len, struct /* ialloc.c */ extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, const struct qstr *qstr, __u32 goal, - uid_t *owner, int handle_type, - unsigned int line_no, int nblocks); + uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks); -#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \ +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ - 0, 0, 0) + i_flags, 0, 0, 0) #define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ type, nblocks) \ __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ - (type), __LINE__, (nblocks)) + 0, (type), __LINE__, (nblocks)) extern void ext4_free_inode(handle_t *, struct inode *); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e2eb3cc06820..fb1b3df17f6e 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -742,8 +742,9 @@ out: */ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, const struct qstr *qstr, - __u32 goal, uid_t *owner, int handle_type, - unsigned int line_no, int nblocks) + __u32 goal, uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -1052,6 +1053,7 @@ got: 
/* Don't inherit extent flag from directory, amongst others. */ ei->i_flags = ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); + ei->i_flags |= i_flags; ei->i_file_acl = 0; ei->i_dtime = 0; ei->i_block_group = group; @@ -1108,9 +1110,11 @@ got: goto fail_free_drop; } - err = ext4_init_acl(handle, inode, dir); - if (err) - goto fail_free_drop; + if (!(ei->i_flags & EXT4_EA_INODE_FL)) { + err = ext4_init_acl(handle, inode, dir); + if (err) + goto fail_free_drop; + } err = ext4_init_security(handle, inode, dir, qstr); if (err) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 364ea4d4a943..cf5181b62df1 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode) owner[0] = i_uid_read(inode); owner[1] = i_gid_read(inode); tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root), - S_IFREG, NULL, goal, owner); + S_IFREG, NULL, goal, owner, 0); if (IS_ERR(tmp_inode)) { retval = PTR_ERR(tmp_inode); ext4_journal_stop(handle); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 02b0462fec62..df032f50436b 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -830,7 +830,8 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle, * in the same group, or nearby one. */ ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, - S_IFREG | 0600, NULL, inode->i_ino + 1, NULL); + S_IFREG | 0600, NULL, inode->i_ino + 1, NULL, + EXT4_EA_INODE_FL); if (!IS_ERR(ea_inode)) { ea_inode->i_op = &ext4_file_inode_operations; ea_inode->i_fop = &ext4_file_operations; From bd3b963b273e247e13979f98812a6e4979b5c1e4 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 21:24:31 -0400 Subject: [PATCH 06/47] ext4: attach jinode after creation of xattr inode In data=ordered mode jinode needs to be attached to the xattr inode when writing data to it. Attachment normally occurs during file open for regular files. 
Since we are not using file interface to write to the xattr inode, the jinode attach needs to be done manually. Otherwise the following crash occurs in data=ordered mode. BUG: unable to handle kernel NULL pointer dereference at (null) IP: jbd2_journal_file_inode+0x37/0x110 PGD 13b3c0067 P4D 13b3c0067 PUD 137660067 PMD 0 Oops: 0000 [#1] SMP CPU: 3 PID: 1877 Comm: python Not tainted 4.12.0-rc1+ #749 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 task: ffff88010e368980 task.stack: ffffc90000374000 RIP: 0010:jbd2_journal_file_inode+0x37/0x110 RSP: 0018:ffffc90000377980 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff880123b06230 RCX: 0000000000280000 RDX: 0000000000000006 RSI: 0000000000000000 RDI: ffff88012c8585d0 RBP: ffffc900003779b0 R08: 0000000000000202 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000400 R12: ffff8801111f81c0 R13: ffff88013b2b6800 R14: ffffc90000377ab0 R15: 0000000000000001 FS: 00007f0c99b77740(0000) GS:ffff88013fd80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000136d91000 CR4: 00000000000006e0 Call Trace: jbd2_journal_inode_add_write+0xe/0x10 ext4_map_blocks+0x59e/0x620 ext4_xattr_set_entry+0x501/0x7d0 ext4_xattr_block_set+0x1b2/0x9b0 ext4_xattr_set_handle+0x322/0x4f0 ext4_xattr_set+0x144/0x1a0 ext4_xattr_user_set+0x34/0x40 __vfs_setxattr+0x66/0x80 __vfs_setxattr_noperm+0x69/0x1c0 vfs_setxattr+0xa2/0xb0 setxattr+0x12e/0x150 path_setxattr+0x87/0xb0 SyS_setxattr+0xf/0x20 entry_SYSCALL_64_fastpath+0x18/0xad Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index df032f50436b..0b77ab944c3f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -824,6 +824,7 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle, struct inode *inode) { struct inode *ea_inode = NULL; + int err; /* * Let the next inode be the goal, so 
we try and allocate the EA inode @@ -846,6 +847,11 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle, */ EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino); unlock_new_inode(ea_inode); + err = ext4_inode_attach_jinode(ea_inode); + if (err) { + iput(ea_inode); + return ERR_PTR(err); + } } return ea_inode; From 9e1ba00161a6f3bec8d4e7912025cbf889878e59 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 21:27:00 -0400 Subject: [PATCH 07/47] ext4: ea_inode owner should be the same as the inode owner Quota charging is based on the ownership of the inode. Currently, the xattr inode owner is set to the caller which may be different from the parent inode owner. This is inconsistent with how quota is charged for xattr block and regular data block writes. Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 0b77ab944c3f..a29e68293d59 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -824,6 +824,7 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle, struct inode *inode) { struct inode *ea_inode = NULL; + uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) }; int err; /* @@ -831,7 +832,7 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle, * in the same group, or nearby one. */ ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, - S_IFREG | 0600, NULL, inode->i_ino + 1, NULL, + S_IFREG | 0600, NULL, inode->i_ino + 1, owner, EXT4_EA_INODE_FL); if (!IS_ERR(ea_inode)) { ea_inode->i_op = &ext4_file_inode_operations; From ddfa17e4adc4bd19c32216aaa6250dc38b0579df Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 21:36:51 -0400 Subject: [PATCH 08/47] ext4: call journal revoke when freeing ea_inode blocks ea_inode contents are treated as metadata, that's why it is journaled during initial writes. 
Failing to call revoke during freeing could cause user data to be overwritten with original ea_inode contents during journal replay. Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 3 ++- fs/ext4/indirect.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3e36508610b7..e0a8425ff74d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2488,7 +2488,8 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int extents) static inline int get_default_free_blocks_flags(struct inode *inode) { - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; else if (ext4_should_journal_data(inode)) return EXT4_FREE_BLOCKS_FORGET; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index bc15c2c17633..7ffa290cbb8e 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -829,7 +829,8 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, int flags = EXT4_FREE_BLOCKS_VALIDATED; int err; - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; else if (ext4_should_journal_data(inode)) flags |= EXT4_FREE_BLOCKS_FORGET; From 1e7d359d710e84b996bd034f4ecc7c721e445603 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 21:39:38 -0400 Subject: [PATCH 09/47] ext4: fix ref counting for ea_inode The ref count on ea_inode is incremented by ext4_xattr_inode_orphan_add() which is supposed to be decremented by ext4_xattr_inode_array_free(). The decrement is conditioned on whether the ea_inode is currently on the orphan list. However, the orphan list addition only happens when journaling is enabled. 
In non-journaled case, we fail to release the ref count causing an error message like below. "VFS: Busy inodes after unmount of sdb. Self-destruct in 5 seconds. Have a nice day..." Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index a29e68293d59..53698f8a6e54 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2096,8 +2096,7 @@ ext4_xattr_inode_array_free(struct inode *inode, if (err) continue; /* for inode's i_count get from ext4_xattr_delete_inode */ - if (!list_empty(&EXT4_I(ea_inode)->i_orphan)) - iput(ea_inode); + iput(ea_inode); clear_nlink(ea_inode); iput(ea_inode); } From 0eefb10758e696616f19a84d8c5f15b9ffc0dccd Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 21:41:37 -0400 Subject: [PATCH 10/47] ext4: extended attribute value size limit is enforced by vfs EXT4_XATTR_MAX_LARGE_EA_SIZE definition in ext4 is currently unused. Besides, vfs enforces its own 64k limit which makes the 1MB limit in ext4 redundant. Remove it. Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bc80082a2375..9b6a10e1bf18 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2231,12 +2231,6 @@ struct mmpd_data { */ #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL -/* - * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb - * This limit is arbitrary, but is reasonable for the xattr API. - */ -#define EXT4_XATTR_MAX_LARGE_EA_SIZE (1024 * 1024) - /* * Function prototypes */ From bab79b04999ccbbf59f1693d0783cd6ae27e4278 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 21:49:53 -0400 Subject: [PATCH 11/47] ext4: change ext4_xattr_inode_iget() signature In general, kernel functions indicate success/failure through their return values. 
This function returns the status as an output parameter and reserves the return value for the inode. Make it follow the general convention. Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 75 +++++++++++++++++++++++++++---------------------- fs/ext4/xattr.h | 2 -- 2 files changed, 41 insertions(+), 36 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 53698f8a6e54..f16a90824d44 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -312,40 +312,47 @@ ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t *size) return 0; } -struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, int *err) +static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, + struct inode **ea_inode) { - struct inode *ea_inode = NULL; + struct inode *inode; + int err; - ea_inode = ext4_iget(parent->i_sb, ea_ino); - if (IS_ERR(ea_inode) || is_bad_inode(ea_inode)) { - int rc = IS_ERR(ea_inode) ? PTR_ERR(ea_inode) : 0; + inode = ext4_iget(parent->i_sb, ea_ino); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); ext4_error(parent->i_sb, "error while reading EA inode %lu " - "/ %d %d", ea_ino, rc, is_bad_inode(ea_inode)); - *err = rc != 0 ? 
rc : -EIO; - return NULL; + "err=%d", ea_ino, err); + return err; } - if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != parent->i_ino || - ea_inode->i_generation != parent->i_generation) { - ext4_error(parent->i_sb, "Backpointer from EA inode %lu " - "to parent invalid.", ea_ino); - *err = -EINVAL; + if (is_bad_inode(inode)) { + ext4_error(parent->i_sb, "error while reading EA inode %lu " + "is_bad_inode", ea_ino); + err = -EIO; goto error; } - if (!(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL)) { + if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino || + inode->i_generation != parent->i_generation) { + ext4_error(parent->i_sb, "Backpointer from EA inode %lu " + "to parent is invalid.", ea_ino); + err = -EINVAL; + goto error; + } + + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { ext4_error(parent->i_sb, "EA inode %lu does not have " "EXT4_EA_INODE_FL flag set.\n", ea_ino); - *err = -EINVAL; + err = -EINVAL; goto error; } - *err = 0; - return ea_inode; - + *ea_inode = inode; + return 0; error: - iput(ea_inode); - return NULL; + iput(inode); + return err; } /* @@ -355,17 +362,17 @@ static int ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer, size_t *size) { - struct inode *ea_inode = NULL; - int err; + struct inode *ea_inode; + int ret; - ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err); - if (err) - return err; + ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); + if (ret) + return ret; - err = ext4_xattr_inode_read(ea_inode, buffer, size); + ret = ext4_xattr_inode_read(ea_inode, buffer, size); iput(ea_inode); - return err; + return ret; } static int @@ -866,7 +873,7 @@ int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino) struct inode *ea_inode = NULL; int err; - ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err); + err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); if (err) return err; @@ -1946,7 +1953,7 @@ static int ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits, 
struct ext4_xattr_ino_array *lea_ino_array) { - struct inode *ea_inode = NULL; + struct inode *ea_inode; int idx = 0, error = 0; if (lea_ino_array == NULL) @@ -1965,8 +1972,8 @@ ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, return error; } } - ea_inode = ext4_xattr_inode_iget(inode, - lea_ino_array->xia_inodes[idx], &error); + error = ext4_xattr_inode_iget(inode, + lea_ino_array->xia_inodes[idx], &ea_inode); if (error) continue; inode_lock(ea_inode); @@ -2083,7 +2090,7 @@ void ext4_xattr_inode_array_free(struct inode *inode, struct ext4_xattr_ino_array *lea_ino_array) { - struct inode *ea_inode = NULL; + struct inode *ea_inode; int idx = 0; int err; @@ -2091,8 +2098,8 @@ ext4_xattr_inode_array_free(struct inode *inode, return; for (; idx < lea_ino_array->xia_count; ++idx) { - ea_inode = ext4_xattr_inode_iget(inode, - lea_ino_array->xia_inodes[idx], &err); + err = ext4_xattr_inode_iget(inode, + lea_ino_array->xia_inodes[idx], &ea_inode); if (err) continue; /* for inode's i_count get from ext4_xattr_delete_inode */ diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index e8bef79bdc38..b6ef99d1a061 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -161,8 +161,6 @@ extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); -extern struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, - int *err); extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino); extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_xattr_ino_array **array); From 909666933210eb145bc93426ce07f66bf3d1f798 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 21:57:36 -0400 Subject: [PATCH 12/47] ext4: clean up ext4_xattr_inode_get() The input and output values of *size 
parameter are equal on successful return from ext4_xattr_inode_get(). On error return, the callers ignore the output value so there is no need to update it. Also check for NULL return from ext4_bread(). If the actual xattr inode size happens to be smaller than the expected size, ext4_bread() may return NULL which would indicate data corruption. Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index f16a90824d44..61c67a04a7e1 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -278,37 +278,28 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, /* * Read the EA value from an inode. */ -static int -ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t *size) +static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) { unsigned long block = 0; struct buffer_head *bh = NULL; - int blocksize; - size_t csize, ret_size = 0; + int blocksize = ea_inode->i_sb->s_blocksize; + size_t csize, copied = 0; - if (*size == 0) - return 0; - - blocksize = ea_inode->i_sb->s_blocksize; - - while (ret_size < *size) { - csize = (*size - ret_size) > blocksize ? blocksize : - *size - ret_size; + while (copied < size) { + csize = (size - copied) > blocksize ? 
blocksize : size - copied; bh = ext4_bread(NULL, ea_inode, block, 0); - if (IS_ERR(bh)) { - *size = ret_size; + if (IS_ERR(bh)) return PTR_ERR(bh); - } + if (!bh) + return -EFSCORRUPTED; + memcpy(buf, bh->b_data, csize); brelse(bh); buf += csize; block += 1; - ret_size += csize; + copied += csize; } - - *size = ret_size; - return 0; } @@ -360,7 +351,7 @@ error: */ static int ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer, - size_t *size) + size_t size) { struct inode *ea_inode; int ret; @@ -417,7 +408,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, if (entry->e_value_inum) { error = ext4_xattr_inode_get(inode, le32_to_cpu(entry->e_value_inum), - buffer, &size); + buffer, size); if (error) goto cleanup; } else { @@ -467,7 +458,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, if (entry->e_value_inum) { error = ext4_xattr_inode_get(inode, le32_to_cpu(entry->e_value_inum), - buffer, &size); + buffer, size); if (error) goto cleanup; } else { From 990461dd85d57875accc798919e6fe42ab7e294d Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 21:59:30 -0400 Subject: [PATCH 13/47] ext4: add missing le32_to_cpu(e_value_inum) conversions Two places in code missed converting xattr inode number using le32_to_cpu(). 
Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 61c67a04a7e1..3983bc455d02 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1995,6 +1995,7 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_inode *raw_inode; struct ext4_iloc iloc; struct ext4_xattr_entry *entry; + unsigned int ea_ino; int credits = 3, error = 0; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) @@ -2009,8 +2010,8 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; - if (ext4_expand_ino_array(lea_ino_array, - entry->e_value_inum) != 0) { + ea_ino = le32_to_cpu(entry->e_value_inum); + if (ext4_expand_ino_array(lea_ino_array, ea_ino) != 0) { brelse(iloc.bh); goto cleanup; } @@ -2042,8 +2043,8 @@ delete_external_ea: entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; - if (ext4_expand_ino_array(lea_ino_array, - entry->e_value_inum) != 0) + ea_ino = le32_to_cpu(entry->e_value_inum); + if (ext4_expand_ino_array(lea_ino_array, ea_ino) != 0) goto cleanup; entry->e_value_inum = 0; } From 0bd454c04f02e7bb101d8ff510b54826eda4a5f0 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 22:02:06 -0400 Subject: [PATCH 14/47] ext4: ext4_xattr_value_same() should return false for external data ext4_xattr_value_same() is used as a quick optimization in case the new xattr value is identical to the previous value. When xattr value is stored in a xattr inode the check becomes expensive so it is better to just assume that they are not equal. 
Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3983bc455d02..13b7fa4cbf16 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1426,6 +1426,9 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s, { void *value; + /* When e_value_inum is set the value is stored externally. */ + if (s->here->e_value_inum) + return 0; if (le32_to_cpu(s->here->e_value_size) != i->value_len) return 0; value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs); From 9bb21cedda7cd69789e1f93d7d918f5ca48ba165 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 22:05:44 -0400 Subject: [PATCH 15/47] ext4: fix ext4_xattr_make_inode_space() value size calculation ext4_xattr_make_inode_space() is interested in calculating the inline space used in an inode. When a xattr entry refers to an external inode the value size indicates the external inode size, not the value size in the inline area. Change the function to take this into account. 
Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 13b7fa4cbf16..2be891ffeda1 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1745,9 +1745,10 @@ static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, last = IFIRST(header); /* Find the entry best suited to be pushed into EA block */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - total_size = - EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + - EXT4_XATTR_LEN(last->e_name_len); + total_size = EXT4_XATTR_LEN(last->e_name_len); + if (!last->e_value_inum) + total_size += EXT4_XATTR_SIZE( + le32_to_cpu(last->e_value_size)); if (total_size <= bfree && total_size < min_total_size) { if (total_size + ifree < isize_diff) { @@ -1766,8 +1767,10 @@ static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, } entry_size = EXT4_XATTR_LEN(entry->e_name_len); - total_size = entry_size + - EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); + total_size = entry_size; + if (!entry->e_value_inum) + total_size += EXT4_XATTR_SIZE( + le32_to_cpu(entry->e_value_size)); error = ext4_xattr_move_to_block(handle, inode, raw_inode, entry); if (error) From f6109100ba8692c677cfdc88af1887a43263e63a Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 22:11:54 -0400 Subject: [PATCH 16/47] ext4: fix ext4_xattr_move_to_block() When moving xattr entries from inline area to a xattr block, entries that refer to external xattr inodes need special handling because value data is not available in the inline area but rather should be read from its external inode. 
Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 2be891ffeda1..bd1e61a0c228 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1656,18 +1656,16 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, struct ext4_xattr_ibody_find *is = NULL; struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; - size_t value_offs, value_size; + size_t value_size = le32_to_cpu(entry->e_value_size); struct ext4_xattr_info i = { .value = NULL, .value_len = 0, .name_index = entry->e_name_index, + .in_inode = !!entry->e_value_inum, }; struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); int error; - value_offs = le16_to_cpu(entry->e_value_offs); - value_size = le32_to_cpu(entry->e_value_size); - is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); buffer = kmalloc(value_size, GFP_NOFS); @@ -1683,7 +1681,17 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, bs->bh = NULL; /* Save the entry name and the entry value */ - memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size); + if (entry->e_value_inum) { + error = ext4_xattr_inode_get(inode, + le32_to_cpu(entry->e_value_inum), + buffer, value_size); + if (error) + goto out; + } else { + size_t value_offs = le16_to_cpu(entry->e_value_offs); + memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size); + } + memcpy(b_entry_name, entry->e_name, entry->e_name_len); b_entry_name[entry->e_name_len] = '\0'; i.name = b_entry_name; @@ -1701,7 +1709,6 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, if (error) goto out; - i.name = b_entry_name; i.value = buffer; i.value_len = value_size; error = ext4_xattr_block_find(inode, &i, bs); From 7cec191894e4e2200d942415c3ebccb146214d26 Mon Sep 17 
00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 22:14:30 -0400 Subject: [PATCH 17/47] ext4: fix ext4_xattr_cmp() When a xattr entry refers to an external inode, the value data is not available in the inline area so we should not attempt to read it using value offset. Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index bd1e61a0c228..ed27b5241e69 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2167,7 +2167,8 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, entry1->e_value_inum != entry2->e_value_inum || memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) return 1; - if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + if (!entry1->e_value_inum && + memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), (char *)header2 + le16_to_cpu(entry2->e_value_offs), le32_to_cpu(entry1->e_value_size))) return 1; From b3155298910c64a312620309f320d26e9461eb19 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 22:16:20 -0400 Subject: [PATCH 18/47] ext4: fix credits calculation for xattr inode When there is no space for a value in xattr block, it may be stored in an xattr inode even if the value length is less than EXT4_XATTR_MIN_LARGE_EA_SIZE(). So the current assumption in credits calculation is wrong. 
Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index ed27b5241e69..9932254c3de9 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1588,8 +1588,7 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, if (error) return error; - if ((value_len >= EXT4_XATTR_MIN_LARGE_EA_SIZE(sb->s_blocksize)) && - ext4_has_feature_ea_inode(sb)) { + if (ext4_has_feature_ea_inode(sb)) { int nrblocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits; From b347e2bcd18eba7ed44659d12e4a39a9b5bdc873 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 22:20:32 -0400 Subject: [PATCH 19/47] ext4: retry storing value in external inode with xattr block too When value size is <= EXT4_XATTR_MIN_LARGE_EA_SIZE(), and it doesn't fit in either inline or xattr block, a second try is made to store it in an external inode while storing the entry itself in inline area. There should also be an attempt to store the entry in xattr block. This patch adds a retry loop to do that. It also makes the caller the sole decider on whether to store a value in an external inode. Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 9932254c3de9..ab94d6ee496b 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -909,11 +909,6 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, int in_inode = i->in_inode; int rc; - if (ext4_has_feature_ea_inode(inode->i_sb) && - (EXT4_XATTR_SIZE(i->value_len) > - EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) - in_inode = 1; - /* Compute min_offs and last. 
*/ last = s->first; for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { @@ -1095,7 +1090,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, { struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; - struct ext4_xattr_search *s = &bs->s; + struct ext4_xattr_search s_copy = bs->s; + struct ext4_xattr_search *s = &s_copy; struct mb_cache_entry *ce = NULL; int error = 0; struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); @@ -1517,6 +1513,11 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) goto cleanup; + if (ext4_has_feature_ea_inode(inode->i_sb) && + (EXT4_XATTR_SIZE(i.value_len) > + EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) + i.in_inode = 1; +retry_inode: error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; @@ -1528,20 +1529,20 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, goto cleanup; } error = ext4_xattr_block_set(handle, inode, &i, &bs); - if (ext4_has_feature_ea_inode(inode->i_sb) && - error == -ENOSPC) { - /* xattr not fit to block, store at external - * inode */ - i.in_inode = 1; - error = ext4_xattr_ibody_set(handle, inode, - &i, &is); - } - if (error) - goto cleanup; - if (!is.s.not_found) { + if (!error && !is.s.not_found) { i.value = NULL; error = ext4_xattr_ibody_set(handle, inode, &i, &is); + } else if (error == -ENOSPC) { + /* + * Xattr does not fit in the block, store at + * external inode if possible. 
+ */ + if (ext4_has_feature_ea_inode(inode->i_sb) && + !i.in_inode) { + i.in_inode = 1; + goto retry_inode; + } } } } From 65d3000520c50f3c160403a210a7504d789eafca Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 22:24:38 -0400 Subject: [PATCH 20/47] ext4: ext4_xattr_delete_inode() should return accurate errors In a few places the function returns without trying to pass the actual error code to the caller. Fix those. Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index ab94d6ee496b..c8b71bd118b0 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2024,7 +2024,8 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); - if (ext4_expand_ino_array(lea_ino_array, ea_ino) != 0) { + error = ext4_expand_ino_array(lea_ino_array, ea_ino); + if (error) { brelse(iloc.bh); goto cleanup; } @@ -2035,20 +2036,22 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, delete_external_ea: if (!EXT4_I(inode)->i_file_acl) { /* add xattr inode to orphan list */ - ext4_xattr_inode_orphan_add(handle, inode, credits, - *lea_ino_array); + error = ext4_xattr_inode_orphan_add(handle, inode, credits, + *lea_ino_array); goto cleanup; } bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); if (!bh) { EXT4_ERROR_INODE(inode, "block %llu read error", EXT4_I(inode)->i_file_acl); + error = -EIO; goto cleanup; } if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); + error = -EFSCORRUPTED; goto cleanup; } @@ -2057,7 +2060,8 @@ delete_external_ea: if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); - if (ext4_expand_ino_array(lea_ino_array, ea_ino) != 0) + error = 
ext4_expand_ino_array(lea_ino_array, ea_ino); + if (error) goto cleanup; entry->e_value_inum = 0; } @@ -2065,7 +2069,7 @@ delete_external_ea: /* add xattr inode to orphan list */ error = ext4_xattr_inode_orphan_add(handle, inode, credits, *lea_ino_array); - if (error != 0) + if (error) goto cleanup; if (!IS_NOQUOTA(inode)) @@ -2075,7 +2079,7 @@ delete_external_ea: error = ext4_journal_extend(handle, credits); if (error > 0) error = ext4_journal_restart(handle, credits); - if (error != 0) { + if (error) { ext4_warning(inode->i_sb, "couldn't extend journal (err %d)", error); goto cleanup; From c1a5d5f6ab21eb7e6ff8cb99489d9001cf2a2850 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 21 Jun 2017 22:28:40 -0400 Subject: [PATCH 21/47] ext4: improve journal credit handling in set xattr paths Both ext4_set_acl() and ext4_set_context() need to be made aware of ea_inode feature when it comes to credits calculation. Also add a sufficient credits check in ext4_xattr_set_handle() right after xattr write lock is grabbed. Original credits calculation is done outside the lock so there is a possibility that the initially calculated credits are not sufficient anymore. Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/acl.c | 7 +++--- fs/ext4/ext4_jbd2.h | 14 ------------ fs/ext4/super.c | 6 ++--- fs/ext4/xattr.c | 55 +++++++++++++++++++++++++++++++++++---------- fs/ext4/xattr.h | 1 + 5 files changed, 51 insertions(+), 32 deletions(-) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 3ec0e46de95f..74f7ac539e00 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -231,14 +231,15 @@ int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) { handle_t *handle; - int error, retries = 0; + int error, credits, retries = 0; + size_t acl_size = acl ? 
ext4_acl_size(acl->a_count) : 0; error = dquot_initialize(inode); if (error) return error; retry: - handle = ext4_journal_start(inode, EXT4_HT_XATTR, - ext4_jbd2_credits_xattr(inode)); + credits = ext4_xattr_set_credits(inode, acl_size); + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 5e61e464d71c..dabad1bc8617 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -111,20 +111,6 @@ #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) -static inline int ext4_jbd2_credits_xattr(struct inode *inode) -{ - int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); - - /* - * In case of inline data, we may push out the data to a block, - * so we need to reserve credits for this eventuality - */ - if (ext4_has_inline_data(inode)) - credits += ext4_writepage_trans_blocks(inode) + 1; - return credits; -} - - /* * Ext4 handle operation types -- for logging purposes */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d37c81f327e7..b02a23ec92ca 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1143,7 +1143,7 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { handle_t *handle = fs_data; - int res, res2, retries = 0; + int res, res2, credits, retries = 0; res = ext4_convert_inline_data(inode); if (res) @@ -1178,8 +1178,8 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, if (res) return res; retry: - handle = ext4_journal_start(inode, EXT4_HT_MISC, - ext4_jbd2_credits_xattr(inode)); + credits = ext4_xattr_set_credits(inode, len); + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c8b71bd118b0..43a2c075aa1f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ 
-1471,6 +1471,17 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, ext4_write_lock_xattr(inode, &no_expand); + /* Check journal credits under write lock. */ + if (ext4_handle_valid(handle)) { + int credits; + + credits = ext4_xattr_set_credits(inode, value_len); + if (!ext4_handle_has_enough_credits(handle, credits)) { + error = -ENOSPC; + goto cleanup; + } + } + error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) goto cleanup; @@ -1568,6 +1579,36 @@ cleanup: return error; } +int ext4_xattr_set_credits(struct inode *inode, size_t value_len) +{ + struct super_block *sb = inode->i_sb; + int credits; + + if (!EXT4_SB(sb)->s_journal) + return 0; + + credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); + + /* + * In case of inline data, we may push out the data to a block, + * so we need to reserve credits for this eventuality + */ + if (ext4_has_inline_data(inode)) + credits += ext4_writepage_trans_blocks(inode) + 1; + + if (ext4_has_feature_ea_inode(sb)) { + int nrblocks = (value_len + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + + /* For new inode */ + credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3; + + /* For data blocks of EA inode */ + credits += ext4_meta_trans_blocks(inode, nrblocks, 0); + } + return credits; +} + /* * ext4_xattr_set() * @@ -1583,24 +1624,14 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, handle_t *handle; struct super_block *sb = inode->i_sb; int error, retries = 0; - int credits = ext4_jbd2_credits_xattr(inode); + int credits; error = dquot_initialize(inode); if (error) return error; - if (ext4_has_feature_ea_inode(sb)) { - int nrblocks = (value_len + sb->s_blocksize - 1) >> - sb->s_blocksize_bits; - - /* For new inode */ - credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3; - - /* For data blocks of EA inode */ - credits += ext4_meta_trans_blocks(inode, nrblocks, 0); - } - retry: + credits = ext4_xattr_set_credits(inode, value_len); handle = ext4_journal_start(inode, 
EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index b6ef99d1a061..e82c5fe36a26 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -160,6 +160,7 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); +extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len); extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino); extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, From 0421a189bc8cdefa18a34aee962ac0558679b944 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 22 Jun 2017 10:26:31 -0400 Subject: [PATCH 22/47] ext4: modify ext4_xattr_ino_array to hold struct inode * Tracking struct inode * rather than the inode number eliminates the repeated ext4_xattr_inode_iget() call later. The second call cannot fail in practice but still requires explanation when it wants to ignore the return value. Avoid the trouble and make things simple. 
Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 6 ++-- fs/ext4/inode.c | 8 ++--- fs/ext4/xattr.c | 93 ++++++++++++++++++++++++------------------------- fs/ext4/xattr.h | 5 ++- 4 files changed, 53 insertions(+), 59 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9b6a10e1bf18..144a2863ba27 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2243,9 +2243,9 @@ struct mmpd_data { # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, -struct ext4_xattr_ino_array { - unsigned int xia_count; /* # of used item in the array */ - unsigned int xia_inodes[0]; +struct ext4_xattr_inode_array { + unsigned int count; /* # of used items in the array */ + struct inode *inodes[0]; }; /* bitmap.c */ extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fda70fedf56d..1b2a68c5ea42 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -188,7 +188,7 @@ void ext4_evict_inode(struct inode *inode) handle_t *handle; int err; int extra_credits = 3; - struct ext4_xattr_ino_array *lea_ino_array = NULL; + struct ext4_xattr_inode_array *ea_inode_array = NULL; trace_ext4_evict_inode(inode); @@ -257,7 +257,7 @@ void ext4_evict_inode(struct inode *inode) /* * Delete xattr inode before deleting the main inode. */ - err = ext4_xattr_delete_inode(handle, inode, &lea_ino_array); + err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array); if (err) { ext4_warning(inode->i_sb, "couldn't delete inode's xattr (err %d)", err); @@ -345,9 +345,7 @@ void ext4_evict_inode(struct inode *inode) ext4_journal_stop(handle); sb_end_intwrite(inode->i_sb); - - if (lea_ino_array != NULL) - ext4_xattr_inode_array_free(inode, lea_ino_array); + ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: ext4_clear_inode(inode); /* We must guarantee clearing of inode... 
*/ diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 43a2c075aa1f..fed54001c9e6 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1940,44 +1940,44 @@ cleanup: #define EIA_INCR 16 /* must be 2^n */ #define EIA_MASK (EIA_INCR - 1) -/* Add the large xattr @ino into @lea_ino_array for later deletion. - * If @lea_ino_array is new or full it will be grown and the old +/* Add the large xattr @inode into @ea_inode_array for later deletion. + * If @ea_inode_array is new or full it will be grown and the old * contents copied over. */ static int -ext4_expand_ino_array(struct ext4_xattr_ino_array **lea_ino_array, __u32 ino) +ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, + struct inode *inode) { - if (*lea_ino_array == NULL) { + if (*ea_inode_array == NULL) { /* * Start with 15 inodes, so it fits into a power-of-two size. - * If *lea_ino_array is NULL, this is essentially offsetof() + * If *ea_inode_array is NULL, this is essentially offsetof() */ - (*lea_ino_array) = - kmalloc(offsetof(struct ext4_xattr_ino_array, - xia_inodes[EIA_MASK]), + (*ea_inode_array) = + kmalloc(offsetof(struct ext4_xattr_inode_array, + inodes[EIA_MASK]), GFP_NOFS); - if (*lea_ino_array == NULL) + if (*ea_inode_array == NULL) return -ENOMEM; - (*lea_ino_array)->xia_count = 0; - } else if (((*lea_ino_array)->xia_count & EIA_MASK) == EIA_MASK) { + (*ea_inode_array)->count = 0; + } else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) { /* expand the array once all 15 + n * 16 slots are full */ - struct ext4_xattr_ino_array *new_array = NULL; - int count = (*lea_ino_array)->xia_count; + struct ext4_xattr_inode_array *new_array = NULL; + int count = (*ea_inode_array)->count; /* if new_array is NULL, this is essentially offsetof() */ new_array = kmalloc( - offsetof(struct ext4_xattr_ino_array, - xia_inodes[count + EIA_INCR]), + offsetof(struct ext4_xattr_inode_array, + inodes[count + EIA_INCR]), GFP_NOFS); if (new_array == NULL) return -ENOMEM; - memcpy(new_array, 
*lea_ino_array, - offsetof(struct ext4_xattr_ino_array, - xia_inodes[count])); - kfree(*lea_ino_array); - *lea_ino_array = new_array; + memcpy(new_array, *ea_inode_array, + offsetof(struct ext4_xattr_inode_array, inodes[count])); + kfree(*ea_inode_array); + *ea_inode_array = new_array; } - (*lea_ino_array)->xia_inodes[(*lea_ino_array)->xia_count++] = ino; + (*ea_inode_array)->inodes[(*ea_inode_array)->count++] = inode; return 0; } @@ -1985,16 +1985,16 @@ ext4_expand_ino_array(struct ext4_xattr_ino_array **lea_ino_array, __u32 ino) * Add xattr inode to orphan list */ static int -ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, - int credits, struct ext4_xattr_ino_array *lea_ino_array) +ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits, + struct ext4_xattr_inode_array *ea_inode_array) { - struct inode *ea_inode; int idx = 0, error = 0; + struct inode *ea_inode; - if (lea_ino_array == NULL) + if (ea_inode_array == NULL) return 0; - for (; idx < lea_ino_array->xia_count; ++idx) { + for (; idx < ea_inode_array->count; ++idx) { if (!ext4_handle_has_enough_credits(handle, credits)) { error = ext4_journal_extend(handle, credits); if (error > 0) @@ -2007,10 +2007,7 @@ ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, return error; } } - error = ext4_xattr_inode_iget(inode, - lea_ino_array->xia_inodes[idx], &ea_inode); - if (error) - continue; + ea_inode = ea_inode_array->inodes[idx]; inode_lock(ea_inode); ext4_orphan_add(handle, ea_inode); inode_unlock(ea_inode); @@ -2032,13 +2029,14 @@ ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, */ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, - struct ext4_xattr_ino_array **lea_ino_array) + struct ext4_xattr_inode_array **ea_inode_array) { struct buffer_head *bh = NULL; struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; struct ext4_xattr_entry *entry; + struct inode *ea_inode; unsigned int 
ea_ino; int credits = 3, error = 0; @@ -2055,8 +2053,12 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); - error = ext4_expand_ino_array(lea_ino_array, ea_ino); + error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); + if (error) + continue; + error = ext4_expand_inode_array(ea_inode_array, ea_inode); if (error) { + iput(ea_inode); brelse(iloc.bh); goto cleanup; } @@ -2068,7 +2070,7 @@ delete_external_ea: if (!EXT4_I(inode)->i_file_acl) { /* add xattr inode to orphan list */ error = ext4_xattr_inode_orphan_add(handle, inode, credits, - *lea_ino_array); + *ea_inode_array); goto cleanup; } bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); @@ -2091,7 +2093,10 @@ delete_external_ea: if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); - error = ext4_expand_ino_array(lea_ino_array, ea_ino); + error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); + if (error) + continue; + error = ext4_expand_inode_array(ea_inode_array, ea_inode); if (error) goto cleanup; entry->e_value_inum = 0; @@ -2099,7 +2104,7 @@ delete_external_ea: /* add xattr inode to orphan list */ error = ext4_xattr_inode_orphan_add(handle, inode, credits, - *lea_ino_array); + *ea_inode_array); if (error) goto cleanup; @@ -2126,28 +2131,20 @@ cleanup: return error; } -void -ext4_xattr_inode_array_free(struct inode *inode, - struct ext4_xattr_ino_array *lea_ino_array) +void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) { struct inode *ea_inode; int idx = 0; - int err; - if (lea_ino_array == NULL) + if (ea_inode_array == NULL) return; - for (; idx < lea_ino_array->xia_count; ++idx) { - err = ext4_xattr_inode_iget(inode, - lea_ino_array->xia_inodes[idx], &ea_inode); - if (err) - continue; - /* for inode's i_count get from ext4_xattr_delete_inode */ - iput(ea_inode); + for (; idx < ea_inode_array->count; ++idx) { + ea_inode = ea_inode_array->inodes[idx]; 
clear_nlink(ea_inode); iput(ea_inode); } - kfree(lea_ino_array); + kfree(ea_inode_array); } /* diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index e82c5fe36a26..323eba54f72f 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -164,9 +164,8 @@ extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len); extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino); extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, - struct ext4_xattr_ino_array **array); -extern void ext4_xattr_inode_array_free(struct inode *inode, - struct ext4_xattr_ino_array *array); + struct ext4_xattr_inode_array **array); +extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); From b6d9029df083c0a9ce1d4eda1480105e635e0d61 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 22 Jun 2017 10:28:38 -0400 Subject: [PATCH 23/47] ext4: move struct ext4_xattr_inode_array to xattr.h Since this is an xattr-specific data structure, it is cleaner to keep it in the xattr header file.
Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ---- fs/ext4/xattr.h | 5 +++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 144a2863ba27..6b7c517498c9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2243,10 +2243,6 @@ struct mmpd_data { # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, -struct ext4_xattr_inode_array { - unsigned int count; /* # of used items in the array */ - struct inode *inodes[0]; -}; /* bitmap.c */ extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 323eba54f72f..adf761518a73 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -117,6 +117,11 @@ struct ext4_xattr_ibody_find { struct ext4_iloc iloc; }; +struct ext4_xattr_inode_array { + unsigned int count; /* # of used items in the array */ + struct inode *inodes[0]; +}; + extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; extern const struct xattr_handler ext4_xattr_security_handler; From c07dfcb45877fbc6798fa042bab3c4b85378efd4 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 22 Jun 2017 10:29:53 -0400 Subject: [PATCH 24/47] mbcache: make mbcache naming more generic Make names more generic so that mbcache usage is not limited to block sharing. In a subsequent patch in the series ("ext4: xattr inode deduplication"), we start using the mbcache code for sharing xattr inodes. With that patch, old mb_cache_entry.e_block field could be holding either a block number or an inode number. 
Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext2/xattr.c | 18 ++++++++--------- fs/ext4/xattr.c | 10 +++++----- fs/mbcache.c | 43 ++++++++++++++++++++--------------------- include/linux/mbcache.h | 11 +++++------ 4 files changed, 40 insertions(+), 42 deletions(-) diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index fbdb8f171893..1e5f76070580 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -493,8 +493,8 @@ bad_block: ext2_error(sb, "ext2_xattr_set", * This must happen under buffer lock for * ext2_xattr_set2() to reliably detect modified block */ - mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache, - hash, bh->b_blocknr); + mb_cache_entry_delete(EXT2_SB(sb)->s_mb_cache, hash, + bh->b_blocknr); /* keep the buffer locked while modifying it. */ } else { @@ -721,8 +721,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, * This must happen under buffer lock for * ext2_xattr_set2() to reliably detect freed block */ - mb_cache_entry_delete_block(ext2_mb_cache, - hash, old_bh->b_blocknr); + mb_cache_entry_delete(ext2_mb_cache, hash, + old_bh->b_blocknr); /* Free the old block. 
*/ ea_bdebug(old_bh, "freeing"); ext2_free_blocks(inode, old_bh->b_blocknr, 1); @@ -795,8 +795,8 @@ ext2_xattr_delete_inode(struct inode *inode) * This must happen under buffer lock for ext2_xattr_set2() to * reliably detect freed block */ - mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache, - hash, bh->b_blocknr); + mb_cache_entry_delete(EXT2_SB(inode->i_sb)->s_mb_cache, hash, + bh->b_blocknr); ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); get_bh(bh); bforget(bh); @@ -907,11 +907,11 @@ again: while (ce) { struct buffer_head *bh; - bh = sb_bread(inode->i_sb, ce->e_block); + bh = sb_bread(inode->i_sb, ce->e_value); if (!bh) { ext2_error(inode->i_sb, "ext2_xattr_cache_find", "inode %ld: block %ld read error", - inode->i_ino, (unsigned long) ce->e_block); + inode->i_ino, (unsigned long) ce->e_value); } else { lock_buffer(bh); /* @@ -931,7 +931,7 @@ again: } else if (le32_to_cpu(HDR(bh)->h_refcount) > EXT2_XATTR_REFCOUNT_MAX) { ea_idebug(inode, "block %ld refcount %d>%d", - (unsigned long) ce->e_block, + (unsigned long) ce->e_value, le32_to_cpu(HDR(bh)->h_refcount), EXT2_XATTR_REFCOUNT_MAX); } else if (!ext2_xattr_cmp(header, HDR(bh))) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index fed54001c9e6..85da7792afd0 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -678,7 +678,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ - mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr); + mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr); get_bh(bh); unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, @@ -1113,8 +1113,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, * ext4_xattr_block_set() to reliably detect modified * block */ - mb_cache_entry_delete_block(ext4_mb_cache, hash, - bs->bh->b_blocknr); + mb_cache_entry_delete(ext4_mb_cache, hash, + bs->bh->b_blocknr); ea_bdebug(bs->bh, 
"modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode); if (!error) { @@ -2236,10 +2236,10 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, while (ce) { struct buffer_head *bh; - bh = sb_bread(inode->i_sb, ce->e_block); + bh = sb_bread(inode->i_sb, ce->e_value); if (!bh) { EXT4_ERROR_INODE(inode, "block %lu read error", - (unsigned long) ce->e_block); + (unsigned long)ce->e_value); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { *pce = ce; return bh; diff --git a/fs/mbcache.c b/fs/mbcache.c index b19be429d655..45a8d52dc991 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -10,7 +10,7 @@ /* * Mbcache is a simple key-value store. Keys need not be unique, however * key-value pairs are expected to be unique (we use this fact in - * mb_cache_entry_delete_block()). + * mb_cache_entry_delete()). * * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. * They use hash of a block contents as a key and block number as a value. @@ -62,15 +62,15 @@ static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, * @cache - cache where the entry should be created * @mask - gfp mask with which the entry should be allocated * @key - key of the entry - * @block - block that contains data - * @reusable - is the block reusable by other inodes? + * @value - value of the entry + * @reusable - is the entry reusable by others? * - * Creates entry in @cache with key @key and records that data is stored in - * block @block. The function returns -EBUSY if entry with the same key - * and for the same block already exists in cache. Otherwise 0 is returned. + * Creates entry in @cache with key @key and value @value. The function returns + * -EBUSY if entry with the same key and value already exists in cache. + * Otherwise 0 is returned. 
*/ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, - sector_t block, bool reusable) + u64 value, bool reusable) { struct mb_cache_entry *entry, *dup; struct hlist_bl_node *dup_node; @@ -91,12 +91,12 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, /* One ref for hash, one ref returned */ atomic_set(&entry->e_refcnt, 1); entry->e_key = key; - entry->e_block = block; + entry->e_value = value; entry->e_reusable = reusable; head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { - if (dup->e_key == key && dup->e_block == block) { + if (dup->e_key == key && dup->e_value == value) { hlist_bl_unlock(head); kmem_cache_free(mb_entry_cache, entry); return -EBUSY; @@ -187,13 +187,13 @@ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, EXPORT_SYMBOL(mb_cache_entry_find_next); /* - * mb_cache_entry_get - get a cache entry by block number (and key) + * mb_cache_entry_get - get a cache entry by value (and key) * @cache - cache we work with - * @key - key of block number @block - * @block - block number + * @key - key + * @value - value */ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, - sector_t block) + u64 value) { struct hlist_bl_node *node; struct hlist_bl_head *head; @@ -202,7 +202,7 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(entry, node, head, e_hash_list) { - if (entry->e_key == key && entry->e_block == block) { + if (entry->e_key == key && entry->e_value == value) { atomic_inc(&entry->e_refcnt); goto out; } @@ -214,15 +214,14 @@ out: } EXPORT_SYMBOL(mb_cache_entry_get); -/* mb_cache_entry_delete_block - remove information about block from cache +/* mb_cache_entry_delete - remove a cache entry * @cache - cache we work with - * @key - key of block @block - * @block - block number + * @key - key + 
* @value - value * - * Remove entry from cache @cache with key @key with data stored in @block. + * Remove entry from cache @cache with key @key and value @value. */ -void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, - sector_t block) +void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value) { struct hlist_bl_node *node; struct hlist_bl_head *head; @@ -231,7 +230,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(entry, node, head, e_hash_list) { - if (entry->e_key == key && entry->e_block == block) { + if (entry->e_key == key && entry->e_value == value) { /* We keep hash list reference to keep entry alive */ hlist_bl_del_init(&entry->e_hash_list); hlist_bl_unlock(head); @@ -248,7 +247,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, } hlist_bl_unlock(head); } -EXPORT_SYMBOL(mb_cache_entry_delete_block); +EXPORT_SYMBOL(mb_cache_entry_delete); /* mb_cache_entry_touch - cache entry got used * @cache - cache the entry belongs to diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h index 86c9a8b480c5..e1bc73414983 100644 --- a/include/linux/mbcache.h +++ b/include/linux/mbcache.h @@ -19,15 +19,15 @@ struct mb_cache_entry { u32 e_key; u32 e_referenced:1; u32 e_reusable:1; - /* Block number of hashed block - stable during lifetime of the entry */ - sector_t e_block; + /* User provided value - stable during lifetime of the entry */ + u64 e_value; }; struct mb_cache *mb_cache_create(int bucket_bits); void mb_cache_destroy(struct mb_cache *cache); int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, - sector_t block, bool reusable); + u64 value, bool reusable); void __mb_cache_entry_free(struct mb_cache_entry *entry); static inline int mb_cache_entry_put(struct mb_cache *cache, struct mb_cache_entry *entry) @@ -38,10 +38,9 @@ static inline int mb_cache_entry_put(struct mb_cache *cache, return 
1; } -void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, - sector_t block); +void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value); struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, - sector_t block); + u64 value); struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, u32 key); struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, From 47387409ee2e09db6d0e79a026a02073dc56bb8c Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 22 Jun 2017 11:28:55 -0400 Subject: [PATCH 25/47] ext2, ext4: make mb block cache names more explicit There will be a second mb_cache instance that tracks ea_inodes. Make existing names more explicit so that it is clear that they refer to xattr block cache. Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext2/ext2.h | 2 +- fs/ext2/super.c | 16 +++++------ fs/ext2/xattr.c | 36 +++++++++++++------------ fs/ext4/ext4.h | 2 +- fs/ext4/super.c | 18 ++++++------- fs/ext4/xattr.c | 71 ++++++++++++++++++++++++++----------------------- 6 files changed, 75 insertions(+), 70 deletions(-) diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 03f5ce1d3dbe..23ebb92484c6 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -113,7 +113,7 @@ struct ext2_sb_info { * of the mount options. 
*/ spinlock_t s_lock; - struct mb_cache *s_mb_cache; + struct mb_cache *s_ea_block_cache; }; static inline spinlock_t * diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 9c2028b50e5c..7b1bc9059863 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -147,9 +147,9 @@ static void ext2_put_super (struct super_block * sb) ext2_quota_off_umount(sb); - if (sbi->s_mb_cache) { - ext2_xattr_destroy_cache(sbi->s_mb_cache); - sbi->s_mb_cache = NULL; + if (sbi->s_ea_block_cache) { + ext2_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; } if (!(sb->s_flags & MS_RDONLY)) { struct ext2_super_block *es = sbi->s_es; @@ -1131,9 +1131,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } #ifdef CONFIG_EXT2_FS_XATTR - sbi->s_mb_cache = ext2_xattr_create_cache(); - if (!sbi->s_mb_cache) { - ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + sbi->s_ea_block_cache = ext2_xattr_create_cache(); + if (!sbi->s_ea_block_cache) { + ext2_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); goto failed_mount3; } #endif @@ -1182,8 +1182,8 @@ cantfind_ext2: sb->s_id); goto failed_mount; failed_mount3: - if (sbi->s_mb_cache) - ext2_xattr_destroy_cache(sbi->s_mb_cache); + if (sbi->s_ea_block_cache) + ext2_xattr_destroy_cache(sbi->s_ea_block_cache); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 1e5f76070580..1b9b1268d418 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -121,6 +121,8 @@ const struct xattr_handler *ext2_xattr_handlers[] = { NULL }; +#define EA_BLOCK_CACHE(inode) (EXT2_SB(inode->i_sb)->s_ea_block_cache) + static inline const struct xattr_handler * ext2_xattr_handler(int name_index) { @@ -150,7 +152,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, size_t name_len, size; char *end; int error; - struct mb_cache *ext2_mb_cache = 
EXT2_SB(inode->i_sb)->s_mb_cache; + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -195,7 +197,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", goto found; entry = next; } - if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) + if (ext2_xattr_cache_insert(ea_block_cache, bh)) ea_idebug(inode, "cache insert failed"); error = -ENODATA; goto cleanup; @@ -208,7 +210,7 @@ found: le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) goto bad_block; - if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) + if (ext2_xattr_cache_insert(ea_block_cache, bh)) ea_idebug(inode, "cache insert failed"); if (buffer) { error = -ERANGE; @@ -246,7 +248,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) char *end; size_t rest = buffer_size; int error; - struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -281,7 +283,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", goto bad_block; entry = next; } - if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) + if (ext2_xattr_cache_insert(ea_block_cache, bh)) ea_idebug(inode, "cache insert failed"); /* list the attribute names */ @@ -493,7 +495,7 @@ bad_block: ext2_error(sb, "ext2_xattr_set", * This must happen under buffer lock for * ext2_xattr_set2() to reliably detect modified block */ - mb_cache_entry_delete(EXT2_SB(sb)->s_mb_cache, hash, + mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash, bh->b_blocknr); /* keep the buffer locked while modifying it. 
*/ @@ -627,7 +629,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; int error; - struct mb_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache; + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); if (header) { new_bh = ext2_xattr_cache_find(inode, header); @@ -655,7 +657,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, don't need to change the reference count. */ new_bh = old_bh; get_bh(new_bh); - ext2_xattr_cache_insert(ext2_mb_cache, new_bh); + ext2_xattr_cache_insert(ea_block_cache, new_bh); } else { /* We need to allocate a new block */ ext2_fsblk_t goal = ext2_group_first_block_no(sb, @@ -676,7 +678,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, memcpy(new_bh->b_data, header, new_bh->b_size); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext2_xattr_cache_insert(ext2_mb_cache, new_bh); + ext2_xattr_cache_insert(ea_block_cache, new_bh); ext2_xattr_update_super_block(sb); } @@ -721,7 +723,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, * This must happen under buffer lock for * ext2_xattr_set2() to reliably detect freed block */ - mb_cache_entry_delete(ext2_mb_cache, hash, + mb_cache_entry_delete(ea_block_cache, hash, old_bh->b_blocknr); /* Free the old block. 
*/ ea_bdebug(old_bh, "freeing"); @@ -795,7 +797,7 @@ ext2_xattr_delete_inode(struct inode *inode) * This must happen under buffer lock for ext2_xattr_set2() to * reliably detect freed block */ - mb_cache_entry_delete(EXT2_SB(inode->i_sb)->s_mb_cache, hash, + mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash, bh->b_blocknr); ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); get_bh(bh); @@ -897,13 +899,13 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; - struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); again: - ce = mb_cache_entry_find_first(ext2_mb_cache, hash); + ce = mb_cache_entry_find_first(ea_block_cache, hash); while (ce) { struct buffer_head *bh; @@ -924,7 +926,7 @@ again: * entry is still hashed is reliable. 
*/ if (hlist_bl_unhashed(&ce->e_hash_list)) { - mb_cache_entry_put(ext2_mb_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); unlock_buffer(bh); brelse(bh); goto again; @@ -937,14 +939,14 @@ again: } else if (!ext2_xattr_cmp(header, HDR(bh))) { ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); - mb_cache_entry_touch(ext2_mb_cache, ce); - mb_cache_entry_put(ext2_mb_cache, ce); + mb_cache_entry_touch(ea_block_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); return bh; } unlock_buffer(bh); brelse(bh); } - ce = mb_cache_entry_find_next(ext2_mb_cache, ce); + ce = mb_cache_entry_find_next(ea_block_cache, ce); } return NULL; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6b7c517498c9..caf004d0d1b3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1516,7 +1516,7 @@ struct ext4_sb_info { struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; - struct mb_cache *s_mb_cache; + struct mb_cache *s_ea_block_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages. 
*/ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b02a23ec92ca..380389740575 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -927,9 +927,9 @@ static void ext4_put_super(struct super_block *sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } - if (sbi->s_mb_cache) { - ext4_xattr_destroy_cache(sbi->s_mb_cache); - sbi->s_mb_cache = NULL; + if (sbi->s_ea_block_cache) { + ext4_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); @@ -4061,9 +4061,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; no_journal: - sbi->s_mb_cache = ext4_xattr_create_cache(); - if (!sbi->s_mb_cache) { - ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + sbi->s_ea_block_cache = ext4_xattr_create_cache(); + if (!sbi->s_ea_block_cache) { + ext4_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); goto failed_mount_wq; } @@ -4296,9 +4296,9 @@ failed_mount4: if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: - if (sbi->s_mb_cache) { - ext4_xattr_destroy_cache(sbi->s_mb_cache); - sbi->s_mb_cache = NULL; + if (sbi->s_ea_block_cache) { + ext4_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; } if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 85da7792afd0..53980ee164ed 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -72,10 +72,11 @@ # define ea_bdebug(bh, fmt, ...) 
no_printk(fmt, ##__VA_ARGS__) #endif -static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); -static struct buffer_head *ext4_xattr_cache_find(struct inode *, - struct ext4_xattr_header *, - struct mb_cache_entry **); +static void ext4_xattr_block_cache_insert(struct mb_cache *, + struct buffer_head *); +static struct buffer_head * +ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *, + struct mb_cache_entry **); static void ext4_xattr_rehash(struct ext4_xattr_header *, struct ext4_xattr_entry *); @@ -104,8 +105,8 @@ const struct xattr_handler *ext4_xattr_handlers[] = { NULL }; -#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \ - inode->i_sb->s_fs_info)->s_mb_cache) +#define EA_BLOCK_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_ea_block_cache) #ifdef CONFIG_LOCKDEP void ext4_xattr_inode_set_class(struct inode *ea_inode) @@ -374,7 +375,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct ext4_xattr_entry *entry; size_t size; int error; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -395,7 +396,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, error = -EFSCORRUPTED; goto cleanup; } - ext4_xattr_cache_insert(ext4_mb_cache, bh); + ext4_xattr_block_cache_insert(ea_block_cache, bh); entry = BFIRST(bh); error = ext4_xattr_find_entry(&entry, name_index, name, 1); if (error) @@ -541,7 +542,6 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -563,7 +563,7 @@ ext4_xattr_block_list(struct dentry *dentry, char 
*buffer, size_t buffer_size) error = -EFSCORRUPTED; goto cleanup; } - ext4_xattr_cache_insert(ext4_mb_cache, bh); + ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: @@ -660,7 +660,7 @@ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct buffer_head *bh) { - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); u32 hash, ref; int error = 0; @@ -678,7 +678,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ - mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr); + mb_cache_entry_delete(ea_block_cache, hash, bh->b_blocknr); get_bh(bh); unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, @@ -690,11 +690,11 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { struct mb_cache_entry *ce; - ce = mb_cache_entry_get(ext4_mb_cache, hash, + ce = mb_cache_entry_get(ea_block_cache, hash, bh->b_blocknr); if (ce) { ce->e_reusable = 1; - mb_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); } } @@ -1094,7 +1094,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct ext4_xattr_search *s = &s_copy; struct mb_cache_entry *ce = NULL; int error = 0; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); #define header(x) ((struct ext4_xattr_header *)(x)) @@ -1113,7 +1113,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, * ext4_xattr_block_set() to reliably detect modified * block */ - mb_cache_entry_delete(ext4_mb_cache, hash, + mb_cache_entry_delete(ea_block_cache, hash, bs->bh->b_blocknr); ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode); @@ -1121,8 
+1121,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (!IS_LAST_ENTRY(s->first)) ext4_xattr_rehash(header(s->base), s->here); - ext4_xattr_cache_insert(ext4_mb_cache, - bs->bh); + ext4_xattr_block_cache_insert(ea_block_cache, + bs->bh); } ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); @@ -1175,7 +1175,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, inserted: if (!IS_LAST_ENTRY(s->first)) { - new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce); + new_bh = ext4_xattr_block_cache_find(inode, header(s->base), + &ce); if (new_bh) { /* We found an identical block in the cache. */ if (new_bh == bs->bh) @@ -1220,7 +1221,7 @@ inserted: EXT4_C2B(EXT4_SB(sb), 1)); brelse(new_bh); - mb_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); ce = NULL; new_bh = NULL; goto inserted; @@ -1239,8 +1240,8 @@ inserted: if (error) goto cleanup_dquot; } - mb_cache_entry_touch(ext4_mb_cache, ce); - mb_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_touch(ea_block_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. 
*/ @@ -1290,7 +1291,7 @@ getblk_failed: ext4_xattr_block_csum_set(inode, new_bh); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext4_xattr_cache_insert(ext4_mb_cache, new_bh); + ext4_xattr_block_cache_insert(ea_block_cache, new_bh); error = ext4_handle_dirty_metadata(handle, inode, new_bh); if (error) @@ -1308,7 +1309,7 @@ getblk_failed: cleanup: if (ce) - mb_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); @@ -2148,15 +2149,16 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) } /* - * ext4_xattr_cache_insert() + * ext4_xattr_block_cache_insert() * - * Create a new entry in the extended attribute cache, and insert + * Create a new entry in the extended attribute block cache, and insert * it unless such an entry is already in the cache. * * Returns 0, or a negative error number on failure. */ static void -ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) +ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache, + struct buffer_head *bh) { struct ext4_xattr_header *header = BHDR(bh); __u32 hash = le32_to_cpu(header->h_hash); @@ -2164,7 +2166,7 @@ ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) EXT4_XATTR_REFCOUNT_MAX; int error; - error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, + error = mb_cache_entry_create(ea_block_cache, GFP_NOFS, hash, bh->b_blocknr, reusable); if (error) { if (error == -EBUSY) @@ -2214,7 +2216,7 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, } /* - * ext4_xattr_cache_find() + * ext4_xattr_block_cache_find() * * Find an identical extended attribute block. * @@ -2222,17 +2224,18 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, * not found or an error occurred. 
*/ static struct buffer_head * -ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, - struct mb_cache_entry **pce) +ext4_xattr_block_cache_find(struct inode *inode, + struct ext4_xattr_header *header, + struct mb_cache_entry **pce) { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); - ce = mb_cache_entry_find_first(ext4_mb_cache, hash); + ce = mb_cache_entry_find_first(ea_block_cache, hash); while (ce) { struct buffer_head *bh; @@ -2245,7 +2248,7 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, return bh; } brelse(bh); - ce = mb_cache_entry_find_next(ext4_mb_cache, ce); + ce = mb_cache_entry_find_next(ea_block_cache, ce); } return NULL; } From 02749a4c20827649859bf7e2435f1b238c24f935 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 22 Jun 2017 11:31:25 -0400 Subject: [PATCH 26/47] ext4: add ext4_is_quota_file() IS_NOQUOTA() indicates whether quota is disabled for an inode. Ext4 also uses it to check whether an inode is for a quota file. The distinction currently doesn't matter because quota is disabled only for the quota files. When we start disabling quota for other inodes in the future, we will want to make the distinction clear. Replace IS_NOQUOTA() call with ext4_is_quota_file() at places where we are checking for quota files. 
Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ fs/ext4/inode.c | 2 +- fs/ext4/ioctl.c | 4 ++-- fs/ext4/mballoc.c | 2 +- fs/ext4/move_extent.c | 2 +- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index caf004d0d1b3..09983c774d31 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2100,6 +2100,8 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); } +#define ext4_is_quota_file(inode) IS_NOQUOTA(inode) + /* * This structure is stuffed into the struct file's private_data field * for directories. It is where we put information so that we can do diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1b2a68c5ea42..3e9415e2e74d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -739,7 +739,7 @@ out_sem: if (map->m_flags & EXT4_MAP_NEW && !(map->m_flags & EXT4_MAP_UNWRITTEN) && !(flags & EXT4_GET_BLOCKS_ZERO) && - !IS_NOQUOTA(inode) && + !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0c21e22acd74..dde8deb11e59 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -218,7 +218,7 @@ static int ext4_ioctl_setflags(struct inode *inode, unsigned int jflag; /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) + if (ext4_is_quota_file(inode)) goto flags_out; oldflags = ei->i_flags; @@ -342,7 +342,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) err = -EPERM; inode_lock(inode); /* Is it quota file? 
Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) + if (ext4_is_quota_file(inode)) goto out_unlock; err = ext4_get_inode_loc(inode, &iloc); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b7928cddd539..d109a2a2fea0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4464,7 +4464,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, trace_ext4_request_blocks(ar); /* Allow to use superuser reservation for quota file */ - if (IS_NOQUOTA(ar->inode)) + if (ext4_is_quota_file(ar->inode)) ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index c992ef2c2f94..9bb36909ec92 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -484,7 +484,7 @@ mext_check_arguments(struct inode *orig_inode, return -EBUSY; } - if (IS_NOQUOTA(orig_inode) || IS_NOQUOTA(donor_inode)) { + if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) { ext4_debug("ext4 move extent: The argument files should " "not be quota files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); From 30a7eb970c3aae6f1b74b2edea896fdca1cbea38 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 22 Jun 2017 11:42:09 -0400 Subject: [PATCH 27/47] ext4: cleanup transaction restarts during inode deletion During inode deletion, the number of journal credits that will be needed is hard to determine. For that reason we have journal extend/restart calls in several places. Whenever a transaction is restarted, filesystem must be in a consistent state because there is no atomicity guarantee beyond a restart call. Add ext4_xattr_ensure_credits() helper function which takes care of journal extend/restart logic. It also handles getting jbd2 write access and dirty metadata calls. This function is called at every iteration of handling an ea_inode reference. 
Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 66 ++++--------- fs/ext4/xattr.c | 258 +++++++++++++++++++++++++++++++----------------- fs/ext4/xattr.h | 3 +- 3 files changed, 184 insertions(+), 143 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3e9415e2e74d..46def73d3472 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -239,7 +239,11 @@ void ext4_evict_inode(struct inode *inode) */ sb_start_intwrite(inode->i_sb); - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits); + if (!IS_NOQUOTA(inode)) + extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); + + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, + ext4_blocks_for_truncate(inode)+extra_credits); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -251,36 +255,9 @@ void ext4_evict_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); goto no_delete; } + if (IS_SYNC(inode)) ext4_handle_sync(handle); - - /* - * Delete xattr inode before deleting the main inode. 
- */ - err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array); - if (err) { - ext4_warning(inode->i_sb, - "couldn't delete inode's xattr (err %d)", err); - goto stop_handle; - } - - if (!IS_NOQUOTA(inode)) - extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb); - - if (!ext4_handle_has_enough_credits(handle, - ext4_blocks_for_truncate(inode) + extra_credits)) { - err = ext4_journal_extend(handle, - ext4_blocks_for_truncate(inode) + extra_credits); - if (err > 0) - err = ext4_journal_restart(handle, - ext4_blocks_for_truncate(inode) + extra_credits); - if (err != 0) { - ext4_warning(inode->i_sb, - "couldn't extend journal (err %d)", err); - goto stop_handle; - } - } - inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { @@ -298,25 +275,17 @@ void ext4_evict_inode(struct inode *inode) } } - /* - * ext4_ext_truncate() doesn't reserve any slop when it - * restarts journal transactions; therefore there may not be - * enough credits left in the handle to remove the inode from - * the orphan list and set the dtime field. - */ - if (!ext4_handle_has_enough_credits(handle, extra_credits)) { - err = ext4_journal_extend(handle, extra_credits); - if (err > 0) - err = ext4_journal_restart(handle, extra_credits); - if (err != 0) { - ext4_warning(inode->i_sb, - "couldn't extend journal (err %d)", err); - stop_handle: - ext4_journal_stop(handle); - ext4_orphan_del(NULL, inode); - sb_end_intwrite(inode->i_sb); - goto no_delete; - } + /* Remove xattr references. 
*/ + err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array, + extra_credits); + if (err) { + ext4_warning(inode->i_sb, "xattr delete (err %d)", err); +stop_handle: + ext4_journal_stop(handle); + ext4_orphan_del(NULL, inode); + sb_end_intwrite(inode->i_sb); + ext4_xattr_inode_array_free(ea_inode_array); + goto no_delete; } /* @@ -342,7 +311,6 @@ void ext4_evict_inode(struct inode *inode) ext4_clear_inode(inode); else ext4_free_inode(handle, inode); - ext4_journal_stop(handle); sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 53980ee164ed..649dc2953901 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -108,6 +108,10 @@ const struct xattr_handler *ext4_xattr_handlers[] = { #define EA_BLOCK_CACHE(inode) (((struct ext4_sb_info *) \ inode->i_sb->s_fs_info)->s_ea_block_cache) +static int +ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, + struct inode *inode); + #ifdef CONFIG_LOCKDEP void ext4_xattr_inode_set_class(struct inode *ea_inode) { @@ -652,6 +656,128 @@ static void ext4_xattr_update_super_block(handle_t *handle, } } +static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode, + int credits, struct buffer_head *bh, + bool dirty, bool block_csum) +{ + int error; + + if (!ext4_handle_valid(handle)) + return 0; + + if (handle->h_buffer_credits >= credits) + return 0; + + error = ext4_journal_extend(handle, credits - handle->h_buffer_credits); + if (!error) + return 0; + if (error < 0) { + ext4_warning(inode->i_sb, "Extend journal (error %d)", error); + return error; + } + + if (bh && dirty) { + if (block_csum) + ext4_xattr_block_csum_set(inode, bh); + error = ext4_handle_dirty_metadata(handle, NULL, bh); + if (error) { + ext4_warning(inode->i_sb, "Handle metadata (error %d)", + error); + return error; + } + } + + error = ext4_journal_restart(handle, credits); + if (error) { + ext4_warning(inode->i_sb, "Restart journal (error %d)", error); + 
return error; + } + + if (bh) { + error = ext4_journal_get_write_access(handle, bh); + if (error) { + ext4_warning(inode->i_sb, + "Get write access failed (error %d)", + error); + return error; + } + } + return 0; +} + +static void +ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent, + struct buffer_head *bh, + struct ext4_xattr_entry *first, bool block_csum, + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits) +{ + struct inode *ea_inode; + struct ext4_xattr_entry *entry; + bool dirty = false; + unsigned int ea_ino; + int err; + int credits; + + /* One credit for dec ref on ea_inode, one for orphan list addition, */ + credits = 2 + extra_credits; + + for (entry = first; !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + ea_ino = le32_to_cpu(entry->e_value_inum); + err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + if (err) + continue; + + err = ext4_expand_inode_array(ea_inode_array, ea_inode); + if (err) { + ext4_warning_inode(ea_inode, + "Expand inode array err=%d", err); + iput(ea_inode); + continue; + } + + err = ext4_xattr_ensure_credits(handle, parent, credits, bh, + dirty, block_csum); + if (err) { + ext4_warning_inode(ea_inode, "Ensure credits err=%d", + err); + continue; + } + + inode_lock(ea_inode); + clear_nlink(ea_inode); + ext4_orphan_add(handle, ea_inode); + inode_unlock(ea_inode); + + /* + * Forget about ea_inode within the same transaction that + * decrements the ref count. This avoids duplicate decrements in + * case the rest of the work spills over to subsequent + * transactions. + */ + entry->e_value_inum = 0; + entry->e_value_size = 0; + + dirty = true; + } + + if (dirty) { + /* + * Note that we are deliberately skipping csum calculation for + * the final update because we do not expect any journal + * restarts until xattr block is freed. 
+ */ + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + ext4_warning_inode(parent, + "handle dirty metadata err=%d", err); + } +} + /* * Release the xattr block BH: If the reference count is > 1, decrement it; * otherwise free the block. @@ -1982,42 +2108,6 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, return 0; } -/** - * Add xattr inode to orphan list - */ -static int -ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits, - struct ext4_xattr_inode_array *ea_inode_array) -{ - int idx = 0, error = 0; - struct inode *ea_inode; - - if (ea_inode_array == NULL) - return 0; - - for (; idx < ea_inode_array->count; ++idx) { - if (!ext4_handle_has_enough_credits(handle, credits)) { - error = ext4_journal_extend(handle, credits); - if (error > 0) - error = ext4_journal_restart(handle, credits); - - if (error != 0) { - ext4_warning(inode->i_sb, - "couldn't extend journal " - "(err %d)", error); - return error; - } - } - ea_inode = ea_inode_array->inodes[idx]; - inode_lock(ea_inode); - ext4_orphan_add(handle, ea_inode); - inode_unlock(ea_inode); - /* the inode's i_count will be released by caller */ - } - - return 0; -} - /* * ext4_xattr_delete_inode() * @@ -2030,16 +2120,23 @@ ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits, */ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, - struct ext4_xattr_inode_array **ea_inode_array) + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits) { struct buffer_head *bh = NULL; struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; - struct ext4_iloc iloc; - struct ext4_xattr_entry *entry; - struct inode *ea_inode; - unsigned int ea_ino; - int credits = 3, error = 0; + struct ext4_iloc iloc = { .bh = NULL }; + int error; + + error = ext4_xattr_ensure_credits(handle, inode, extra_credits, + NULL /* bh */, + false /* dirty */, + false /* block_csum */); + if (error) { + EXT4_ERROR_INODE(inode, 
"ensure credits (error %d)", error); + goto cleanup; + } if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) goto delete_external_ea; @@ -2047,31 +2144,20 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, error = ext4_get_inode_loc(inode, &iloc); if (error) goto cleanup; + + error = ext4_journal_get_write_access(handle, iloc.bh); + if (error) + goto cleanup; + raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); - entry = EXT4_XATTR_NEXT(entry)) { - if (!entry->e_value_inum) - continue; - ea_ino = le32_to_cpu(entry->e_value_inum); - error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); - if (error) - continue; - error = ext4_expand_inode_array(ea_inode_array, ea_inode); - if (error) { - iput(ea_inode); - brelse(iloc.bh); - goto cleanup; - } - entry->e_value_inum = 0; - } - brelse(iloc.bh); + ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header), + false /* block_csum */, ea_inode_array, + extra_credits); delete_external_ea: if (!EXT4_I(inode)->i_file_acl) { - /* add xattr inode to orphan list */ - error = ext4_xattr_inode_orphan_add(handle, inode, credits, - *ea_inode_array); + error = 0; goto cleanup; } bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); @@ -2089,46 +2175,32 @@ delete_external_ea: goto cleanup; } - for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); - entry = EXT4_XATTR_NEXT(entry)) { - if (!entry->e_value_inum) - continue; - ea_ino = le32_to_cpu(entry->e_value_inum); - error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); - if (error) - continue; - error = ext4_expand_inode_array(ea_inode_array, ea_inode); - if (error) - goto cleanup; - entry->e_value_inum = 0; - } - - /* add xattr inode to orphan list */ - error = ext4_xattr_inode_orphan_add(handle, inode, credits, - *ea_inode_array); - if (error) - goto cleanup; - - if (!IS_NOQUOTA(inode)) - credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb); - - if (!ext4_handle_has_enough_credits(handle, 
credits)) { - error = ext4_journal_extend(handle, credits); - if (error > 0) - error = ext4_journal_restart(handle, credits); + if (ext4_has_feature_ea_inode(inode->i_sb)) { + error = ext4_journal_get_write_access(handle, bh); if (error) { - ext4_warning(inode->i_sb, - "couldn't extend journal (err %d)", error); + EXT4_ERROR_INODE(inode, "write access %llu", + EXT4_I(inode)->i_file_acl); goto cleanup; } + ext4_xattr_inode_remove_all(handle, inode, bh, + BFIRST(bh), + true /* block_csum */, + ea_inode_array, + extra_credits); } ext4_xattr_release_block(handle, inode, bh); + /* Update i_file_acl within the same transaction that releases block. */ EXT4_I(inode)->i_file_acl = 0; - + error = ext4_mark_inode_dirty(handle, inode); + if (error) { + EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)", + error); + goto cleanup; + } cleanup: + brelse(iloc.bh); brelse(bh); - return error; } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index adf761518a73..b2005a2716d9 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -169,7 +169,8 @@ extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len); extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino); extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, - struct ext4_xattr_inode_array **array); + struct ext4_xattr_inode_array **array, + int extra_credits); extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, From dec214d00e0d78a08b947d7dccdfdb84407a9f4d Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 22 Jun 2017 11:44:55 -0400 Subject: [PATCH 28/47] ext4: xattr inode deduplication Ext4 now supports xattr values that are up to 64k in size (vfs limit). Large xattr values are stored in external inodes each one holding a single value. Once written the data blocks of these inodes are immutable. 
The real-world use cases are expected to have a lot of value duplication such as inherited acls etc. To reduce data duplication on disk, this patch implements a deduplicator that allows sharing of xattr inodes. The deduplication is based on an in-memory hash lookup that is a best-effort sharing scheme. When an xattr inode is read from disk (i.e. getxattr() call), its crc32c hash is added to a hash table. Before creating a new xattr inode for a value being set, the hash table is checked to see if an existing inode holds an identical value. If such an inode is found, the ref count on that inode is incremented. On value removal the ref count is decremented and if it reaches zero the inode is deleted. The quota charging for such inodes is manually managed. Every reference holder is charged the full size as if there was no sharing happening. This is consistent with how xattr blocks are also charged. [ Fixed up journal credits calculation to handle inline data and the rare case where a shared xattr block can get freed when two threads race on breaking the xattr block sharing.
--tytso ] Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/acl.c | 5 +- fs/ext4/ext4.h | 23 +- fs/ext4/inode.c | 13 +- fs/ext4/super.c | 37 +- fs/ext4/xattr.c | 1062 +++++++++++++++++++++++++++++++++++------------ fs/ext4/xattr.h | 17 +- fs/mbcache.c | 9 +- 7 files changed, 867 insertions(+), 299 deletions(-) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 74f7ac539e00..8db03e5c78bc 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (error) return error; retry: - credits = ext4_xattr_set_credits(inode, acl_size); + error = ext4_xattr_set_credits(inode, acl_size, &credits); + if (error) + return error; + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 09983c774d31..fe92a63c86cb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1517,6 +1517,7 @@ struct ext4_sb_info { long s_es_nr_inode; struct ext4_es_stats s_es_stats; struct mb_cache *s_ea_block_cache; + struct mb_cache *s_ea_inode_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages. 
*/ @@ -2100,7 +2101,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); } -#define ext4_is_quota_file(inode) IS_NOQUOTA(inode) +static inline bool ext4_is_quota_file(struct inode *inode) +{ + return IS_NOQUOTA(inode) && + !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); +} /* * This structure is stuffed into the struct file's private_data field @@ -2493,7 +2498,6 @@ extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); -extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, @@ -2720,19 +2724,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, extern int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed); -static inline int ext4_has_group_desc_csum(struct super_block *sb) -{ - return ext4_has_feature_gdt_csum(sb) || - EXT4_SB(sb)->s_chksum_driver != NULL; -} - static inline int ext4_has_metadata_csum(struct super_block *sb) { WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && !EXT4_SB(sb)->s_chksum_driver); - return (EXT4_SB(sb)->s_chksum_driver != NULL); + return ext4_has_feature_metadata_csum(sb) && + (EXT4_SB(sb)->s_chksum_driver != NULL); } + +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); +} + static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 46def73d3472..962f28a0e176 100644 --- a/fs/ext4/inode.c 
+++ b/fs/ext4/inode.c @@ -139,6 +139,8 @@ static void ext4_invalidatepage(struct page *page, unsigned int offset, unsigned int length); static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); /* * Test whether an inode is a fast symlink. @@ -4843,8 +4845,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } brelse(iloc.bh); ext4_set_inode_flags(inode); - if (ei->i_flags & EXT4_EA_INODE_FL) + + if (ei->i_flags & EXT4_EA_INODE_FL) { ext4_xattr_inode_set_class(inode); + + inode_lock(inode); + inode->i_flags |= S_NOQUOTA; + inode_unlock(inode); + } + unlock_new_inode(inode); return inode; @@ -5503,7 +5512,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks, * * Also account for superblock, inode, quota and xattr blocks */ -int ext4_meta_trans_blocks(struct inode *inode, int lblocks, +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 380389740575..d501f8256dc4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } + if (sbi->s_ea_inode_cache) { + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; + } if (sbi->s_ea_block_cache) { ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; @@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, if (res) return res; retry: - credits = ext4_xattr_set_credits(inode, len); + res = ext4_xattr_set_credits(inode, len, &credits); + if (res) + return res; + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); if 
(IS_ERR(handle)) return PTR_ERR(handle); @@ -3445,7 +3452,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } /* Load the checksum driver */ - if (ext4_has_feature_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb) || + ext4_has_feature_ea_inode(sb)) { sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); @@ -3467,7 +3475,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); - else if (ext4_has_metadata_csum(sb)) + else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); @@ -3597,6 +3605,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "The Hurd can't support 64-bit file systems"); goto failed_mount; } + + /* + * ea_inode feature uses l_i_version field which is not + * available in HURD_COMPAT mode. 
+ */ + if (ext4_has_feature_ea_inode(sb)) { + ext4_msg(sb, KERN_ERR, + "ea_inode feature is not supported for Hurd"); + goto failed_mount; + } } if (IS_EXT2_SB(sb)) { @@ -4067,6 +4085,15 @@ no_journal: goto failed_mount_wq; } + if (ext4_has_feature_ea_inode(sb)) { + sbi->s_ea_inode_cache = ext4_xattr_create_cache(); + if (!sbi->s_ea_inode_cache) { + ext4_msg(sb, KERN_ERR, + "Failed to create ea_inode_cache"); + goto failed_mount_wq; + } + } + if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && (blocksize != PAGE_SIZE)) { ext4_msg(sb, KERN_ERR, @@ -4296,6 +4323,10 @@ failed_mount4: if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: + if (sbi->s_ea_inode_cache) { + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; + } if (sbi->s_ea_block_cache) { ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 649dc2953901..a4c8fe3692a2 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -108,6 +108,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = { #define EA_BLOCK_CACHE(inode) (((struct ext4_sb_info *) \ inode->i_sb->s_fs_info)->s_ea_block_cache) +#define EA_INODE_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_ea_inode_cache) + static int ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, struct inode *inode); @@ -280,15 +283,44 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, return cmp ? 
-ENODATA : 0; } +static u32 +ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) +{ + return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size); +} + +static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) +{ + return ((u64)ea_inode->i_ctime.tv_sec << 32) | + ((u32)ea_inode->i_version); +} + +static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) +{ + ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32); + ea_inode->i_version = (u32)ref_count; +} + +static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode) +{ + return (u32)ea_inode->i_atime.tv_sec; +} + +static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) +{ + ea_inode->i_atime.tv_sec = hash; +} + /* * Read the EA value from an inode. */ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) { unsigned long block = 0; - struct buffer_head *bh = NULL; + struct buffer_head *bh; int blocksize = ea_inode->i_sb->s_blocksize; size_t csize, copied = 0; + void *copy_pos = buf; while (copied < size) { csize = (size - copied) > blocksize ? 
blocksize : size - copied; @@ -298,10 +330,10 @@ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) if (!bh) return -EFSCORRUPTED; - memcpy(buf, bh->b_data, csize); + memcpy(copy_pos, bh->b_data, csize); brelse(bh); - buf += csize; + copy_pos += csize; block += 1; copied += csize; } @@ -317,29 +349,24 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, inode = ext4_iget(parent->i_sb, ea_ino); if (IS_ERR(inode)) { err = PTR_ERR(inode); - ext4_error(parent->i_sb, "error while reading EA inode %lu " - "err=%d", ea_ino, err); + ext4_error(parent->i_sb, + "error while reading EA inode %lu err=%d", ea_ino, + err); return err; } if (is_bad_inode(inode)) { - ext4_error(parent->i_sb, "error while reading EA inode %lu " - "is_bad_inode", ea_ino); + ext4_error(parent->i_sb, + "error while reading EA inode %lu is_bad_inode", + ea_ino); err = -EIO; goto error; } - if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino || - inode->i_generation != parent->i_generation) { - ext4_error(parent->i_sb, "Backpointer from EA inode %lu " - "to parent is invalid.", ea_ino); - err = -EINVAL; - goto error; - } - if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { - ext4_error(parent->i_sb, "EA inode %lu does not have " - "EXT4_EA_INODE_FL flag set.\n", ea_ino); + ext4_error(parent->i_sb, + "EA inode %lu does not have EXT4_EA_INODE_FL flag", + ea_ino); err = -EINVAL; goto error; } @@ -351,6 +378,20 @@ error: return err; } +static int +ext4_xattr_inode_verify_hash(struct inode *ea_inode, void *buffer, size_t size) +{ + u32 hash; + + /* Verify stored hash matches calculated hash. */ + hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size); + if (hash != ext4_xattr_inode_get_hash(ea_inode)) + return -EFSCORRUPTED; + return 0; +} + +#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) + /* * Read the value from the EA inode. 
*/ @@ -358,17 +399,53 @@ static int ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer, size_t size) { + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); struct inode *ea_inode; - int ret; + int err; - ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); - if (ret) - return ret; + err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); + if (err) { + ea_inode = NULL; + goto out; + } - ret = ext4_xattr_inode_read(ea_inode, buffer, size); + if (i_size_read(ea_inode) != size) { + ext4_warning_inode(ea_inode, + "ea_inode file size=%llu entry size=%zu", + i_size_read(ea_inode), size); + err = -EFSCORRUPTED; + goto out; + } + + err = ext4_xattr_inode_read(ea_inode, buffer, size); + if (err) + goto out; + + err = ext4_xattr_inode_verify_hash(ea_inode, buffer, size); + /* + * Compatibility check for old Lustre ea_inode implementation. Old + * version does not have hash validation, but it has a backpointer + * from ea_inode to the parent inode. + */ + if (err == -EFSCORRUPTED) { + if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != inode->i_ino || + ea_inode->i_generation != inode->i_generation) { + ext4_warning_inode(ea_inode, + "EA inode hash validation failed"); + goto out; + } + /* Do not add ea_inode to the cache. 
*/ + ea_inode_cache = NULL; + } else if (err) + goto out; + + if (ea_inode_cache) + mb_cache_entry_create(ea_inode_cache, GFP_NOFS, + ext4_xattr_inode_get_hash(ea_inode), + ea_inode->i_ino, true /* reusable */); +out: iput(ea_inode); - - return ret; + return err; } static int @@ -656,6 +733,115 @@ static void ext4_xattr_update_super_block(handle_t *handle, } } +static inline size_t round_up_cluster(struct inode *inode, size_t length) +{ + struct super_block *sb = inode->i_sb; + size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits + + inode->i_blkbits); + size_t mask = ~(cluster_size - 1); + + return (length + cluster_size - 1) & mask; +} + +static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len) +{ + int err; + + err = dquot_alloc_inode(inode); + if (err) + return err; + err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len)); + if (err) + dquot_free_inode(inode); + return err; +} + +static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len) +{ + dquot_free_space_nodirty(inode, round_up_cluster(inode, len)); + dquot_free_inode(inode); +} + +static int __ext4_xattr_set_credits(struct inode *inode, + struct buffer_head *block_bh, + size_t value_len) +{ + struct super_block *sb = inode->i_sb; + int credits; + int blocks; + + /* + * 1) Owner inode update + * 2) Ref count update on old xattr block + * 3) new xattr block + * 4) block bitmap update for new xattr block + * 5) group descriptor for new xattr block + * 6) block bitmap update for old xattr block + * 7) group descriptor for old block + * + * 6 & 7 can happen if we have two racing threads T_a and T_b + * which are each trying to set an xattr on inodes I_a and I_b + * which were both initially sharing an xattr block. + */ + credits = 7; + + /* Quota updates. 
*/ + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb); + + /* + * In case of inline data, we may push out the data to a block, + * so we need to reserve credits for this eventuality + */ + if (ext4_has_inline_data(inode)) + credits += ext4_writepage_trans_blocks(inode) + 1; + + /* We are done if ea_inode feature is not enabled. */ + if (!ext4_has_feature_ea_inode(sb)) + return credits; + + /* New ea_inode, inode map, block bitmap, group descriptor. */ + credits += 4; + + /* Data blocks. */ + blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + + /* Indirection block or one level of extent tree. */ + blocks += 1; + + /* Block bitmap and group descriptor updates for each block. */ + credits += blocks * 2; + + /* Blocks themselves. */ + credits += blocks; + + /* Dereference ea_inode holding old xattr value. + * Old ea_inode, inode map, block bitmap, group descriptor. + */ + credits += 4; + + /* Data blocks for old ea_inode. */ + blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits; + + /* Indirection block or one level of extent tree for old ea_inode. */ + blocks += 1; + + /* Block bitmap and group descriptor updates for each block. */ + credits += blocks * 2; + + /* We may need to clone the existing xattr block in which case we need + * to increment ref counts for existing ea_inodes referenced by it. + */ + if (block_bh) { + struct ext4_xattr_entry *entry = BFIRST(block_bh); + + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + /* Ref count update on ea_inode. 
*/ + credits += 1; + } + return credits; +} + static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode, int credits, struct buffer_head *bh, bool dirty, bool block_csum) @@ -705,12 +891,140 @@ static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode, return 0; } +static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, + int ref_change) +{ + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode); + struct ext4_iloc iloc; + s64 ref_count; + u32 hash; + int ret; + + inode_lock(ea_inode); + + ret = ext4_reserve_inode_write(handle, ea_inode, &iloc); + if (ret) { + iloc.bh = NULL; + goto out; + } + + ref_count = ext4_xattr_inode_get_ref(ea_inode); + ref_count += ref_change; + ext4_xattr_inode_set_ref(ea_inode, ref_count); + + if (ref_change > 0) { + WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld", + ea_inode->i_ino, ref_count); + + if (ref_count == 1) { + WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u", + ea_inode->i_ino, ea_inode->i_nlink); + + set_nlink(ea_inode, 1); + ext4_orphan_del(handle, ea_inode); + + hash = ext4_xattr_inode_get_hash(ea_inode); + mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash, + ea_inode->i_ino, + true /* reusable */); + } + } else { + WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld", + ea_inode->i_ino, ref_count); + + if (ref_count == 0) { + WARN_ONCE(ea_inode->i_nlink != 1, + "EA inode %lu i_nlink=%u", + ea_inode->i_ino, ea_inode->i_nlink); + + clear_nlink(ea_inode); + ext4_orphan_add(handle, ea_inode); + + hash = ext4_xattr_inode_get_hash(ea_inode); + mb_cache_entry_delete(ea_inode_cache, hash, + ea_inode->i_ino); + } + } + + ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc); + iloc.bh = NULL; + if (ret) + ext4_warning_inode(ea_inode, + "ext4_mark_iloc_dirty() failed ret=%d", ret); +out: + brelse(iloc.bh); + inode_unlock(ea_inode); + return ret; +} + +static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode) +{ + return 
ext4_xattr_inode_update_ref(handle, ea_inode, 1); +} + +static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode) +{ + return ext4_xattr_inode_update_ref(handle, ea_inode, -1); +} + +static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, + struct ext4_xattr_entry *first) +{ + struct inode *ea_inode; + struct ext4_xattr_entry *entry; + struct ext4_xattr_entry *failed_entry; + unsigned int ea_ino; + int err, saved_err; + + for (entry = first; !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + ea_ino = le32_to_cpu(entry->e_value_inum); + err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + if (err) + goto cleanup; + err = ext4_xattr_inode_inc_ref(handle, ea_inode); + if (err) { + ext4_warning_inode(ea_inode, "inc ref error %d", err); + iput(ea_inode); + goto cleanup; + } + iput(ea_inode); + } + return 0; + +cleanup: + saved_err = err; + failed_entry = entry; + + for (entry = first; entry != failed_entry; + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + ea_ino = le32_to_cpu(entry->e_value_inum); + err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + if (err) { + ext4_warning(parent->i_sb, + "cleanup ea_ino %u iget error %d", ea_ino, + err); + continue; + } + err = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (err) + ext4_warning_inode(ea_inode, "cleanup dec ref error %d", + err); + iput(ea_inode); + } + return saved_err; +} + static void -ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent, - struct buffer_head *bh, - struct ext4_xattr_entry *first, bool block_csum, - struct ext4_xattr_inode_array **ea_inode_array, - int extra_credits) +ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, + struct buffer_head *bh, + struct ext4_xattr_entry *first, bool block_csum, + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits, bool skip_quota) { struct inode *ea_inode; struct ext4_xattr_entry 
*entry; @@ -747,10 +1061,16 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent, continue; } - inode_lock(ea_inode); - clear_nlink(ea_inode); - ext4_orphan_add(handle, ea_inode); - inode_unlock(ea_inode); + err = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (err) { + ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d", + err); + continue; + } + + if (!skip_quota) + ext4_xattr_inode_free_quota(parent, + le32_to_cpu(entry->e_value_size)); /* * Forget about ea_inode within the same transaction that @@ -784,7 +1104,9 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent, */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, - struct buffer_head *bh) + struct buffer_head *bh, + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits) { struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); u32 hash, ref; @@ -807,6 +1129,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, mb_cache_entry_delete(ea_block_cache, hash, bh->b_blocknr); get_bh(bh); unlock_buffer(bh); + + if (ext4_has_feature_ea_inode(inode->i_sb)) + ext4_xattr_inode_dec_ref_all(handle, inode, bh, + BFIRST(bh), + true /* block_csum */, + ea_inode_array, + extra_credits, + true /* skip_quota */); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -878,8 +1208,8 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, { struct buffer_head *bh = NULL; unsigned long block = 0; - unsigned blocksize = ea_inode->i_sb->s_blocksize; - unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; + int blocksize = ea_inode->i_sb->s_blocksize; + int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; int csize, wsize = 0; int ret = 0; int retries = 0; @@ -945,7 +1275,7 @@ out: * Create an inode to store the value of a large EA. 
*/ static struct inode *ext4_xattr_inode_create(handle_t *handle, - struct inode *inode) + struct inode *inode, u32 hash) { struct inode *ea_inode = NULL; uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) }; @@ -963,67 +1293,115 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle, ea_inode->i_fop = &ext4_file_operations; ext4_set_aops(ea_inode); ext4_xattr_inode_set_class(ea_inode); - ea_inode->i_generation = inode->i_generation; - EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL; - - /* - * A back-pointer from EA inode to parent inode will be useful - * for e2fsck. - */ - EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino); unlock_new_inode(ea_inode); - err = ext4_inode_attach_jinode(ea_inode); + ext4_xattr_inode_set_ref(ea_inode, 1); + ext4_xattr_inode_set_hash(ea_inode, hash); + err = ext4_mark_inode_dirty(handle, ea_inode); + if (!err) + err = ext4_inode_attach_jinode(ea_inode); if (err) { iput(ea_inode); return ERR_PTR(err); } + + /* + * Xattr inodes are shared therefore quota charging is performed + * at a higher level. + */ + dquot_free_inode(ea_inode); + dquot_drop(ea_inode); + inode_lock(ea_inode); + ea_inode->i_flags |= S_NOQUOTA; + inode_unlock(ea_inode); } return ea_inode; } -/* - * Unlink the inode storing the value of the EA. 
- */ -int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino) +static struct inode * +ext4_xattr_inode_cache_find(struct inode *inode, const void *value, + size_t value_len, u32 hash) { - struct inode *ea_inode = NULL; - int err; + struct inode *ea_inode; + struct mb_cache_entry *ce; + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); + void *ea_data; - err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); - if (err) - return err; + ce = mb_cache_entry_find_first(ea_inode_cache, hash); + if (!ce) + return NULL; - clear_nlink(ea_inode); - iput(ea_inode); + ea_data = ext4_kvmalloc(value_len, GFP_NOFS); + if (!ea_data) { + mb_cache_entry_put(ea_inode_cache, ce); + return NULL; + } - return 0; + while (ce) { + ea_inode = ext4_iget(inode->i_sb, ce->e_value); + if (!IS_ERR(ea_inode) && + !is_bad_inode(ea_inode) && + (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) && + i_size_read(ea_inode) == value_len && + !ext4_xattr_inode_read(ea_inode, ea_data, value_len) && + !ext4_xattr_inode_verify_hash(ea_inode, ea_data, + value_len) && + !memcmp(value, ea_data, value_len)) { + mb_cache_entry_touch(ea_inode_cache, ce); + mb_cache_entry_put(ea_inode_cache, ce); + kvfree(ea_data); + return ea_inode; + } + + if (!IS_ERR(ea_inode)) + iput(ea_inode); + ce = mb_cache_entry_find_next(ea_inode_cache, ce); + } + kvfree(ea_data); + return NULL; } /* * Add value of the EA in an inode. 
*/ -static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode, - unsigned long *ea_ino, const void *value, - size_t value_len) +static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, + const void *value, size_t value_len, + struct inode **ret_inode) { struct inode *ea_inode; + u32 hash; int err; + hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len); + ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash); + if (ea_inode) { + err = ext4_xattr_inode_inc_ref(handle, ea_inode); + if (err) { + iput(ea_inode); + return err; + } + + *ret_inode = ea_inode; + return 0; + } + /* Create an inode for the EA value */ - ea_inode = ext4_xattr_inode_create(handle, inode); + ea_inode = ext4_xattr_inode_create(handle, inode, hash); if (IS_ERR(ea_inode)) return PTR_ERR(ea_inode); err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); - if (err) - clear_nlink(ea_inode); - else - *ea_ino = ea_inode->i_ino; + if (err) { + ext4_xattr_inode_dec_ref(handle, ea_inode); + iput(ea_inode); + return err; + } - iput(ea_inode); + mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, + ea_inode->i_ino, true /* reusable */); - return err; + *ret_inode = ea_inode; + return 0; } static int ext4_xattr_set_entry(struct ext4_xattr_info *i, @@ -1031,9 +1409,37 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, handle_t *handle, struct inode *inode) { struct ext4_xattr_entry *last; - size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); + struct ext4_xattr_entry *here = s->here; + size_t min_offs = s->end - s->base, name_len = strlen(i->name); int in_inode = i->in_inode; - int rc; + struct inode *old_ea_inode = NULL; + struct inode *new_ea_inode = NULL; + size_t old_size, new_size; + int ret; + + /* Space used by old and new values. */ + old_size = (!s->not_found && !here->e_value_inum) ? + EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0; + new_size = (i->value && !in_inode) ? 
EXT4_XATTR_SIZE(i->value_len) : 0; + + /* + * Optimization for the simple case when old and new values have the + * same padded sizes. Not applicable if external inodes are involved. + */ + if (new_size && new_size == old_size) { + size_t offs = le16_to_cpu(here->e_value_offs); + void *val = s->base + offs; + + here->e_value_size = cpu_to_le32(i->value_len); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, new_size); + } else { + memcpy(val, i->value, i->value_len); + /* Clear padding bytes. */ + memset(val + i->value_len, 0, new_size - i->value_len); + } + return 0; + } /* Compute min_offs and last. */ last = s->first; @@ -1044,122 +1450,148 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, min_offs = offs; } } - free = min_offs - ((void *)last - s->base) - sizeof(__u32); - if (!s->not_found) { - if (!in_inode && - !s->here->e_value_inum && s->here->e_value_size) { - size_t size = le32_to_cpu(s->here->e_value_size); - free += EXT4_XATTR_SIZE(size); - } - free += EXT4_XATTR_LEN(name_len); - } + + /* Check whether we have enough space. */ if (i->value) { - size_t value_len = EXT4_XATTR_SIZE(i->value_len); + size_t free; - if (in_inode) - value_len = 0; + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) + free += EXT4_XATTR_LEN(name_len) + old_size; - if (free < EXT4_XATTR_LEN(name_len) + value_len) - return -ENOSPC; + if (free < EXT4_XATTR_LEN(name_len) + new_size) { + ret = -ENOSPC; + goto out; + } } - if (i->value && s->not_found) { - /* Insert the new name. */ + /* + * Getting access to old and new ea inodes is subject to failures. + * Finish that work before doing any modifications to the xattr data. 
+ */ + if (!s->not_found && here->e_value_inum) { + ret = ext4_xattr_inode_iget(inode, + le32_to_cpu(here->e_value_inum), + &old_ea_inode); + if (ret) { + old_ea_inode = NULL; + goto out; + } + } + if (i->value && in_inode) { + WARN_ON_ONCE(!i->value_len); + + ret = ext4_xattr_inode_alloc_quota(inode, i->value_len); + if (ret) + goto out; + + ret = ext4_xattr_inode_lookup_create(handle, inode, i->value, + i->value_len, + &new_ea_inode); + if (ret) { + new_ea_inode = NULL; + ext4_xattr_inode_free_quota(inode, i->value_len); + goto out; + } + } + + if (old_ea_inode) { + /* We are ready to release ref count on the old_ea_inode. */ + ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); + if (ret) { + /* Release newly required ref count on new_ea_inode. */ + if (new_ea_inode) { + int err; + + err = ext4_xattr_inode_dec_ref(handle, + new_ea_inode); + if (err) + ext4_warning_inode(new_ea_inode, + "dec ref new_ea_inode err=%d", + err); + ext4_xattr_inode_free_quota(inode, + i->value_len); + } + goto out; + } + + ext4_xattr_inode_free_quota(inode, + le32_to_cpu(here->e_value_size)); + } + + /* No failures allowed past this point. */ + + if (!s->not_found && here->e_value_offs) { + /* Remove the old value. */ + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(here->e_value_offs); + void *val = s->base + offs; + + memmove(first_val + old_size, first_val, val - first_val); + memset(first_val, 0, old_size); + min_offs += old_size; + + /* Adjust all value offsets. */ + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + + if (!last->e_value_inum && + last->e_value_size && o < offs) + last->e_value_offs = cpu_to_le16(o + old_size); + last = EXT4_XATTR_NEXT(last); + } + } + + if (!i->value) { + /* Remove old name. 
*/ size_t size = EXT4_XATTR_LEN(name_len); - size_t rest = (void *)last - (void *)s->here + sizeof(__u32); - memmove((void *)s->here + size, s->here, rest); - memset(s->here, 0, size); - s->here->e_name_index = i->name_index; - s->here->e_name_len = name_len; - memcpy(s->here->e_name, i->name, name_len); + + last = ENTRY((void *)last - size); + memmove(here, (void *)here + size, + (void *)last - (void *)here + sizeof(__u32)); + memset(last, 0, size); + } else if (s->not_found) { + /* Insert new name. */ + size_t size = EXT4_XATTR_LEN(name_len); + size_t rest = (void *)last - (void *)here + sizeof(__u32); + + memmove((void *)here + size, here, rest); + memset(here, 0, size); + here->e_name_index = i->name_index; + here->e_name_len = name_len; + memcpy(here->e_name, i->name, name_len); } else { - if (!s->here->e_value_inum && s->here->e_value_size && - s->here->e_value_offs > 0) { - void *first_val = s->base + min_offs; - size_t offs = le16_to_cpu(s->here->e_value_offs); - void *val = s->base + offs; - size_t size = EXT4_XATTR_SIZE( - le32_to_cpu(s->here->e_value_size)); - - if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) { - /* The old and the new value have the same - size. Just replace. */ - s->here->e_value_size = - cpu_to_le32(i->value_len); - if (i->value == EXT4_ZERO_XATTR_VALUE) { - memset(val, 0, size); - } else { - /* Clear pad bytes first. */ - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); - memcpy(val, i->value, i->value_len); - } - return 0; - } - - /* Remove the old value. */ - memmove(first_val + size, first_val, val - first_val); - memset(first_val, 0, size); - s->here->e_value_size = 0; - s->here->e_value_offs = 0; - min_offs += size; - - /* Adjust all value offsets. 
*/ - last = s->first; - while (!IS_LAST_ENTRY(last)) { - size_t o = le16_to_cpu(last->e_value_offs); - if (!last->e_value_inum && - last->e_value_size && o < offs) - last->e_value_offs = - cpu_to_le16(o + size); - last = EXT4_XATTR_NEXT(last); - } - } - if (s->here->e_value_inum) { - ext4_xattr_inode_unlink(inode, - le32_to_cpu(s->here->e_value_inum)); - s->here->e_value_inum = 0; - } - if (!i->value) { - /* Remove the old name. */ - size_t size = EXT4_XATTR_LEN(name_len); - last = ENTRY((void *)last - size); - memmove(s->here, (void *)s->here + size, - (void *)last - (void *)s->here + sizeof(__u32)); - memset(last, 0, size); - } + /* This is an update, reset value info. */ + here->e_value_inum = 0; + here->e_value_offs = 0; + here->e_value_size = 0; } if (i->value) { - /* Insert the new value. */ + /* Insert new value. */ if (in_inode) { - unsigned long ea_ino = - le32_to_cpu(s->here->e_value_inum); - rc = ext4_xattr_inode_set(handle, inode, &ea_ino, - i->value, i->value_len); - if (rc) - goto out; - s->here->e_value_inum = cpu_to_le32(ea_ino); - s->here->e_value_offs = 0; + here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino); } else if (i->value_len) { - size_t size = EXT4_XATTR_SIZE(i->value_len); - void *val = s->base + min_offs - size; - s->here->e_value_offs = cpu_to_le16(min_offs - size); - s->here->e_value_inum = 0; + void *val = s->base + min_offs - new_size; + + here->e_value_offs = cpu_to_le16(min_offs - new_size); if (i->value == EXT4_ZERO_XATTR_VALUE) { - memset(val, 0, size); + memset(val, 0, new_size); } else { - /* Clear the pad bytes first. */ - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); memcpy(val, i->value, i->value_len); + /* Clear padding bytes. 
*/ + memset(val + i->value_len, 0, + new_size - i->value_len); } } - s->here->e_value_size = cpu_to_le32(i->value_len); + here->e_value_size = cpu_to_le32(i->value_len); } - + ret = 0; out: - return rc; + iput(old_ea_inode); + iput(new_ea_inode); + return ret; } struct ext4_xattr_block_find { @@ -1221,6 +1653,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct mb_cache_entry *ce = NULL; int error = 0; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); + struct inode *ea_inode = NULL; + size_t old_ea_inode_size = 0; #define header(x) ((struct ext4_xattr_header *)(x)) @@ -1275,6 +1709,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, header(s->base)->h_refcount = cpu_to_le32(1); s->here = ENTRY(s->base + offset); s->end = s->base + bs->bh->b_size; + + /* + * If existing entry points to an xattr inode, we need + * to prevent ext4_xattr_set_entry() from decrementing + * ref count on it because the reference belongs to the + * original block. In this case, make the entry look + * like it has an empty value. + */ + if (!s->not_found && s->here->e_value_inum) { + /* + * Defer quota free call for previous inode + * until success is guaranteed. + */ + old_ea_inode_size = le32_to_cpu( + s->here->e_value_size); + s->here->e_value_inum = 0; + s->here->e_value_size = 0; + } } } else { /* Allocate a buffer where we construct the new block. */ @@ -1296,6 +1748,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, goto bad_block; if (error) goto cleanup; + + if (i->value && s->here->e_value_inum) { + unsigned int ea_ino; + + /* + * A ref count on ea_inode has been taken as part of the call to + * ext4_xattr_set_entry() above. We would like to drop this + * extra ref but we have to wait until the xattr block is + * initialized and has its own ref count on the ea_inode. 
+ */ + ea_ino = le32_to_cpu(s->here->e_value_inum); + error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); + if (error) { + ea_inode = NULL; + goto cleanup; + } + } + if (!IS_LAST_ENTRY(s->first)) ext4_xattr_rehash(header(s->base), s->here); @@ -1406,6 +1876,22 @@ getblk_failed: EXT4_FREE_BLOCKS_METADATA); goto cleanup; } + error = ext4_xattr_inode_inc_ref_all(handle, inode, + ENTRY(header(s->base)+1)); + if (error) + goto getblk_failed; + if (ea_inode) { + /* Drop the extra ref on ea_inode. */ + error = ext4_xattr_inode_dec_ref(handle, + ea_inode); + if (error) + ext4_warning_inode(ea_inode, + "dec ref error=%d", + error); + iput(ea_inode); + ea_inode = NULL; + } + lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, new_bh); if (error) { @@ -1425,15 +1911,38 @@ getblk_failed: } } + if (old_ea_inode_size) + ext4_xattr_inode_free_quota(inode, old_ea_inode_size); + /* Update the inode. */ EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; /* Drop the previous xattr block. */ - if (bs->bh && bs->bh != new_bh) - ext4_xattr_release_block(handle, inode, bs->bh); + if (bs->bh && bs->bh != new_bh) { + struct ext4_xattr_inode_array *ea_inode_array = NULL; + + ext4_xattr_release_block(handle, inode, bs->bh, + &ea_inode_array, + 0 /* extra_credits */); + ext4_xattr_inode_array_free(ea_inode_array); + } error = 0; cleanup: + if (ea_inode) { + int error2; + + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (error2) + ext4_warning_inode(ea_inode, "dec ref error=%d", + error2); + + /* If there was an error, revert the quota charge. 
*/ + if (error) + ext4_xattr_inode_free_quota(inode, + i_size_read(ea_inode)); + iput(ea_inode); + } if (ce) mb_cache_entry_put(ea_block_cache, ce); brelse(new_bh); @@ -1558,6 +2067,22 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s, return !memcmp(value, i->value, i->value_len); } +static struct buffer_head *ext4_xattr_get_block(struct inode *inode) +{ + struct buffer_head *bh; + int error; + + if (!EXT4_I(inode)->i_file_acl) + return NULL; + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) + return ERR_PTR(-EIO); + error = ext4_xattr_check_block(inode, bh); + if (error) + return ERR_PTR(error); + return bh; +} + /* * ext4_xattr_set_handle() * @@ -1600,9 +2125,18 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, /* Check journal credits under write lock. */ if (ext4_handle_valid(handle)) { + struct buffer_head *bh; int credits; - credits = ext4_xattr_set_credits(inode, value_len); + bh = ext4_xattr_get_block(inode); + if (IS_ERR(bh)) { + error = PTR_ERR(bh); + goto cleanup; + } + + credits = __ext4_xattr_set_credits(inode, bh, value_len); + brelse(bh); + if (!ext4_handle_has_enough_credits(handle, credits)) { error = -ENOSPC; goto cleanup; @@ -1638,6 +2172,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (flags & XATTR_CREATE) goto cleanup; } + if (!value) { if (!is.s.not_found) error = ext4_xattr_ibody_set(handle, inode, &i, &is); @@ -1706,34 +2241,29 @@ cleanup: return error; } -int ext4_xattr_set_credits(struct inode *inode, size_t value_len) +int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits) { - struct super_block *sb = inode->i_sb; - int credits; + struct buffer_head *bh; + int err; - if (!EXT4_SB(sb)->s_journal) + *credits = 0; + + if (!EXT4_SB(inode->i_sb)->s_journal) return 0; - credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); + down_read(&EXT4_I(inode)->xattr_sem); - /* - * In case of inline data, we may push out the data to a 
block, - * so we need to reserve credits for this eventuality - */ - if (ext4_has_inline_data(inode)) - credits += ext4_writepage_trans_blocks(inode) + 1; - - if (ext4_has_feature_ea_inode(sb)) { - int nrblocks = (value_len + sb->s_blocksize - 1) >> - sb->s_blocksize_bits; - - /* For new inode */ - credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3; - - /* For data blocks of EA inode */ - credits += ext4_meta_trans_blocks(inode, nrblocks, 0); + bh = ext4_xattr_get_block(inode); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + } else { + *credits = __ext4_xattr_set_credits(inode, bh, value_len); + brelse(bh); + err = 0; } - return credits; + + up_read(&EXT4_I(inode)->xattr_sem); + return err; } /* @@ -1758,7 +2288,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, return error; retry: - credits = ext4_xattr_set_credits(inode, value_len); + error = ext4_xattr_set_credits(inode, value_len, &credits); + if (error) + return error; + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); @@ -2064,10 +2597,10 @@ cleanup: return error; } - #define EIA_INCR 16 /* must be 2^n */ #define EIA_MASK (EIA_INCR - 1) -/* Add the large xattr @inode into @ea_inode_array for later deletion. + +/* Add the large xattr @inode into @ea_inode_array for deferred iput(). * If @ea_inode_array is new or full it will be grown and the old * contents copied over. */ @@ -2112,21 +2645,19 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, * ext4_xattr_delete_inode() * * Free extended attribute resources associated with this inode. Traverse - * all entries and unlink any xattr inodes associated with this inode. This - * is called immediately before an inode is freed. We have exclusive - * access to the inode. If an orphan inode is deleted it will also delete any - * xattr block and all xattr inodes. 
They are checked by ext4_xattr_inode_iget() - * to ensure they belong to the parent inode and were not deleted already. + * all entries and decrement reference on any xattr inodes associated with this + * inode. This is called immediately before an inode is freed. We have exclusive + * access to the inode. If an orphan inode is deleted it will also release its + * references on xattr block and xattr inodes. */ -int -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, - struct ext4_xattr_inode_array **ea_inode_array, - int extra_credits) +int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits) { struct buffer_head *bh = NULL; struct ext4_xattr_ibody_header *header; - struct ext4_inode *raw_inode; struct ext4_iloc iloc = { .bh = NULL }; + struct ext4_xattr_entry *entry; int error; error = ext4_xattr_ensure_credits(handle, inode, extra_credits, @@ -2138,66 +2669,71 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, goto cleanup; } - if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) - goto delete_external_ea; + if (ext4_has_feature_ea_inode(inode->i_sb) && + ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { - error = ext4_get_inode_loc(inode, &iloc); - if (error) - goto cleanup; - - error = ext4_journal_get_write_access(handle, iloc.bh); - if (error) - goto cleanup; - - raw_inode = ext4_raw_inode(&iloc); - header = IHDR(inode, raw_inode); - ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header), - false /* block_csum */, ea_inode_array, - extra_credits); - -delete_external_ea: - if (!EXT4_I(inode)->i_file_acl) { - error = 0; - goto cleanup; - } - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) { - EXT4_ERROR_INODE(inode, "block %llu read error", - EXT4_I(inode)->i_file_acl); - error = -EIO; - goto cleanup; - } - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || - BHDR(bh)->h_blocks != cpu_to_le32(1)) { - 
EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - error = -EFSCORRUPTED; - goto cleanup; - } - - if (ext4_has_feature_ea_inode(inode->i_sb)) { - error = ext4_journal_get_write_access(handle, bh); + error = ext4_get_inode_loc(inode, &iloc); if (error) { - EXT4_ERROR_INODE(inode, "write access %llu", - EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "inode loc (error %d)", error); goto cleanup; } - ext4_xattr_inode_remove_all(handle, inode, bh, - BFIRST(bh), - true /* block_csum */, - ea_inode_array, - extra_credits); + + error = ext4_journal_get_write_access(handle, iloc.bh); + if (error) { + EXT4_ERROR_INODE(inode, "write access (error %d)", + error); + goto cleanup; + } + + header = IHDR(inode, ext4_raw_inode(&iloc)); + if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC)) + ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh, + IFIRST(header), + false /* block_csum */, + ea_inode_array, + extra_credits, + false /* skip_quota */); } - ext4_xattr_release_block(handle, inode, bh); - /* Update i_file_acl within the same transaction that releases block. 
*/ - EXT4_I(inode)->i_file_acl = 0; - error = ext4_mark_inode_dirty(handle, inode); - if (error) { - EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)", - error); - goto cleanup; + if (EXT4_I(inode)->i_file_acl) { + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) { + EXT4_ERROR_INODE(inode, "block %llu read error", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + error = ext4_xattr_check_block(inode, bh); + if (error) { + EXT4_ERROR_INODE(inode, "bad block %llu (error %d)", + EXT4_I(inode)->i_file_acl, error); + goto cleanup; + } + + if (ext4_has_feature_ea_inode(inode->i_sb)) { + for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + ext4_xattr_inode_free_quota(inode, + le32_to_cpu(entry->e_value_size)); + + } + + ext4_xattr_release_block(handle, inode, bh, ea_inode_array, + extra_credits); + /* + * Update i_file_acl value in the same transaction that releases + * block. + */ + EXT4_I(inode)->i_file_acl = 0; + error = ext4_mark_inode_dirty(handle, inode); + if (error) { + EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)", + error); + goto cleanup; + } } + error = 0; cleanup: brelse(iloc.bh); brelse(bh); @@ -2206,17 +2742,13 @@ cleanup: void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) { - struct inode *ea_inode; - int idx = 0; + int idx; if (ea_inode_array == NULL) return; - for (; idx < ea_inode_array->count; ++idx) { - ea_inode = ea_inode_array->inodes[idx]; - clear_nlink(ea_inode); - iput(ea_inode); - } + for (idx = 0; idx < ea_inode_array->count; ++idx) + iput(ea_inode_array->inodes[idx]); kfree(ea_inode_array); } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index b2005a2716d9..67616cb9a059 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -69,19 +69,6 @@ struct ext4_xattr_entry { EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) -/* - * Link EA inode back to parent one using i_mtime 
field. - * Extra integer type conversion added to ignore higher - * bits in i_mtime.tv_sec which might be set by ext4_get() - */ -#define EXT4_XATTR_INODE_SET_PARENT(inode, inum) \ -do { \ - (inode)->i_mtime.tv_sec = inum; \ -} while(0) - -#define EXT4_XATTR_INODE_GET_PARENT(inode) \ -((__u32)(inode)->i_mtime.tv_sec) - /* * The minimum size of EA value when you start storing it in an external inode * size of block - size of header - size of 1 entry - 4 null bytes @@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); -extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len); +extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len, + int *credits); -extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino); extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_xattr_inode_array **array, int extra_credits); diff --git a/fs/mbcache.c b/fs/mbcache.c index 45a8d52dc991..d818fd236787 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -13,10 +13,11 @@ * mb_cache_entry_delete()). * * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. - * They use hash of a block contents as a key and block number as a value. - * That's why keys need not be unique (different xattr blocks may end up having - * the same hash). However block number always uniquely identifies a cache - * entry. + * Ext4 also uses it for deduplication of xattr values stored in inodes. + * They use hash of data as a key and provide a value that may represent a + * block or inode number. That's why keys need not be unique (hash of different + * data may be the same). 
However user provided value always uniquely + * identifies a cache entry. * * We provide functions for creation and removal of entries, search by key, * and a special "delete entry with given key-value pair" operation. Fixed From 7a9ca53aea10ad4677a0f347ad7639c304b80194 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 22 Jun 2017 11:46:48 -0400 Subject: [PATCH 29/47] quota: add get_inode_usage callback to transfer multi-inode charges Ext4 ea_inode feature allows storing xattr values in external inodes to be able to store values that are bigger than a block in size. Ext4 also has deduplication support for these type of inodes. With deduplication, the actual storage waste is eliminated but the users of such inodes are still charged full quota for the inodes as if there was no sharing happening in the background. This design requires ext4 to manually charge the users because the inodes are shared. An implication of this is that, if someone calls chown on a file that has such references we need to transfer the quota for the file and xattr inodes. Current dquot_transfer() function implicitly transfers one inode charge. With ea_inode feature, we would like to transfer multiple inode charges. Add get_inode_usage callback which can interrogate the total number of inodes that were charged for a given inode. [ Applied fix from Colin King to make sure the 'ret' variable is initialized on the successful return path. 
Detected by CoverityScan, CID#1446616 ("Uninitialized scalar variable") --tytso] Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o Signed-off-by: Colin Ian King Acked-by: Jan Kara --- fs/ext4/inode.c | 7 ++++++ fs/ext4/ioctl.c | 6 +++++ fs/ext4/super.c | 21 +++++++++-------- fs/ext4/xattr.c | 55 +++++++++++++++++++++++++++++++++++++++++++ fs/ext4/xattr.h | 2 ++ fs/quota/dquot.c | 16 +++++++++---- include/linux/quota.h | 2 ++ 7 files changed, 95 insertions(+), 14 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 962f28a0e176..d9733aa955e9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5295,7 +5295,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) error = PTR_ERR(handle); goto err_out; } + + /* dquot_transfer() calls back ext4_get_inode_usage() which + * counts xattr inode references. + */ + down_read(&EXT4_I(inode)->xattr_sem); error = dquot_transfer(inode, attr); + up_read(&EXT4_I(inode)->xattr_sem); + if (error) { ext4_journal_stop(handle); return error; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index dde8deb11e59..42b3a73143cf 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -373,7 +373,13 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); if (!IS_ERR(transfer_to[PRJQUOTA])) { + + /* __dquot_transfer() calls back ext4_get_inode_usage() which + * counts xattr inode references. 
+ */ + down_read(&EXT4_I(inode)->xattr_sem); err = __dquot_transfer(inode, transfer_to); + up_read(&EXT4_I(inode)->xattr_sem); dqput(transfer_to[PRJQUOTA]); if (err) goto out_dirty; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d501f8256dc4..5ac76e8d4013 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1263,16 +1263,17 @@ static struct dquot **ext4_get_dquots(struct inode *inode) } static const struct dquot_operations ext4_quota_operations = { - .get_reserved_space = ext4_get_reserved_space, - .write_dquot = ext4_write_dquot, - .acquire_dquot = ext4_acquire_dquot, - .release_dquot = ext4_release_dquot, - .mark_dirty = ext4_mark_dquot_dirty, - .write_info = ext4_write_info, - .alloc_dquot = dquot_alloc, - .destroy_dquot = dquot_destroy, - .get_projid = ext4_get_projid, - .get_next_id = ext4_get_next_id, + .get_reserved_space = ext4_get_reserved_space, + .write_dquot = ext4_write_dquot, + .acquire_dquot = ext4_acquire_dquot, + .release_dquot = ext4_release_dquot, + .mark_dirty = ext4_mark_dquot_dirty, + .write_info = ext4_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, + .get_projid = ext4_get_projid, + .get_inode_usage = ext4_get_inode_usage, + .get_next_id = ext4_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index a4c8fe3692a2..22bfb6221a2d 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -733,6 +733,61 @@ static void ext4_xattr_update_super_block(handle_t *handle, } } +int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) +{ + struct ext4_iloc iloc = { .bh = NULL }; + struct buffer_head *bh = NULL; + struct ext4_inode *raw_inode; + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + qsize_t ea_inode_refs = 0; + void *end; + int ret; + + lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem); + + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + goto out; + 
raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + ret = xattr_check_inode(inode, header, end); + if (ret) + goto out; + + for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + ea_inode_refs++; + } + + if (EXT4_I(inode)->i_file_acl) { + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) { + ret = -EIO; + goto out; + } + + if (ext4_xattr_check_block(inode, bh)) { + ret = -EFSCORRUPTED; + goto out; + } + + for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + ea_inode_refs++; + } + *usage = ea_inode_refs + 1; + ret = 0; +out: + brelse(iloc.bh); + brelse(bh); + return ret; +} + static inline size_t round_up_cluster(struct inode *inode, size_t length) { struct super_block *sb = inode->i_sb; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 67616cb9a059..26119a67c8c3 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -193,3 +193,5 @@ extern void ext4_xattr_inode_set_class(struct inode *ea_inode); #else static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { } #endif + +extern int ext4_get_inode_usage(struct inode *inode, qsize_t *usage); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 48813aeaab80..53a17496c5c5 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1910,6 +1910,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) { qsize_t space, cur_space; qsize_t rsv_space = 0; + qsize_t inode_usage = 1; struct dquot *transfer_from[MAXQUOTAS] = {}; int cnt, ret = 0; char is_valid[MAXQUOTAS] = {}; @@ -1919,6 +1920,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) if (IS_NOQUOTA(inode)) return 0; + + if (inode->i_sb->dq_op->get_inode_usage) { + ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage); + if (ret) + return ret; + } + /* Initialize the arrays */ for (cnt = 0; 
cnt < MAXQUOTAS; cnt++) { warn_to[cnt].w_type = QUOTA_NL_NOWARN; @@ -1946,7 +1954,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) continue; is_valid[cnt] = 1; transfer_from[cnt] = i_dquot(inode)[cnt]; - ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]); + ret = check_idq(transfer_to[cnt], inode_usage, &warn_to[cnt]); if (ret) goto over_quota; ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]); @@ -1963,7 +1971,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) /* Due to IO error we might not have transfer_from[] structure */ if (transfer_from[cnt]) { int wtype; - wtype = info_idq_free(transfer_from[cnt], 1); + wtype = info_idq_free(transfer_from[cnt], inode_usage); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_inodes[cnt], transfer_from[cnt], wtype); @@ -1971,13 +1979,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_space[cnt], transfer_from[cnt], wtype); - dquot_decr_inodes(transfer_from[cnt], 1); + dquot_decr_inodes(transfer_from[cnt], inode_usage); dquot_decr_space(transfer_from[cnt], cur_space); dquot_free_reserved_space(transfer_from[cnt], rsv_space); } - dquot_incr_inodes(transfer_to[cnt], 1); + dquot_incr_inodes(transfer_to[cnt], inode_usage); dquot_incr_space(transfer_to[cnt], cur_space); dquot_resv_space(transfer_to[cnt], rsv_space); diff --git a/include/linux/quota.h b/include/linux/quota.h index 3434eef2a5aa..bfd077ca6ac3 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -332,6 +332,8 @@ struct dquot_operations { * quota code only */ qsize_t *(*get_reserved_space) (struct inode *); int (*get_projid) (struct inode *, kprojid_t *);/* Get project ID */ + /* Get number of inodes that were charged for a given inode */ + int (*get_inode_usage) (struct inode *, qsize_t *); /* Get next ID with active quota structure */ int (*get_next_id) (struct super_block *sb, struct kqid *qid); }; From 
9c6e7853c531c7cecca8c257bde767d089e5c880 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 22 Jun 2017 11:48:53 -0400 Subject: [PATCH 30/47] ext4: reserve space for xattr entries/names New ea_inode feature allows putting large xattr values into external inodes. struct ext4_xattr_entry and the attribute name however have to remain in the inode extra space or external attribute block. Once that space is exhausted, no further entries can be added. Some of that space could also be used by values that fit in there at the time of addition. So, a single xattr entry whose value barely fits in the external block could prevent further entries being added. To mitigate the problem, this patch introduces a notion of reserved space in the external attribute block that cannot be used by value data. This reserve is enforced when ea_inode feature is enabled. The amount of reserve is arbitrarily chosen to be min(block_size/8, 1024). The table below shows how much space is reserved for each block size and the guaranteed minimum number of entries that can be placed in the external attribute block. block size reserved bytes entries (name length = 16) 1k 128 3 2k 256 7 4k 512 15 8k 1024 31 16k 1024 31 32k 1024 31 64k 1024 31 Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 22bfb6221a2d..174d4e4a295f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1459,6 +1459,12 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, return 0; } +/* + * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode + * feature is enabled. 
+ */ +#define EXT4_XATTR_BLOCK_RESERVE(inode) min(i_blocksize(inode)/8, 1024U) + static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s, handle_t *handle, struct inode *inode) @@ -1518,6 +1524,20 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, ret = -ENOSPC; goto out; } + + /* + * If storing the value in an external inode is an option, + * reserve space for xattr entries/names in the external + * attribute block so that a long value does not occupy the + * whole space and prevent futher entries being added. + */ + if (ext4_has_feature_ea_inode(inode->i_sb) && new_size && + (s->end - s->base) == i_blocksize(inode) && + (min_offs + old_size - new_size) < + EXT4_XATTR_BLOCK_RESERVE(inode)) { + ret = -ENOSPC; + goto out; + } } /* From daf8328172dffabb4a6b5e1970d6e9628669f51c Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 22 Jun 2017 11:52:03 -0400 Subject: [PATCH 31/47] ext4: eliminate xattr entry e_hash recalculation for removes When an extended attribute block is modified, ext4_xattr_hash_entry() recalculates e_hash for the entry that is pointed by s->here. This is unnecessary if the modification is to remove an entry. Currently, if the removed entry is the last one and there are other entries remaining, hash calculation targets the just erased entry which has been filled with zeroes and effectively does nothing. If the removed entry is not the last one and there are more entries, this time it will recalculate hash on the next entry which is totally unnecessary. Fix these by moving the decision on when to recalculate hash to ext4_xattr_set_entry(). 
Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 50 +++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 174d4e4a295f..354c55c3f70c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -77,8 +77,9 @@ static void ext4_xattr_block_cache_insert(struct mb_cache *, static struct buffer_head * ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *, struct mb_cache_entry **); -static void ext4_xattr_rehash(struct ext4_xattr_header *, - struct ext4_xattr_entry *); +static void ext4_xattr_hash_entry(struct ext4_xattr_entry *entry, + void *value_base); +static void ext4_xattr_rehash(struct ext4_xattr_header *); static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, @@ -1467,7 +1468,8 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s, - handle_t *handle, struct inode *inode) + handle_t *handle, struct inode *inode, + bool is_block) { struct ext4_xattr_entry *last; struct ext4_xattr_entry *here = s->here; @@ -1531,8 +1533,8 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, * attribute block so that a long value does not occupy the * whole space and prevent futher entries being added. 
*/ - if (ext4_has_feature_ea_inode(inode->i_sb) && new_size && - (s->end - s->base) == i_blocksize(inode) && + if (ext4_has_feature_ea_inode(inode->i_sb) && + new_size && is_block && (min_offs + old_size - new_size) < EXT4_XATTR_BLOCK_RESERVE(inode)) { ret = -ENOSPC; @@ -1662,6 +1664,13 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, } here->e_value_size = cpu_to_le32(i->value_len); } + + if (is_block) { + if (i->value) + ext4_xattr_hash_entry(here, s->base); + ext4_xattr_rehash((struct ext4_xattr_header *)s->base); + } + ret = 0; out: iput(old_ea_inode); @@ -1751,14 +1760,11 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, mb_cache_entry_delete(ea_block_cache, hash, bs->bh->b_blocknr); ea_bdebug(bs->bh, "modifying in-place"); - error = ext4_xattr_set_entry(i, s, handle, inode); - if (!error) { - if (!IS_LAST_ENTRY(s->first)) - ext4_xattr_rehash(header(s->base), - s->here); + error = ext4_xattr_set_entry(i, s, handle, inode, + true /* is_block */); + if (!error) ext4_xattr_block_cache_insert(ea_block_cache, bs->bh); - } ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) @@ -1818,7 +1824,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, s->end = s->base + sb->s_blocksize; } - error = ext4_xattr_set_entry(i, s, handle, inode); + error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); if (error == -EFSCORRUPTED) goto bad_block; if (error) @@ -1841,9 +1847,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, } } - if (!IS_LAST_ENTRY(s->first)) - ext4_xattr_rehash(header(s->base), s->here); - inserted: if (!IS_LAST_ENTRY(s->first)) { new_bh = ext4_xattr_block_cache_find(inode, header(s->base), @@ -2076,7 +2079,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; - error = ext4_xattr_set_entry(i, s, handle, inode); + error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block 
*/); if (error) { if (error == -ENOSPC && ext4_has_inline_data(inode)) { @@ -2088,7 +2091,8 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, error = ext4_xattr_ibody_find(inode, i, is); if (error) return error; - error = ext4_xattr_set_entry(i, s, handle, inode); + error = ext4_xattr_set_entry(i, s, handle, inode, + false /* is_block */); } if (error) return error; @@ -2114,7 +2118,7 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; - error = ext4_xattr_set_entry(i, s, handle, inode); + error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); if (error) return error; header = IHDR(inode, ext4_raw_inode(&is->iloc)); @@ -2940,8 +2944,8 @@ ext4_xattr_block_cache_find(struct inode *inode, * * Compute the hash of an extended attribute. */ -static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, - struct ext4_xattr_entry *entry) +static void ext4_xattr_hash_entry(struct ext4_xattr_entry *entry, + void *value_base) { __u32 hash = 0; char *name = entry->e_name; @@ -2954,7 +2958,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, } if (!entry->e_value_inum && entry->e_value_size) { - __le32 *value = (__le32 *)((char *)header + + __le32 *value = (__le32 *)((char *)value_base + le16_to_cpu(entry->e_value_offs)); for (n = (le32_to_cpu(entry->e_value_size) + EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) { @@ -2976,13 +2980,11 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, * * Re-compute the extended attribute hash value after an entry has changed. 
*/ -static void ext4_xattr_rehash(struct ext4_xattr_header *header, - struct ext4_xattr_entry *entry) +static void ext4_xattr_rehash(struct ext4_xattr_header *header) { struct ext4_xattr_entry *here; __u32 hash = 0; - ext4_xattr_hash_entry(header, entry); here = ENTRY(header+1); while (!IS_LAST_ENTRY(here)) { if (!here->e_hash) { From b9fc761ea2d82e910e92f83d01bbbbe1f5e99bfc Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 22 Jun 2017 11:53:15 -0400 Subject: [PATCH 32/47] ext4: strong binding of xattr inode references To verify that a xattr entry is not pointing to the wrong xattr inode, we currently check that the target inode has EXT4_EA_INODE_FL flag set and also the entry size matches the target inode size. For stronger validation, also incorporate crc32c hash of the value into the e_hash field. This is done regardless of whether the entry lives in the inode body or external attribute block. Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 104 ++++++++++++++++++++++++++++++------------------ 1 file changed, 65 insertions(+), 39 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 354c55c3f70c..a5ad0ccdd1cb 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -77,8 +77,8 @@ static void ext4_xattr_block_cache_insert(struct mb_cache *, static struct buffer_head * ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *, struct mb_cache_entry **); -static void ext4_xattr_hash_entry(struct ext4_xattr_entry *entry, - void *value_base); +static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, + size_t value_count); static void ext4_xattr_rehash(struct ext4_xattr_header *); static const struct xattr_handler * const ext4_xattr_handler_map[] = { @@ -380,7 +380,9 @@ error: } static int -ext4_xattr_inode_verify_hash(struct inode *ea_inode, void *buffer, size_t size) +ext4_xattr_inode_verify_hashes(struct inode *ea_inode, + struct ext4_xattr_entry *entry, void *buffer, + size_t size) 
{ u32 hash; @@ -388,23 +390,35 @@ ext4_xattr_inode_verify_hash(struct inode *ea_inode, void *buffer, size_t size) hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size); if (hash != ext4_xattr_inode_get_hash(ea_inode)) return -EFSCORRUPTED; + + if (entry) { + __le32 e_hash, tmp_data; + + /* Verify entry hash. */ + tmp_data = cpu_to_le32(hash); + e_hash = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len, + &tmp_data, 1); + if (e_hash != entry->e_hash) + return -EFSCORRUPTED; + } return 0; } #define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) /* - * Read the value from the EA inode. + * Read xattr value from the EA inode. */ static int -ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer, - size_t size) +ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, + void *buffer, size_t size) { struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); struct inode *ea_inode; int err; - err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); + err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), + &ea_inode); if (err) { ea_inode = NULL; goto out; @@ -422,7 +436,7 @@ ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer, if (err) goto out; - err = ext4_xattr_inode_verify_hash(ea_inode, buffer, size); + err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size); /* * Compatibility check for old Lustre ea_inode implementation. 
Old * version does not have hash validation, but it has a backpointer @@ -489,9 +503,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { - error = ext4_xattr_inode_get(inode, - le32_to_cpu(entry->e_value_inum), - buffer, size); + error = ext4_xattr_inode_get(inode, entry, buffer, + size); if (error) goto cleanup; } else { @@ -539,9 +552,8 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { - error = ext4_xattr_inode_get(inode, - le32_to_cpu(entry->e_value_inum), - buffer, size); + error = ext4_xattr_inode_get(inode, entry, buffer, + size); if (error) goto cleanup; } else { @@ -1400,8 +1412,8 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) && i_size_read(ea_inode) == value_len && !ext4_xattr_inode_read(ea_inode, ea_data, value_len) && - !ext4_xattr_inode_verify_hash(ea_inode, ea_data, - value_len) && + !ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data, + value_len) && !memcmp(value, ea_data, value_len)) { mb_cache_entry_touch(ea_inode_cache, ce); mb_cache_entry_put(ea_inode_cache, ce); @@ -1665,12 +1677,36 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, here->e_value_size = cpu_to_le32(i->value_len); } - if (is_block) { - if (i->value) - ext4_xattr_hash_entry(here, s->base); - ext4_xattr_rehash((struct ext4_xattr_header *)s->base); + if (i->value) { + __le32 hash = 0; + + /* Entry hash calculation. */ + if (in_inode) { + __le32 crc32c_hash; + + /* + * Feed crc32c hash instead of the raw value for entry + * hash calculation. This is to avoid walking + * potentially long value buffer again. 
+ */ + crc32c_hash = cpu_to_le32( + ext4_xattr_inode_get_hash(new_ea_inode)); + hash = ext4_xattr_hash_entry(here->e_name, + here->e_name_len, + &crc32c_hash, 1); + } else if (is_block) { + __le32 *value = s->base + min_offs - new_size; + + hash = ext4_xattr_hash_entry(here->e_name, + here->e_name_len, value, + new_size >> 2); + } + here->e_hash = hash; } + if (is_block) + ext4_xattr_rehash((struct ext4_xattr_header *)s->base); + ret = 0; out: iput(old_ea_inode); @@ -2452,9 +2488,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, /* Save the entry name and the entry value */ if (entry->e_value_inum) { - error = ext4_xattr_inode_get(inode, - le32_to_cpu(entry->e_value_inum), - buffer, value_size); + error = ext4_xattr_inode_get(inode, entry, buffer, value_size); if (error) goto out; } else { @@ -2944,30 +2978,22 @@ ext4_xattr_block_cache_find(struct inode *inode, * * Compute the hash of an extended attribute. */ -static void ext4_xattr_hash_entry(struct ext4_xattr_entry *entry, - void *value_base) +static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, + size_t value_count) { __u32 hash = 0; - char *name = entry->e_name; - int n; - for (n = 0; n < entry->e_name_len; n++) { + while (name_len--) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ *name++; } - - if (!entry->e_value_inum && entry->e_value_size) { - __le32 *value = (__le32 *)((char *)value_base + - le16_to_cpu(entry->e_value_offs)); - for (n = (le32_to_cpu(entry->e_value_size) + - EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) { - hash = (hash << VALUE_HASH_SHIFT) ^ - (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ - le32_to_cpu(*value++); - } + while (value_count--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); } - entry->e_hash = cpu_to_le32(hash); + return cpu_to_le32(hash); } #undef NAME_HASH_SHIFT From cdb7ee4c632759075866bb8da5fb16b764e66ded 
Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 22 Jun 2017 11:55:14 -0400 Subject: [PATCH 33/47] ext4: add nombcache mount option The main purpose of mb cache is to achieve deduplication in extended attributes. In use cases where opportunity for deduplication is unlikely, it only adds overhead. Add a mount option to explicitly turn off mb cache. Suggested-by: Andreas Dilger Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/super.c | 34 +++++++++++++++++++++----------- fs/ext4/xattr.c | 52 +++++++++++++++++++++++++++++++++---------------- 3 files changed, 59 insertions(+), 28 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fe92a63c86cb..68ddd24db9a2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1114,6 +1114,7 @@ struct ext4_inode_info { /* * Mount flags set via mount options or defaults */ +#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5ac76e8d4013..1fec35bd4084 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1336,7 +1336,7 @@ enum { Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, - Opt_max_dir_size_kb, Opt_nojournal_checksum, + Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, }; static const match_table_t tokens = { @@ -1419,6 +1419,8 @@ static const match_table_t tokens = { {Opt_noinit_itable, "noinit_itable"}, {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_nombcache, "nombcache"}, + {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option 
from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1626,6 +1628,7 @@ static const struct mount_opts { {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, {Opt_max_dir_size_kb, 0, MOPT_GTE0}, {Opt_test_dummy_encryption, 0, MOPT_GTE0}, + {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_err, 0, 0} }; @@ -4080,19 +4083,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; no_journal: - sbi->s_ea_block_cache = ext4_xattr_create_cache(); - if (!sbi->s_ea_block_cache) { - ext4_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); - goto failed_mount_wq; - } - - if (ext4_has_feature_ea_inode(sb)) { - sbi->s_ea_inode_cache = ext4_xattr_create_cache(); - if (!sbi->s_ea_inode_cache) { + if (!test_opt(sb, NO_MBCACHE)) { + sbi->s_ea_block_cache = ext4_xattr_create_cache(); + if (!sbi->s_ea_block_cache) { ext4_msg(sb, KERN_ERR, - "Failed to create ea_inode_cache"); + "Failed to create ea_block_cache"); goto failed_mount_wq; } + + if (ext4_has_feature_ea_inode(sb)) { + sbi->s_ea_inode_cache = ext4_xattr_create_cache(); + if (!sbi->s_ea_inode_cache) { + ext4_msg(sb, KERN_ERR, + "Failed to create ea_inode_cache"); + goto failed_mount_wq; + } + } } if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && @@ -4989,6 +4995,12 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } } + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) { + ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount"); + err = -EINVAL; + goto restore_opts; + } + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { ext4_msg(sb, KERN_WARNING, "warning: refusing change of " "dax flag with busy inodes while remounting"); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index a5ad0ccdd1cb..34fa37e7744c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -991,10 +991,13 @@ static int ext4_xattr_inode_update_ref(handle_t 
*handle, struct inode *ea_inode, set_nlink(ea_inode, 1); ext4_orphan_del(handle, ea_inode); - hash = ext4_xattr_inode_get_hash(ea_inode); - mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash, - ea_inode->i_ino, - true /* reusable */); + if (ea_inode_cache) { + hash = ext4_xattr_inode_get_hash(ea_inode); + mb_cache_entry_create(ea_inode_cache, + GFP_NOFS, hash, + ea_inode->i_ino, + true /* reusable */); + } } } else { WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld", @@ -1008,9 +1011,11 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, clear_nlink(ea_inode); ext4_orphan_add(handle, ea_inode); - hash = ext4_xattr_inode_get_hash(ea_inode); - mb_cache_entry_delete(ea_inode_cache, hash, - ea_inode->i_ino); + if (ea_inode_cache) { + hash = ext4_xattr_inode_get_hash(ea_inode); + mb_cache_entry_delete(ea_inode_cache, hash, + ea_inode->i_ino); + } } } @@ -1194,7 +1199,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ - mb_cache_entry_delete(ea_block_cache, hash, bh->b_blocknr); + if (ea_block_cache) + mb_cache_entry_delete(ea_block_cache, hash, + bh->b_blocknr); get_bh(bh); unlock_buffer(bh); @@ -1214,11 +1221,13 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { struct mb_cache_entry *ce; - ce = mb_cache_entry_get(ea_block_cache, hash, - bh->b_blocknr); - if (ce) { - ce->e_reusable = 1; - mb_cache_entry_put(ea_block_cache, ce); + if (ea_block_cache) { + ce = mb_cache_entry_get(ea_block_cache, hash, + bh->b_blocknr); + if (ce) { + ce->e_reusable = 1; + mb_cache_entry_put(ea_block_cache, ce); + } } } @@ -1395,6 +1404,9 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); void *ea_data; + if (!ea_inode_cache) + return NULL; + ce = mb_cache_entry_find_first(ea_inode_cache, hash); if (!ce) 
return NULL; @@ -1465,8 +1477,9 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, return err; } - mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, - ea_inode->i_ino, true /* reusable */); + if (EA_INODE_CACHE(inode)) + mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, + ea_inode->i_ino, true /* reusable */); *ret_inode = ea_inode; return 0; @@ -1793,8 +1806,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, * ext4_xattr_block_set() to reliably detect modified * block */ - mb_cache_entry_delete(ea_block_cache, hash, - bs->bh->b_blocknr); + if (ea_block_cache) + mb_cache_entry_delete(ea_block_cache, hash, + bs->bh->b_blocknr); ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); @@ -2883,6 +2897,8 @@ ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache, EXT4_XATTR_REFCOUNT_MAX; int error; + if (!ea_block_cache) + return; error = mb_cache_entry_create(ea_block_cache, GFP_NOFS, hash, bh->b_blocknr, reusable); if (error) { @@ -2949,6 +2965,8 @@ ext4_xattr_block_cache_find(struct inode *inode, struct mb_cache_entry *ce; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); + if (!ea_block_cache) + return NULL; if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); From 3abb1a0fc2871f2db52199e1748a1d48a54a3427 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Jun 2017 23:49:46 -0400 Subject: [PATCH 34/47] ext4: avoid unnecessary stalls in ext4_evict_inode() These days inode reclaim calls evict_inode() only when it has no pages in the mapping. In that case it is not necessary to wait for transaction commit in ext4_evict_inode() as there can be no pages waiting to be committed. So avoid unnecessary transaction waiting in that case. We still have to keep the check for the case where ext4_evict_inode() gets called from other paths (e.g. 
umount) where inode still can have some page cache pages. Reported-by: Johannes Weiner Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d9733aa955e9..754c2190af31 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -215,7 +215,8 @@ void ext4_evict_inode(struct inode *inode) */ if (inode->i_ino != EXT4_JOURNAL_INO && ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_data.nrpages) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; From a015434480dcdbfdc188df9b3633348af745e1b1 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 22 Jun 2017 23:54:33 -0400 Subject: [PATCH 35/47] ext4: send parallel discards on commit completions Now, when we mount ext4 filesystem with '-o discard' option, we have to issue all the discard commands for the blocks to be deallocated and wait for the completion of the commands on the commit complete phase. Because this procedure might involve a lot of sequential combinations of issuing discard commands and waiting for that, the delay of this procedure might be far too long, up to 17.0s in our test, and it results in long commit delay and fsync() performance degradation. To reduce this kind of delay, instead of adding callback for each extent and handling all of them in a sequential manner on commit phase, we instead add a separate list of extents to free to the superblock and then process this list at once after transaction commits so that we can issue all the discard commands in a parallel manner like XFS filesystem. Finally, we could enhance the discard command handling performance. The result was such that 17.0s delay of a single commit in the worst case has been reduced to 4.8s. 
Signed-off-by: Daeho Jeong Signed-off-by: Theodore Ts'o Tested-by: Hobin Woo Tested-by: Kitae Lee Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 3 + fs/ext4/mballoc.c | 141 ++++++++++++++++++++++++++++++---------------- fs/ext4/mballoc.h | 6 +- fs/ext4/super.c | 3 + 4 files changed, 100 insertions(+), 53 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 68ddd24db9a2..9ebde0cd632e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1445,6 +1445,8 @@ struct ext4_sb_info { unsigned int *s_mb_maxs; unsigned int s_group_info_size; unsigned int s_mb_free_pending; + struct list_head s_freed_data_list; /* List of blocks to be freed + after commit completed */ /* tunables */ unsigned long s_stripe; @@ -2454,6 +2456,7 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb, extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); +extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); /* inode.c */ int ext4_inode_is_fast_symlink(struct inode *inode); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d109a2a2fea0..95425918875e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -367,8 +367,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); -static void ext4_free_data_callback(struct super_block *sb, - struct ext4_journal_cb_entry *jce, int rc); static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -2639,6 +2637,7 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&sbi->s_md_lock); spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_free_pending = 0; + INIT_LIST_HEAD(&sbi->s_freed_data_list); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; @@ -2782,7 +2781,8 @@ int 
ext4_mb_release(struct super_block *sb) } static inline int ext4_issue_discard(struct super_block *sb, - ext4_group_t block_group, ext4_grpblk_t cluster, int count) + ext4_group_t block_group, ext4_grpblk_t cluster, int count, + struct bio **biop) { ext4_fsblk_t discard_block; @@ -2791,18 +2791,18 @@ static inline int ext4_issue_discard(struct super_block *sb, count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); - return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); + if (biop) { + return __blkdev_issue_discard(sb->s_bdev, + (sector_t)discard_block << (sb->s_blocksize_bits - 9), + (sector_t)count << (sb->s_blocksize_bits - 9), + GFP_NOFS, 0, biop); + } else + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } -/* - * This function is called by the jbd2 layer once the commit has finished, - * so we know we can free the blocks that were released with that commit. - */ -static void ext4_free_data_callback(struct super_block *sb, - struct ext4_journal_cb_entry *jce, - int rc) +static void ext4_free_data_in_buddy(struct super_block *sb, + struct ext4_free_data *entry) { - struct ext4_free_data *entry = (struct ext4_free_data *)jce; struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0, count2 = 0; @@ -2810,18 +2810,6 @@ static void ext4_free_data_callback(struct super_block *sb, mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); - if (test_opt(sb, DISCARD)) { - err = ext4_issue_discard(sb, entry->efd_group, - entry->efd_start_cluster, - entry->efd_count); - if (err && err != -EOPNOTSUPP) - ext4_msg(sb, KERN_WARNING, "discard request in" - " group:%d block:%d count:%d failed" - " with %d", entry->efd_group, - entry->efd_start_cluster, - entry->efd_count, err); - } - err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); @@ -2862,6 +2850,56 @@ static 
void ext4_free_data_callback(struct super_block *sb, mb_debug(1, "freed %u blocks in %u structures\n", count, count2); } +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. + */ +void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_free_data *entry, *tmp; + struct bio *discard_bio = NULL; + struct list_head freed_data_list; + struct list_head *cut_pos = NULL; + int err; + + INIT_LIST_HEAD(&freed_data_list); + + spin_lock(&sbi->s_md_lock); + list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { + if (entry->efd_tid != commit_tid) + break; + cut_pos = &entry->efd_list; + } + if (cut_pos) + list_cut_position(&freed_data_list, &sbi->s_freed_data_list, + cut_pos); + spin_unlock(&sbi->s_md_lock); + + if (test_opt(sb, DISCARD)) { + list_for_each_entry(entry, &freed_data_list, efd_list) { + err = ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, + entry->efd_count, + &discard_bio); + if (err && err != -EOPNOTSUPP) { + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%d failed" + " with %d", entry->efd_group, + entry->efd_start_cluster, + entry->efd_count, err); + } else if (err == -EOPNOTSUPP) + break; + } + + if (discard_bio) + submit_bio_wait(discard_bio); + } + + list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) + ext4_free_data_in_buddy(sb, entry); +} + int __init ext4_init_mballoc(void) { ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, @@ -4583,14 +4621,28 @@ out: * are contiguous, AND the extents were freed by the same transaction, * AND the blocks are associated with the same group. 
*/ -static int can_merge(struct ext4_free_data *entry1, - struct ext4_free_data *entry2) +static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi, + struct ext4_free_data *entry, + struct ext4_free_data *new_entry, + struct rb_root *entry_rb_root) { - if ((entry1->efd_tid == entry2->efd_tid) && - (entry1->efd_group == entry2->efd_group) && - ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) - return 1; - return 0; + if ((entry->efd_tid != new_entry->efd_tid) || + (entry->efd_group != new_entry->efd_group)) + return; + if (entry->efd_start_cluster + entry->efd_count == + new_entry->efd_start_cluster) { + new_entry->efd_start_cluster = entry->efd_start_cluster; + new_entry->efd_count += entry->efd_count; + } else if (new_entry->efd_start_cluster + new_entry->efd_count == + entry->efd_start_cluster) { + new_entry->efd_count += entry->efd_count; + } else + return; + spin_lock(&sbi->s_md_lock); + list_del(&entry->efd_list); + spin_unlock(&sbi->s_md_lock); + rb_erase(&entry->efd_node, entry_rb_root); + kmem_cache_free(ext4_free_data_cachep, entry); } static noinline_for_stack int @@ -4646,29 +4698,19 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, node = rb_prev(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(entry, new_entry) && - ext4_journal_callback_try_del(handle, &entry->efd_jce)) { - new_entry->efd_start_cluster = entry->efd_start_cluster; - new_entry->efd_count += entry->efd_count; - rb_erase(node, &(db->bb_free_root)); - kmem_cache_free(ext4_free_data_cachep, entry); - } + ext4_try_merge_freed_extent(sbi, entry, new_entry, + &(db->bb_free_root)); } node = rb_next(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(new_entry, entry) && - ext4_journal_callback_try_del(handle, &entry->efd_jce)) { - new_entry->efd_count += entry->efd_count; - rb_erase(node, &(db->bb_free_root)); - kmem_cache_free(ext4_free_data_cachep, 
entry); - } + ext4_try_merge_freed_extent(sbi, entry, new_entry, + &(db->bb_free_root)); } - /* Add the extent to transaction's private list */ - new_entry->efd_jce.jce_func = ext4_free_data_callback; + spin_lock(&sbi->s_md_lock); - _ext4_journal_callback_add(handle, &new_entry->efd_jce); + list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list); sbi->s_mb_free_pending += clusters; spin_unlock(&sbi->s_md_lock); return 0; @@ -4871,7 +4913,8 @@ do_more: * them with group lock_held */ if (test_opt(sb, DISCARD)) { - err = ext4_issue_discard(sb, block_group, bit, count); + err = ext4_issue_discard(sb, block_group, bit, count, + NULL); if (err && err != -EOPNOTSUPP) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%d block:%d count:%lu failed" @@ -5094,7 +5137,7 @@ __acquires(bitlock) */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ret = ext4_issue_discard(sb, group, start, count); + ret = ext4_issue_discard(sb, group, start, count, NULL); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); return ret; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 2bed62084a8c..009300ee1561 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -78,10 +78,8 @@ do { \ struct ext4_free_data { - /* MUST be the first member */ - struct ext4_journal_cb_entry efd_jce; - - /* ext4_free_data private data starts from here */ + /* this links the free block information from sb_info */ + struct list_head efd_list; /* this links the free block information from group_info */ struct rb_node efd_node; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1fec35bd4084..cb9af5d5c29b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -373,6 +373,9 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) struct ext4_journal_cb_entry *jce; BUG_ON(txn->t_state == T_FINISHED); + + ext4_process_freed_data(sb, txn->t_tid); + spin_lock(&sbi->s_md_lock); while (!list_empty(&txn->t_private_list)) { jce = 
list_entry(txn->t_private_list.next, From 9ce0151a47f6fa8e4b3b35785aac0d51adbb06ca Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 23 Jun 2017 00:10:36 -0400 Subject: [PATCH 36/47] ext4: forbid encrypting root directory Currently it's possible to encrypt all files and directories on an ext4 filesystem by deleting everything, including lost+found, then setting an encryption policy on the root directory. However, this is incompatible with e2fsck because e2fsck expects to find, create, and/or write to lost+found and does not have access to any encryption keys. Especially problematic is that if e2fsck can't find lost+found, it will create it without regard for whether the root directory is encrypted. This is wrong for obvious reasons, and it causes a later run of e2fsck to consider the lost+found directory entry to be corrupted. Encrypting the root directory may also be of limited use because it is the "all-or-nothing" use case, for which dm-crypt can be used instead. (By design, encryption policies are inherited and cannot be overridden; so the root directory having an encryption policy implies that all files and directories on the filesystem have that same encryption policy.) In any case, encrypting the root directory is broken currently and must not be allowed; so start returning an error if userspace requests it. For now only do this in ext4, because f2fs and ubifs do not appear to have the lost+found requirement. We could move it into fscrypt_ioctl_set_policy() later if desired, though. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger --- fs/ext4/super.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cb9af5d5c29b..56c971807df5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1152,6 +1152,15 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, handle_t *handle = fs_data; int res, res2, credits, retries = 0; + /* + * Encrypting the root directory is not allowed because e2fsck expects + * lost+found to exist and be unencrypted, and encrypting the root + * directory would imply encrypting the lost+found directory as well as + * the filename "lost+found" itself. + */ + if (inode->i_ino == EXT4_ROOT_INO) + return -EPERM; + res = ext4_convert_inline_data(inode); if (res) return res; From 6febe6f253a5bb5c7ac2080d688bd4b75e019d4d Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Fri, 23 Jun 2017 00:29:05 -0400 Subject: [PATCH 37/47] ext4: return EIO on read error in ext4_find_entry Previously, a read error would be ignored and we would eventually return NULL from ext4_find_entry, which signals "no such file or directory". We should be returning EIO. 
Signed-off-by: Khazhismel Kumykov --- fs/ext4/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 423e1f761768..2a7f2dc7f4dd 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1442,11 +1442,11 @@ restart: goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - /* read error, skip block & hope for the best */ EXT4_ERROR_INODE(dir, "reading directory lblock %lu", (unsigned long) block); brelse(bh); - goto next; + ret = ERR_PTR(-EIO); + goto cleanup_and_exit; } if (!buffer_verified(bh) && !is_dx_internal_node(dir, block, From bdddf342796765a1a946e7c4aed2574f4488e4e5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 23 Jun 2017 00:47:05 -0400 Subject: [PATCH 38/47] ext4: return EFSBADCRC if a bad checksum error is found in ext4_find_entry() Previously a bad directory block with a bad checksum is skipped; we should be returning EFSBADCRC (aka EBADMSG). Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2a7f2dc7f4dd..13f0cadb1238 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1456,7 +1456,8 @@ restart: EXT4_ERROR_INODE(dir, "checksumming directory " "block %lu", (unsigned long)block); brelse(bh); - goto next; + ret = ERR_PTR(-EFSBADCRC); + goto cleanup_and_exit; } set_buffer_verified(bh); i = search_dirblock(bh, dir, &fname, From 4a4956249dac0b9b0027949907bff0cd1a9b57fa Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Jun 2017 00:58:57 -0400 Subject: [PATCH 39/47] ext4: fix off-by-one fsmap error on 1k block filesystems For 1k-block filesystems, the filesystem starts at block 1, not block 0. This fact is recorded in s_first_data_block, so use that to bump up the start_fsb before we start querying the filesystem for its space map. 
Without this, ext4/026 fails on 1k block ext4 because various functions (notably ext4_get_group_no_and_offset) don't know what to do with an fsblock that is "before" the start of the filesystem and return garbage results (blockgroup 2^32-1, etc.) that confuse fsmap. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o --- fs/ext4/fsmap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index b19436098837..7ec340898598 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -480,6 +480,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start_fsb; ext4_fsblk_t end_fsb; + ext4_fsblk_t bofs; ext4_fsblk_t eofs; ext4_group_t start_ag; ext4_group_t end_ag; @@ -487,9 +488,12 @@ static int ext4_getfsmap_datadev(struct super_block *sb, ext4_grpblk_t last_cluster; int error = 0; + bofs = le32_to_cpu(sbi->s_es->s_first_data_block); eofs = ext4_blocks_count(sbi->s_es); if (keys[0].fmr_physical >= eofs) return 0; + else if (keys[0].fmr_physical < bofs) + keys[0].fmr_physical = bofs; if (keys[1].fmr_physical >= eofs) keys[1].fmr_physical = eofs - 1; start_fsb = keys[0].fmr_physical; From 1ea1516fbbab2b30bf98c534ecaacba579a35208 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 23 Jun 2017 01:08:22 -0400 Subject: [PATCH 40/47] ext4: check return value of kstrtoull correctly in reserved_clusters_store kstrtoull returns 0 on success. However, reserved_clusters_store returns -EINVAL when kstrtoull returns 0, which makes it impossible to update the reserved_clusters value through sysfs.
Fixes: 76d33bca5581b1dd5c3157fa168db849a784ada4 Cc: stable@vger.kernel.org # 4.4 Signed-off-by: Chao Yu Signed-off-by: Miao Xie Signed-off-by: Theodore Ts'o --- fs/ext4/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index d74dc5f81a04..48c7a7d55ed3 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -100,7 +100,7 @@ static ssize_t reserved_clusters_store(struct ext4_attr *a, int ret; ret = kstrtoull(skip_spaces(buf), 0, &val); - if (!ret || val >= clusters) + if (ret || val >= clusters) return -EINVAL; atomic64_set(&sbi->s_resv_clusters, val); From 66e0aaadce7fc99e91e0b427e2b177e14d0b951b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 23 Jun 2017 19:41:38 -0400 Subject: [PATCH 41/47] ext4: don't bother checking for encryption key in ->mmap() Since only an open file can be mmap'ed, and we only allow open()ing an encrypted file when its key is available, there is no need to check for the key again before permitting each mmap(). Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 02ce7e7bbdf5..736538911f00 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -345,13 +345,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; - if (ext4_encrypted_inode(inode)) { - int err = fscrypt_get_encryption_info(inode); - if (err) - return 0; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; From 63136858aecbe86dbd3c3289a3f46ba1b5f92239 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 23 Jun 2017 19:48:44 -0400 Subject: [PATCH 42/47] ext4: require key for truncate(2) of encrypted file Currently, filesystems allow truncate(2) on an encrypted file without the encryption key. 
However, it's impossible to correctly handle the case where the size being truncated to is not a multiple of the filesystem block size, because that would require decrypting the final block, zeroing the part beyond i_size, then encrypting the block. As other modifications to encrypted file contents are prohibited without the key, just prohibit truncate(2) as well, making it fail with ENOKEY. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 754c2190af31..daed9b38362a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5323,6 +5323,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) loff_t oldsize = inode->i_size; int shrink = (attr->ia_size <= inode->i_size); + if (ext4_encrypted_inode(inode)) { + error = fscrypt_get_encryption_info(inode); + if (error) + return error; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); From 407cd7fb83c0ebabb490190e673d8c71ee7df97e Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Tue, 4 Jul 2017 00:11:21 -0400 Subject: [PATCH 43/47] ext4: change fast symlink test to not rely on i_blocks ext4_inode_info->i_data is the storage area for 4 types of data: a) Extents data b) Inline data c) Block map d) Fast symlink data (symlink length < 60) Extents data case is positively identified by EXT4_INODE_EXTENTS flag. Inline data case is also obvious because of EXT4_INODE_INLINE_DATA flag. Distinguishing c) and d) however requires additional logic. This currently relies on i_blocks count. After subtracting external xattr block from i_blocks, if it is greater than 0 then we know that some data blocks exist, so there must be a block map. This logic got broken after ea_inode feature was added. 
That feature charges the data blocks of external xattr inodes to the referencing inode and so adds them to the i_blocks. To fix this, we could subtract ea_inode blocks by iterating through all xattr entries and then check whether remaining i_blocks count is zero. Besides being complicated, this won't change the fact that the current way of distinguishing between c) and d) is fragile. The alternative solution is to test whether i_size is less than 60 to determine fast symlink case. ext4_symlink() uses the same test to decide whether to store the symlink in i_data. There is one caveat to address before this can work though. If an inode's i_nlink is zero during eviction, its i_size is set to zero and its data is truncated. If the system crashes before the inode is removed from the orphan list, the next boot's orphan cleanup may find the inode with zero i_size. So, a symlink that had its data stored in a block may now appear to be a fast symlink. The solution used in this patch is to treat i_size = 0 as a non-fast symlink case. A zero-sized symlink is not legal so the only time this can happen is the mentioned scenario. This is also logically correct because an i_size = 0 symlink has no data stored in i_data. Suggested-by: Andreas Dilger Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger --- fs/ext4/inode.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index daed9b38362a..3c600f02673f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -144,16 +144,12 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, /* * Test whether an inode is a fast symlink. + * A fast symlink has its symlink data stored in ext4_inode_info->i_data. */ int ext4_inode_is_fast_symlink(struct inode *inode) { - int ea_blocks = EXT4_I(inode)->i_file_acl ?
- EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; - - if (ext4_has_inline_data(inode)) - return 0; - - return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); + return S_ISLNK(inode->i_mode) && inode->i_size && + (inode->i_size < EXT4_N_BLOCKS * 4); } /* @@ -261,6 +257,16 @@ void ext4_evict_inode(struct inode *inode) if (IS_SYNC(inode)) ext4_handle_sync(handle); + + /* + * Set inode->i_size to 0 before calling ext4_truncate(). We need + * special handling of symlinks here because i_size is used to + * determine whether ext4_inode_info->i_data contains symlink data or + * block mappings. Setting i_size to 0 will remove its fast symlink + * status. Erase i_data so that it becomes a valid empty block map. + */ + if (ext4_inode_is_fast_symlink(inode)) + memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data)); inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { From 2a527d6858c246db8afc3d576dbcbff0902f933b Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 5 Jul 2017 00:56:21 -0400 Subject: [PATCH 44/47] fs: generic_block_bmap(): initialize all of the fields in the temp bh KMSAN (KernelMemorySanitizer, a new error detection tool) reports the use of uninitialized memory in ext4_update_bh_state(): ================================================================== BUG: KMSAN: use of unitialized memory CPU: 3 PID: 1 Comm: swapper/0 Tainted: G B 4.8.0-rc6+ #597 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 0000000000000282 ffff88003cc96f68 ffffffff81f30856 0000003000000008 ffff88003cc96f78 0000000000000096 ffffffff8169742a ffff88003cc96ff8 ffffffff812fc1fc 0000000000000008 ffff88003a1980e8 0000000100000000 Call Trace: [< inline >] __dump_stack lib/dump_stack.c:15 [] dump_stack+0xa6/0xc0 lib/dump_stack.c:51 [] kmsan_report+0x1ec/0x300 mm/kmsan/kmsan.c:? [] __msan_warning+0x2b/0x40 ??:? 
[< inline >] ext4_update_bh_state fs/ext4/inode.c:727 [] _ext4_get_block+0x6ca/0x8a0 fs/ext4/inode.c:759 [] ext4_get_block+0x8c/0xa0 fs/ext4/inode.c:769 [] generic_block_bmap+0x246/0x2b0 fs/buffer.c:2991 [] ext4_bmap+0x5ee/0x660 fs/ext4/inode.c:3177 ... origin description: ----tmp@generic_block_bmap ================================================================== (the line numbers are relative to 4.8-rc6, but the bug persists upstream) The local |tmp| is created in generic_block_bmap() and then passed into ext4_bmap() => ext4_get_block() => _ext4_get_block() => ext4_update_bh_state(). Along the way tmp.b_page is never initialized before ext4_update_bh_state() checks its value. [ Use the approach suggested by Kees Cook of initializing the whole bh structure.] Signed-off-by: Alexander Potapenko Signed-off-by: Theodore Ts'o --- fs/buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 161be58c5cb0..a3399aa6a2bd 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3021,11 +3021,11 @@ EXPORT_SYMBOL(block_write_full_page); sector_t generic_block_bmap(struct address_space *mapping, sector_t block, get_block_t *get_block) { - struct buffer_head tmp; struct inode *inode = mapping->host; - tmp.b_state = 0; - tmp.b_blocknr = 0; - tmp.b_size = i_blocksize(inode); + struct buffer_head tmp = { + .b_size = i_blocksize(inode), + }; + get_block(inode, block, &tmp, 0); return tmp.b_blocknr; } From ad47f9533994d7e3d2dbfa4fffe85934a1627edc Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 6 Jul 2017 00:00:59 -0400 Subject: [PATCH 45/47] ext4: skip ext4_init_security() and encryption on ea_inodes Extended attribute inodes are internal to ext4. Adding encryption/security related attributes on them would mean dealing with nested calls into ea code. Since they have no direct exposure to user mode, just avoid creating ea entries for them. 
Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index fb1b3df17f6e..0c79e3efcaf7 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -771,7 +771,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) && + !(i_flags & EXT4_EA_INODE_FL)) { err = fscrypt_get_encryption_info(dir); if (err) return ERR_PTR(err); @@ -1114,11 +1115,11 @@ got: err = ext4_init_acl(handle, inode, dir); if (err) goto fail_free_drop; - } - err = ext4_init_security(handle, inode, dir, qstr); - if (err) - goto fail_free_drop; + err = ext4_init_security(handle, inode, dir, qstr); + if (err) + goto fail_free_drop; + } if (ext4_has_feature_extents(sb)) { /* set extent flag only for directory, file and normal symlink*/ From af65207c76ce8e6263a3b097ea35365dde9913d0 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 6 Jul 2017 00:01:59 -0400 Subject: [PATCH 46/47] ext4: fix __ext4_new_inode() journal credits calculation ea_inode feature allows creating extended attributes that are up to 64k in size. Update __ext4_new_inode() to pick increased credit limits. To avoid overallocating too many journal credits, update __ext4_xattr_set_credits() to make a distinction between xattr create vs update. This helps __ext4_new_inode() because all attributes are known to be new, so we can save credits that are normally needed to delete old values. Also, have fscrypt specify its maximum context size so that we don't end up allocating credits for 64k size. 
Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/crypto/policy.c | 1 + fs/ext4/acl.c | 13 +++++---- fs/ext4/ialloc.c | 52 +++++++++++++++++++++++++++++----- fs/ext4/super.c | 3 +- fs/ext4/xattr.c | 46 +++++++++++++++++------------- fs/ext4/xattr.h | 5 +++- include/linux/fscrypt_common.h | 3 ++ 7 files changed, 89 insertions(+), 34 deletions(-) diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 210976e7a269..94becf5a1519 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -260,6 +260,7 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, memcpy(ctx.master_key_descriptor, ci->ci_master_key, FS_KEY_DESCRIPTOR_SIZE); get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE); res = parent->i_sb->s_cop->set_context(child, &ctx, sizeof(ctx), fs_data); if (res) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 8db03e5c78bc..09441ae07a5b 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -183,7 +183,7 @@ ext4_get_acl(struct inode *inode, int type) */ static int __ext4_set_acl(handle_t *handle, struct inode *inode, int type, - struct posix_acl *acl) + struct posix_acl *acl, int xattr_flags) { int name_index; void *value = NULL; @@ -218,7 +218,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, } error = ext4_xattr_set_handle(handle, inode, name_index, "", - value, size, 0); + value, size, xattr_flags); kfree(value); if (!error) @@ -238,7 +238,8 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (error) return error; retry: - error = ext4_xattr_set_credits(inode, acl_size, &credits); + error = ext4_xattr_set_credits(inode, acl_size, false /* is_create */, + &credits); if (error) return error; @@ -246,7 +247,7 @@ retry: if (IS_ERR(handle)) return PTR_ERR(handle); - error = __ext4_set_acl(handle, inode, type, acl); + error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); ext4_journal_stop(handle); if (error == 
-ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -271,13 +272,13 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) if (default_acl) { error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, - default_acl); + default_acl, XATTR_CREATE); posix_acl_release(default_acl); } if (acl) { if (!error) error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, - acl); + acl, XATTR_CREATE); posix_acl_release(acl); } return error; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 0c79e3efcaf7..507bfb3344d4 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -766,11 +766,13 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if (!dir || !dir->i_nlink) return ERR_PTR(-EPERM); - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + sb = dir->i_sb; + sbi = EXT4_SB(sb); + + if (unlikely(ext4_forced_shutdown(sbi))) return ERR_PTR(-EIO); - if ((ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) && + if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) && !(i_flags & EXT4_EA_INODE_FL)) { err = fscrypt_get_encryption_info(dir); @@ -778,19 +780,55 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, return ERR_PTR(err); if (!fscrypt_has_encryption_key(dir)) return ERR_PTR(-ENOKEY); - if (!handle) - nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb); encrypt = 1; } - sb = dir->i_sb; + if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { +#ifdef CONFIG_EXT4_FS_POSIX_ACL + struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + + if (p) { + int acl_size = p->a_count * sizeof(ext4_acl_entry); + + nblocks += (S_ISDIR(mode) ? 
2 : 1) * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, acl_size, + true /* is_create */); + posix_acl_release(p); + } +#endif + +#ifdef CONFIG_SECURITY + { + int num_security_xattrs = 1; + +#ifdef CONFIG_INTEGRITY + num_security_xattrs++; +#endif + /* + * We assume that security xattrs are never + * more than 1k. In practice they are under + * 128 bytes. + */ + nblocks += num_security_xattrs * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, 1024, + true /* is_create */); + } +#endif + if (encrypt) + nblocks += __ext4_xattr_set_credits(sb, + NULL /* inode */, NULL /* block_bh */, + FSCRYPT_SET_CONTEXT_MAX_SIZE, + true /* is_create */); + } + ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); ei = EXT4_I(inode); - sbi = EXT4_SB(sb); /* * Initialize owners and quota early so that we don't have to account diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 56c971807df5..f666042a3d58 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1194,7 +1194,8 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, if (res) return res; retry: - res = ext4_xattr_set_credits(inode, len, &credits); + res = ext4_xattr_set_credits(inode, len, false /* is_create */, + &credits); if (res) return res; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 34fa37e7744c..cff4f41ced61 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -830,11 +830,10 @@ static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len) dquot_free_inode(inode); } -static int __ext4_xattr_set_credits(struct inode *inode, - struct buffer_head *block_bh, - size_t value_len) +int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, + struct buffer_head *block_bh, size_t value_len, + bool is_create) { - struct super_block *sb = inode->i_sb; int credits; int blocks; @@ -860,7 +859,7 @@ static int __ext4_xattr_set_credits(struct 
inode *inode, * In case of inline data, we may push out the data to a block, * so we need to reserve credits for this eventuality */ - if (ext4_has_inline_data(inode)) + if (inode && ext4_has_inline_data(inode)) credits += ext4_writepage_trans_blocks(inode) + 1; /* We are done if ea_inode feature is not enabled. */ @@ -882,19 +881,23 @@ static int __ext4_xattr_set_credits(struct inode *inode, /* Blocks themselves. */ credits += blocks; - /* Dereference ea_inode holding old xattr value. - * Old ea_inode, inode map, block bitmap, group descriptor. - */ - credits += 4; + if (!is_create) { + /* Dereference ea_inode holding old xattr value. + * Old ea_inode, inode map, block bitmap, group descriptor. + */ + credits += 4; - /* Data blocks for old ea_inode. */ - blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits; + /* Data blocks for old ea_inode. */ + blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits; - /* Indirection block or one level of extent tree for old ea_inode. */ - blocks += 1; + /* Indirection block or one level of extent tree for old + * ea_inode. + */ + blocks += 1; - /* Block bitmap and group descriptor updates for each block. */ - credits += blocks * 2; + /* Block bitmap and group descriptor updates for each block. */ + credits += blocks * 2; + } /* We may need to clone the existing xattr block in which case we need * to increment ref counts for existing ea_inodes referenced by it. 
@@ -2263,7 +2266,9 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, goto cleanup; } - credits = __ext4_xattr_set_credits(inode, bh, value_len); + credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, + value_len, + flags & XATTR_CREATE); brelse(bh); if (!ext4_handle_has_enough_credits(handle, credits)) { @@ -2370,7 +2375,8 @@ cleanup: return error; } -int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits) +int ext4_xattr_set_credits(struct inode *inode, size_t value_len, + bool is_create, int *credits) { struct buffer_head *bh; int err; @@ -2386,7 +2392,8 @@ int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits) if (IS_ERR(bh)) { err = PTR_ERR(bh); } else { - *credits = __ext4_xattr_set_credits(inode, bh, value_len); + *credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, + value_len, is_create); brelse(bh); err = 0; } @@ -2417,7 +2424,8 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, return error; retry: - error = ext4_xattr_set_credits(inode, value_len, &credits); + error = ext4_xattr_set_credits(inode, value_len, flags & XATTR_CREATE, + &credits); if (error) return error; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 26119a67c8c3..0d2dde1fa87a 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -153,7 +153,10 @@ extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len, - int *credits); + bool is_create, int *credits); +extern int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, + struct buffer_head *block_bh, size_t value_len, + bool is_create); extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct 
ext4_xattr_inode_array **array, diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h index 0a30c106c1e5..82beaf70e7e2 100644 --- a/include/linux/fscrypt_common.h +++ b/include/linux/fscrypt_common.h @@ -83,6 +83,9 @@ struct fscrypt_operations { unsigned (*max_namelen)(struct inode *); }; +/* Maximum value for the third parameter of fscrypt_operations.set_context(). */ +#define FSCRYPT_SET_CONTEXT_MAX_SIZE 28 + static inline bool fscrypt_dummy_context_enabled(struct inode *inode) { if (inode->i_sb->s_cop->dummy_context && From ff95015648df445999c8483270905f7d3dec51e1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 6 Jul 2017 15:28:45 -0400 Subject: [PATCH 47/47] ext4: fix spelling mistake: "prellocated" -> "preallocated" Trivial fix to spelling mistake in mb_debug debug message Signed-off-by: Colin Ian King Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 95425918875e..581e357e8406 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3567,7 +3567,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_set_bits(bitmap, start, len); preallocated += len; } - mb_debug(1, "prellocated %u for group %u\n", preallocated, group); + mb_debug(1, "preallocated %u for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head)