From 353b67d8ced4dc53281c88150ad295e24bc4b4c5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 26 Nov 2011 00:35:39 +0100 Subject: [PATCH 1/3] jbd: Issue cache flush after checkpointing When we reach cleanup_journal_tail(), there is no guarantee that checkpointed buffers are on a stable storage - especially if buffers were written out by log_do_checkpoint(), they are likely to be only in disk's caches. Thus when we update journal superblock, effectively removing old transaction from journal, this write of superblock can get to stable storage before those checkpointed buffers which can result in filesystem corruption after a crash. A similar problem can happen if we replay the journal and wipe it before flushing disk's caches. Thus we must unconditionally issue a cache flush before we update journal superblock in these cases. The fix is slightly complicated by the fact that we have to get log tail before we issue cache flush but we can store it in the journal superblock only after the cache flush. Otherwise we risk races where new tail is written before appropriate cache flush is finished. I managed to reproduce the corruption using somewhat tweaked Chris Mason's barrier-test scheduler. Also this should fix occasional reports of 'Bit already freed' filesystem errors which are totally unreproducible but inspection of several fs images I've gathered over time points to a problem like this. CC: stable@kernel.org Signed-off-by: Jan Kara --- fs/jbd/checkpoint.c | 27 ++++++++++++++++++++++----- fs/jbd/recovery.c | 4 ++++ 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 5d1a00a5041b..05f0754f2b46 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -453,8 +453,6 @@ out: * * Return <0 on error, 0 on success, 1 if there was nothing to clean up. * - * Called with the journal lock held. - * * This is the only part of the journaling code which really needs to be * aware of transaction aborts. Checkpointing involves writing to the * main filesystem area rather than to the journal, so it can proceed @@ -472,13 +470,14 @@ int cleanup_journal_tail(journal_t *journal) if (is_journal_aborted(journal)) return 1; - /* OK, work out the oldest transaction remaining in the log, and + /* + * OK, work out the oldest transaction remaining in the log, and * the log block it starts at. * * If the log is now empty, we need to work out which is the * next transaction ID we will write, and where it will - * start. */ - + * start. + */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); transaction = journal->j_checkpoint_transactions; @@ -504,7 +503,25 @@ int cleanup_journal_tail(journal_t *journal) spin_unlock(&journal->j_state_lock); return 1; } + spin_unlock(&journal->j_state_lock); + /* + * We need to make sure that any blocks that were recently written out + * --- perhaps by log_do_checkpoint() --- are flushed out before we + * drop the transactions from the journal. It's unlikely this will be + * necessary, especially with an appropriately sized journal, but we + * need this to guarantee correctness. Fortunately + * cleanup_journal_tail() doesn't get called all that often. + */ + if (journal->j_flags & JFS_BARRIER) + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + + spin_lock(&journal->j_state_lock); + if (!tid_gt(first_tid, journal->j_tail_sequence)) { + spin_unlock(&journal->j_state_lock); + /* Someone else cleaned up journal so return 0 */ + return 0; + } /* OK, update the superblock to recover the freed space. * Physical blocks come first: have we wrapped beyond the end of * the log? */ diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index 5b43e96788e6..008bf062fd26 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -20,6 +20,7 @@ #include #include #include +#include #endif /* @@ -263,6 +264,9 @@ int journal_recover(journal_t *journal) err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; + /* Flush disk caches to get replayed data on the permanent storage */ + if (journal->j_flags & JFS_BARRIER) + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); return err; } From 34b07840565004cfa444e165e88bf77a5cbcdb25 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Mon, 9 Jan 2012 15:58:37 +0100 Subject: [PATCH 2/3] ext2: protect inode changes in the SETVERSION and SETFLAGS ioctls Unlock mutex after i_flags and i_ctime updates in the EXT2_IOC_SETFLAGS ioctl. Use i_mutex in the EXT2_IOC_SETVERSION ioctl to protect i_ctime and i_generation updates and make the ioctl consistent since i_mutex is also used in other places to protect timestamps and inode changes. Cc: Andreas Dilger Cc: Jan Kara Signed-off-by: Djalal Harouni Signed-off-by: Jan Kara --- fs/ext2/ioctl.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 1089f760c847..2de655f5d625 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -77,10 +77,11 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = flags & EXT2_FL_USER_MODIFIABLE; flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE; ei->i_flags = flags; - mutex_unlock(&inode->i_mutex); ext2_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME_SEC; + mutex_unlock(&inode->i_mutex); + mark_inode_dirty(inode); setflags_out: mnt_drop_write_file(filp); @@ -88,20 +89,29 @@ setflags_out: } case EXT2_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *) arg); - case EXT2_IOC_SETVERSION: + case EXT2_IOC_SETVERSION: { + __u32 generation; + if (!inode_owner_or_capable(inode)) return -EPERM; ret = mnt_want_write_file(filp); if (ret) return ret; - if (get_user(inode->i_generation, (int __user *) arg)) { + if (get_user(generation, (int __user *) arg)) { ret = -EFAULT; - } else { - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); + goto setversion_out; } + + mutex_lock(&inode->i_mutex); + inode->i_ctime = CURRENT_TIME_SEC; + inode->i_generation = generation; + mutex_unlock(&inode->i_mutex); + + mark_inode_dirty(inode); +setversion_out: mnt_drop_write_file(filp); return ret; + } case EXT2_IOC_GETRSVSZ: if (test_opt(inode->i_sb, RESERVATION) && S_ISREG(inode->i_mode) From 46fe44ce8777f087aa8ad4a2605fdcfb9c2d63af Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 16 Nov 2011 15:03:59 +0100 Subject: [PATCH 3/3] quota: Pass information that quota is stored in system file to userspace Quota tools need to know whether quota is stored in a system file or in classical aquota.{user|group} files. So pass this information as a flag in GETINFO quotactl. Signed-off-by: Jan Kara --- fs/quota/dquot.c | 8 +++++--- include/linux/quota.h | 6 +++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 5ec59b20cf76..46741970371b 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2125,6 +2125,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, mutex_unlock(&dqopt->dqio_mutex); goto out_file_init; } + if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) + dqopt->info[type].dqi_flags |= DQF_SYS_FILE; mutex_unlock(&dqopt->dqio_mutex); spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(flags, type); @@ -2464,7 +2466,7 @@ int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) spin_lock(&dq_data_lock); ii->dqi_bgrace = mi->dqi_bgrace; ii->dqi_igrace = mi->dqi_igrace; - ii->dqi_flags = mi->dqi_flags & DQF_MASK; + ii->dqi_flags = mi->dqi_flags & DQF_GETINFO_MASK; ii->dqi_valid = IIF_ALL; spin_unlock(&dq_data_lock); mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); @@ -2490,8 +2492,8 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) if (ii->dqi_valid & IIF_IGRACE) mi->dqi_igrace = ii->dqi_igrace; if (ii->dqi_valid & IIF_FLAGS) - mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) | - (ii->dqi_flags & DQF_MASK); + mi->dqi_flags = (mi->dqi_flags & ~DQF_SETINFO_MASK) | + (ii->dqi_flags & DQF_SETINFO_MASK); spin_unlock(&dq_data_lock); mark_info_dirty(sb, type); /* Force write to disk */ diff --git a/include/linux/quota.h b/include/linux/quota.h index cb7855699037..c09fa042b5ea 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -230,7 +230,11 @@ struct mem_dqinfo { struct super_block; #define DQF_MASK 0xffff /* Mask for format specific flags */ -#define DQF_INFO_DIRTY_B 16 +#define DQF_GETINFO_MASK 0x1ffff /* Mask for flags passed to userspace */ +#define DQF_SETINFO_MASK 0xffff /* Mask for flags modifiable from userspace */ +#define DQF_SYS_FILE_B 16 +#define DQF_SYS_FILE (1 << DQF_SYS_FILE_B) /* Quota file stored as system file */ +#define DQF_INFO_DIRTY_B 31 #define DQF_INFO_DIRTY (1 << DQF_INFO_DIRTY_B) /* Is info dirty? */ extern void mark_info_dirty(struct super_block *sb, int type);