alistair23-linux/fs/btrfs/tree-log.h
Filipe Manana 28a235931b Btrfs: fix lockdep warning on deadlock against an inode's log mutex
Commit 44f714dae5 ("Btrfs: improve performance on fsync against new
inode after rename/unlink"), which landed in 4.8-rc2, introduced a
possibility for a deadlock due to double locking of an inode's log mutex
by the same task, which lockdep reports with:

[23045.433975] =============================================
[23045.434748] [ INFO: possible recursive locking detected ]
[23045.435426] 4.7.0-rc6-btrfs-next-34+ #1 Not tainted
[23045.436044] ---------------------------------------------
[23045.436044] xfs_io/3688 is trying to acquire lock:
[23045.436044]  (&ei->log_mutex){+.+...}, at: [<ffffffffa038552d>] btrfs_log_inode+0x13a/0xc95 [btrfs]
[23045.436044]
               but task is already holding lock:
[23045.436044]  (&ei->log_mutex){+.+...}, at: [<ffffffffa038552d>] btrfs_log_inode+0x13a/0xc95 [btrfs]
[23045.436044]
               other info that might help us debug this:
[23045.436044]  Possible unsafe locking scenario:

[23045.436044]        CPU0
[23045.436044]        ----
[23045.436044]   lock(&ei->log_mutex);
[23045.436044]   lock(&ei->log_mutex);
[23045.436044]
                *** DEADLOCK ***

[23045.436044]  May be due to missing lock nesting notation

[23045.436044] 3 locks held by xfs_io/3688:
[23045.436044]  #0:  (&sb->s_type->i_mutex_key#15){+.+...}, at: [<ffffffffa035f2ae>] btrfs_sync_file+0x14e/0x425 [btrfs]
[23045.436044]  #1:  (sb_internal#2){.+.+.+}, at: [<ffffffff8118446b>] __sb_start_write+0x5f/0xb0
[23045.436044]  #2:  (&ei->log_mutex){+.+...}, at: [<ffffffffa038552d>] btrfs_log_inode+0x13a/0xc95 [btrfs]
[23045.436044]
               stack backtrace:
[23045.436044] CPU: 4 PID: 3688 Comm: xfs_io Not tainted 4.7.0-rc6-btrfs-next-34+ #1
[23045.436044] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014
[23045.436044]  0000000000000000 ffff88022f5f7860 ffffffff8127074d ffffffff82a54b70
[23045.436044]  ffffffff82a54b70 ffff88022f5f7920 ffffffff81092897 ffff880228015d68
[23045.436044]  0000000000000000 ffffffff82a54b70 ffffffff829c3f00 ffff880228015d68
[23045.436044] Call Trace:
[23045.436044]  [<ffffffff8127074d>] dump_stack+0x67/0x90
[23045.436044]  [<ffffffff81092897>] __lock_acquire+0xcbb/0xe4e
[23045.436044]  [<ffffffff8109155f>] ? mark_lock+0x24/0x201
[23045.436044]  [<ffffffff8109179a>] ? mark_held_locks+0x5e/0x74
[23045.436044]  [<ffffffff81092de0>] lock_acquire+0x12f/0x1c3
[23045.436044]  [<ffffffff81092de0>] ? lock_acquire+0x12f/0x1c3
[23045.436044]  [<ffffffffa038552d>] ? btrfs_log_inode+0x13a/0xc95 [btrfs]
[23045.436044]  [<ffffffffa038552d>] ? btrfs_log_inode+0x13a/0xc95 [btrfs]
[23045.436044]  [<ffffffff814a51a4>] mutex_lock_nested+0x77/0x3a7
[23045.436044]  [<ffffffffa038552d>] ? btrfs_log_inode+0x13a/0xc95 [btrfs]
[23045.436044]  [<ffffffffa039705e>] ? btrfs_release_delayed_node+0xb/0xd [btrfs]
[23045.436044]  [<ffffffffa038552d>] btrfs_log_inode+0x13a/0xc95 [btrfs]
[23045.436044]  [<ffffffffa038552d>] ? btrfs_log_inode+0x13a/0xc95 [btrfs]
[23045.436044]  [<ffffffff810a0ed1>] ? vprintk_emit+0x453/0x465
[23045.436044]  [<ffffffffa0385a61>] btrfs_log_inode+0x66e/0xc95 [btrfs]
[23045.436044]  [<ffffffffa03c084d>] log_new_dir_dentries+0x26c/0x359 [btrfs]
[23045.436044]  [<ffffffffa03865aa>] btrfs_log_inode_parent+0x4a6/0x628 [btrfs]
[23045.436044]  [<ffffffffa0387552>] btrfs_log_dentry_safe+0x5a/0x75 [btrfs]
[23045.436044]  [<ffffffffa035f464>] btrfs_sync_file+0x304/0x425 [btrfs]
[23045.436044]  [<ffffffff811acaf4>] vfs_fsync_range+0x8c/0x9e
[23045.436044]  [<ffffffff811acb22>] vfs_fsync+0x1c/0x1e
[23045.436044]  [<ffffffff811acc79>] do_fsync+0x31/0x4a
[23045.436044]  [<ffffffff811ace99>] SyS_fsync+0x10/0x14
[23045.436044]  [<ffffffff814a88e5>] entry_SYSCALL_64_fastpath+0x18/0xa8
[23045.436044]  [<ffffffff8108f039>] ? trace_hardirqs_off_caller+0x3f/0xaa

An example reproducer for this is:

   $ mkfs.btrfs -f /dev/sdb
   $ mount /dev/sdb /mnt
   $ mkdir /mnt/dir
   $ touch /mnt/dir/foo
   $ sync
   $ mv /mnt/dir/foo /mnt/dir/bar
   $ touch /mnt/dir/foo
   $ xfs_io -c "fsync" /mnt/dir/bar

This is because while logging the inode of file bar we end up logging its
parent directory (since its inode has an unlink_trans field matching the
current transaction id due to the rename operation), which in turn logs
the inodes for all its new dentries, so that the new inode for the new
file named foo gets logged which in turn triggered another logging attempt
for the inode we are fsync'ing, since that inode had an old name that
corresponds to the name of the new inode.

So fix this by ensuring that when logging the inode for a new dentry that
has a name matching an old name of some other inode, we don't log again
the original inode that we are fsync'ing.

Fixes: 44f714dae5 ("Btrfs: improve performance on fsync against new inode after rename/unlink")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-25 03:58:32 -07:00

91 lines
3 KiB
C

/*
* Copyright (C) 2008 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef __TREE_LOG_
#define __TREE_LOG_
#include "ctree.h"
#include "transaction.h"
/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
#define BTRFS_NO_LOG_SYNC 256
struct btrfs_log_ctx {
int log_ret;
int log_transid;
int io_err;
bool log_new_dentries;
struct inode *inode;
struct list_head list;
};
static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
struct inode *inode)
{
ctx->log_ret = 0;
ctx->log_transid = 0;
ctx->io_err = 0;
ctx->log_new_dentries = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
}
static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans)
{
ACCESS_ONCE(fs_info->last_trans_log_full_commit) = trans->transid;
}
static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans)
{
return ACCESS_ONCE(fs_info->last_trans_log_full_commit) ==
trans->transid;
}
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_log_ctx *ctx);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry,
const loff_t start,
const loff_t end,
struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
struct inode *dir, u64 index);
int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
struct inode *inode, u64 dirid);
void btrfs_end_log_trans(struct btrfs_root *root);
int btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct inode *dir, struct inode *inode,
int for_rename);
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct inode *dir);
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *old_dir,
struct dentry *parent);
#endif