From e1a86dbbbd6a77f73c3d099030495fa31f181e2f Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Tue, 14 Jul 2020 16:10:26 -0700 Subject: [PATCH] md: fix deadlock causing by sysfs_notify The following deadlock was captured. The first process is holding 'kernfs_mutex' and hung by io. The io was staging in 'r1conf.pending_bio_list' of raid1 device, this pending bio list would be flushed by second process 'md127_raid1', but it was hung by 'kernfs_mutex'. Using sysfs_notify_dirent_safe() to replace sysfs_notify() can fix it. There were other sysfs_notify() invoked from io path, removed all of them. PID: 40430 TASK: ffff8ee9c8c65c40 CPU: 29 COMMAND: "probe_file" #0 [ffffb87c4df37260] __schedule at ffffffff9a8678ec #1 [ffffb87c4df372f8] schedule at ffffffff9a867f06 #2 [ffffb87c4df37310] io_schedule at ffffffff9a0c73e6 #3 [ffffb87c4df37328] __dta___xfs_iunpin_wait_3443 at ffffffffc03a4057 [xfs] #4 [ffffb87c4df373a0] xfs_iunpin_wait at ffffffffc03a6c79 [xfs] #5 [ffffb87c4df373b0] __dta_xfs_reclaim_inode_3357 at ffffffffc039a46c [xfs] #6 [ffffb87c4df37400] xfs_reclaim_inodes_ag at ffffffffc039a8b6 [xfs] #7 [ffffb87c4df37590] xfs_reclaim_inodes_nr at ffffffffc039bb33 [xfs] #8 [ffffb87c4df375b0] xfs_fs_free_cached_objects at ffffffffc03af0e9 [xfs] #9 [ffffb87c4df375c0] super_cache_scan at ffffffff9a287ec7 #10 [ffffb87c4df37618] shrink_slab at ffffffff9a1efd93 #11 [ffffb87c4df37700] shrink_node at ffffffff9a1f5968 #12 [ffffb87c4df37788] do_try_to_free_pages at ffffffff9a1f5ea2 #13 [ffffb87c4df377f0] try_to_free_mem_cgroup_pages at ffffffff9a1f6445 #14 [ffffb87c4df37880] try_charge at ffffffff9a26cc5f #15 [ffffb87c4df37920] memcg_kmem_charge_memcg at ffffffff9a270f6a #16 [ffffb87c4df37958] new_slab at ffffffff9a251430 #17 [ffffb87c4df379c0] ___slab_alloc at ffffffff9a251c85 #18 [ffffb87c4df37a80] __slab_alloc at ffffffff9a25635d #19 [ffffb87c4df37ac0] kmem_cache_alloc at ffffffff9a251f89 #20 [ffffb87c4df37b00] alloc_inode at ffffffff9a2a2b10 #21 [ffffb87c4df37b20] iget_locked at ffffffff9a2a4854 #22 [ffffb87c4df37b60] kernfs_get_inode at ffffffff9a311377 #23 [ffffb87c4df37b80] kernfs_iop_lookup at ffffffff9a311e2b #24 [ffffb87c4df37ba8] lookup_slow at ffffffff9a290118 #25 [ffffb87c4df37c10] walk_component at ffffffff9a291e83 #26 [ffffb87c4df37c78] path_lookupat at ffffffff9a293619 #27 [ffffb87c4df37cd8] filename_lookup at ffffffff9a2953af #28 [ffffb87c4df37de8] user_path_at_empty at ffffffff9a295566 #29 [ffffb87c4df37e10] vfs_statx at ffffffff9a289787 #30 [ffffb87c4df37e70] SYSC_newlstat at ffffffff9a289d5d #31 [ffffb87c4df37f18] sys_newlstat at ffffffff9a28a60e #32 [ffffb87c4df37f28] do_syscall_64 at ffffffff9a003949 #33 [ffffb87c4df37f50] entry_SYSCALL_64_after_hwframe at ffffffff9aa001ad RIP: 00007f617a5f2905 RSP: 00007f607334f838 RFLAGS: 00000246 RAX: ffffffffffffffda RBX: 00007f6064044b20 RCX: 00007f617a5f2905 RDX: 00007f6064044b20 RSI: 00007f6064044b20 RDI: 00007f6064005890 RBP: 00007f6064044aa0 R8: 0000000000000030 R9: 000000000000011c R10: 0000000000000013 R11: 0000000000000246 R12: 00007f606417e6d0 R13: 00007f6064044aa0 R14: 00007f6064044b10 R15: 00000000ffffffff ORIG_RAX: 0000000000000006 CS: 0033 SS: 002b PID: 927 TASK: ffff8f15ac5dbd80 CPU: 42 COMMAND: "md127_raid1" #0 [ffffb87c4df07b28] __schedule at ffffffff9a8678ec #1 [ffffb87c4df07bc0] schedule at ffffffff9a867f06 #2 [ffffb87c4df07bd8] schedule_preempt_disabled at ffffffff9a86825e #3 [ffffb87c4df07be8] __mutex_lock at ffffffff9a869bcc #4 [ffffb87c4df07ca0] __mutex_lock_slowpath at ffffffff9a86a013 #5 [ffffb87c4df07cb0] mutex_lock at ffffffff9a86a04f #6 [ffffb87c4df07cc8] kernfs_find_and_get_ns at ffffffff9a311d83 #7 [ffffb87c4df07cf0] sysfs_notify at ffffffff9a314b3a #8 [ffffb87c4df07d18] md_update_sb at ffffffff9a688696 #9 [ffffb87c4df07d98] md_update_sb at ffffffff9a6886d5 #10 [ffffb87c4df07da8] md_check_recovery at ffffffff9a68ad9c #11 [ffffb87c4df07dd0] raid1d at ffffffffc01f0375 [raid1] #12 [ffffb87c4df07ea0] md_thread at ffffffff9a680348 #13 [ffffb87c4df07f08] kthread at ffffffff9a0b8005 #14 [ffffb87c4df07f50] ret_from_fork at ffffffff9aa00344 Signed-off-by: Junxiao Bi Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 2 +- drivers/md/md.c | 44 ++++++++++++++++++++++++++++-------------- drivers/md/md.h | 8 +++++++- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 6 +++--- 5 files changed, 42 insertions(+), 20 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 95a5f3757fa3..d61b524ae440 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1631,7 +1631,7 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) s += blocks; } bitmap->last_end_sync = jiffies; - sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed); } EXPORT_SYMBOL(md_bitmap_cond_end_sync); diff --git a/drivers/md/md.c b/drivers/md/md.c index 07e5b67a2c48..fc33f2f8a415 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2472,6 +2472,10 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) if (sysfs_create_link(&rdev->kobj, ko, "block")) /* failure here is OK */; rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); + rdev->sysfs_unack_badblocks = + sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); + rdev->sysfs_badblocks = + sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); list_add_rcu(&rdev->same_set, &mddev->disks); bd_link_disk_holder(rdev->bdev, mddev->gendisk); @@ -2505,7 +2509,11 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); + sysfs_put(rdev->sysfs_unack_badblocks); + sysfs_put(rdev->sysfs_badblocks); rdev->sysfs_state = NULL; + rdev->sysfs_unack_badblocks = NULL; + rdev->sysfs_badblocks = NULL; rdev->badblocks.count = 0; /* We need to delay this, otherwise we can deadlock when * writing to 'remove' to "dev/state". We also need @@ -2850,7 +2858,7 @@ rewrite: goto repeat; wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); rdev_for_each(rdev, mddev) { if (test_and_clear_bit(FaultRecorded, &rdev->flags)) @@ -4103,7 +4111,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev_resume(mddev); if (!mddev->thread) md_update_sb(mddev, 1); - sysfs_notify(&mddev->kobj, NULL, "level"); + sysfs_notify_dirent_safe(mddev->sysfs_level); md_new_event(mddev); rv = len; out_unlock: @@ -4856,7 +4864,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) } if (err) return err; - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); } else { if (cmd_match(page, "check")) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); @@ -5562,6 +5570,13 @@ static void md_free(struct kobject *ko) if (mddev->sysfs_state) sysfs_put(mddev->sysfs_state); + if (mddev->sysfs_completed) + sysfs_put(mddev->sysfs_completed); + if (mddev->sysfs_degraded) + sysfs_put(mddev->sysfs_degraded); + if (mddev->sysfs_level) + sysfs_put(mddev->sysfs_level); + if (mddev->gendisk) del_gendisk(mddev->gendisk); @@ -5729,6 +5744,9 @@ static int md_alloc(dev_t dev, char *name) if (!error && mddev->kobj.sd) { kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); + mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); + mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); + mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); } mddev_put(mddev); return error; @@ -6083,7 +6101,7 @@ static int do_md_run(struct mddev *mddev) kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); sysfs_notify_dirent_safe(mddev->sysfs_state); sysfs_notify_dirent_safe(mddev->sysfs_action); - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); out: clear_bit(MD_NOT_READY, &mddev->flags); return err; @@ -8802,7 +8820,7 @@ void md_do_sync(struct md_thread *thread) } else mddev->curr_resync = 3; /* no longer delayed */ mddev->curr_resync_completed = j; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); md_new_event(mddev); update_time = jiffies; @@ -8830,7 +8848,7 @@ void md_do_sync(struct md_thread *thread) mddev->recovery_cp = j; update_time = jiffies; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } while (j >= mddev->resync_max && @@ -8937,7 +8955,7 @@ void md_do_sync(struct md_thread *thread) !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev->curr_resync > 3) { mddev->curr_resync_completed = mddev->curr_resync; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } mddev->pers->sync_request(mddev, max_sectors, &skipped); @@ -9067,7 +9085,7 @@ static int remove_and_add_spares(struct mddev *mddev, } if (removed && mddev->kobj.sd) - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); if (this && removed) goto no_add; @@ -9350,8 +9368,7 @@ void md_reap_sync_thread(struct mddev *mddev) /* success...*/ /* activate any spares */ if (mddev->pers->spare_active(mddev)) { - sysfs_notify(&mddev->kobj, NULL, - "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); } } @@ -9441,8 +9458,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, if (rv == 0) { /* Make sure they get written out promptly */ if (test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify(&rdev->kobj, NULL, - "unacknowledged_bad_blocks"); + sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); sysfs_notify_dirent_safe(rdev->sysfs_state); set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); @@ -9463,7 +9479,7 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->data_offset; rv = badblocks_clear(&rdev->badblocks, s, sectors); if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify(&rdev->kobj, NULL, "bad_blocks"); + sysfs_notify_dirent_safe(rdev->sysfs_badblocks); return rv; } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); @@ -9693,7 +9709,7 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) if (rdev->recovery_offset == MaxSector && !test_bit(In_sync, &rdev->flags) && mddev->pers->spare_active(mddev)) - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); put_page(swapout); return 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index c26fa8bd41e7..1e172b3e499f 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -126,7 +126,10 @@ struct md_rdev { struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ - + /* handle for 'unacknowledged_bad_blocks' sysfs dentry */ + struct kernfs_node *sysfs_unack_badblocks; + /* handle for 'bad_blocks' sysfs dentry */ + struct kernfs_node *sysfs_badblocks; struct badblocks badblocks; struct { @@ -420,6 +423,9 @@ struct mddev { * file in sysfs. */ struct kernfs_node *sysfs_action; /* handle for 'sync_action' */ + struct kernfs_node *sysfs_completed; /*handle for 'sync_completed' */ + struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */ + struct kernfs_node *sysfs_level; /*handle for 'level' */ struct work_struct del_work; /* used for delayed sysfs removal */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e45fd56cf584..2c47474fa69d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4454,7 +4454,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, sector_nr = conf->reshape_progress; if (sector_nr) { mddev->curr_resync_completed = sector_nr; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); *skipped = 1; return sector_nr; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8dea4398b191..a4fbafa5b8f8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5799,7 +5799,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sector_div(sector_nr, new_data_disks); if (sector_nr) { mddev->curr_resync_completed = sector_nr; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); *skipped = 1; retn = sector_nr; goto finish; @@ -5913,7 +5913,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } INIT_LIST_HEAD(&stripes); @@ -6020,7 +6020,7 @@ finish: conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } ret: return retn;