diff --git a/Documentation/md/md-cluster.txt b/Documentation/md/md-cluster.txt index 82ee51604e9a..e1055f105cf5 100644 --- a/Documentation/md/md-cluster.txt +++ b/Documentation/md/md-cluster.txt @@ -1,4 +1,5 @@ -The cluster MD is a shared-device RAID for a cluster. +The cluster MD is a shared-device RAID for a cluster. It supports +two levels: raid1 and raid10 (limited support). 1. On-disk format diff --git a/MAINTAINERS b/MAINTAINERS index ba3d8c197d92..8604cf64a169 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4103,6 +4103,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git T: quilt http://people.redhat.com/agk/patches/linux/editing/ S: Maintained F: Documentation/device-mapper/ +F: drivers/md/Makefile +F: drivers/md/Kconfig F: drivers/md/dm* F: drivers/md/persistent-data/ F: include/linux/device-mapper.h @@ -12487,7 +12489,10 @@ M: Shaohua Li L: linux-raid@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/shli/md.git S: Supported -F: drivers/md/ +F: drivers/md/Makefile +F: drivers/md/Kconfig +F: drivers/md/md* +F: drivers/md/raid* F: include/linux/raid/ F: include/uapi/linux/raid/ diff --git a/block/badblocks.c b/block/badblocks.c index 43c71166e1e2..91f7bcf979d3 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -178,7 +178,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors, if (bb->shift < 0) /* badblocks are disabled */ - return 0; + return 1; if (bb->shift) { /* round the start down, and the end up */ diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 4a249ee86364..83b9362be09c 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -178,7 +178,7 @@ config MD_FAULTY config MD_CLUSTER - tristate "Cluster Support for MD (EXPERIMENTAL)" + tristate "Cluster Support for MD" depends on BLK_DEV_MD depends on DLM default n @@ -188,7 +188,8 @@ config MD_CLUSTER nodes in the cluster can access the MD devices simultaneously. This brings the redundancy (and uptime) of RAID levels across the - nodes of the cluster. + nodes of the cluster. Currently, it can work with raid1 and raid10 + (limited support). If unsure, say N. diff --git a/drivers/md/Makefile b/drivers/md/Makefile index e94b6f9be941..f701bb211783 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -19,9 +19,12 @@ dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ dm-cache-smq-y += dm-cache-policy-smq.o dm-era-y += dm-era-target.o dm-verity-y += dm-verity-target.o -md-mod-y += md.o bitmap.o +md-mod-y += md.o md-bitmap.o raid456-y += raid5.o raid5-cache.o raid5-ppl.o dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o +linear-y += md-linear.o +multipath-y += md-multipath.o +faulty-y += md-faulty.o # Note: link order is important.
All raid personalities # and must come before md.o, as they each initialise diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index a25eebd98996..366c625b9591 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -12,7 +12,7 @@ #include "raid1.h" #include "raid5.h" #include "raid10.h" -#include "bitmap.h" +#include "md-bitmap.h" #include @@ -3630,8 +3630,11 @@ static void raid_postsuspend(struct dm_target *ti) { struct raid_set *rs = ti->private; - if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) + if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + mddev_lock_nointr(&rs->md); mddev_suspend(&rs->md); + mddev_unlock(&rs->md); + } rs->md.ro = 1; } @@ -3888,8 +3891,11 @@ static void raid_resume(struct dm_target *ti) if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) + if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + mddev_lock_nointr(mddev); mddev_resume(mddev); + mddev_unlock(mddev); + } } static struct target_type raid_target = { diff --git a/drivers/md/bitmap.c b/drivers/md/md-bitmap.c similarity index 99% rename from drivers/md/bitmap.c rename to drivers/md/md-bitmap.c index 4d8ed74efadf..239c7bb3929b 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/md-bitmap.c @@ -29,7 +29,7 @@ #include #include #include "md.h" -#include "bitmap.h" +#include "md-bitmap.h" static inline char *bmname(struct bitmap *bitmap) { @@ -459,7 +459,11 @@ void bitmap_update_sb(struct bitmap *bitmap) /* rocking back to read-only */ bitmap->events_cleared = bitmap->mddev->events; sb->events_cleared = cpu_to_le64(bitmap->events_cleared); - sb->state = cpu_to_le32(bitmap->flags); + /* + * clear BITMAP_WRITE_ERROR bit to protect against the case that + * a bitmap write error occurred but the later writes succeeded. 
+ */ + sb->state = cpu_to_le32(bitmap->flags & ~BIT(BITMAP_WRITE_ERROR)); /* Just in case these have been changed via sysfs: */ sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); @@ -625,7 +629,7 @@ re_read: err = read_sb_page(bitmap->mddev, offset, sb_page, - 0, PAGE_SIZE); + 0, sizeof(bitmap_super_t)); } if (err) return err; @@ -1816,6 +1820,12 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot) BUG_ON(file && mddev->bitmap_info.offset); + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + pr_notice("md/raid:%s: array with journal cannot have bitmap\n", + mdname(mddev)); + return ERR_PTR(-EBUSY); + } + bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) return ERR_PTR(-ENOMEM); @@ -2123,7 +2133,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, if (store.sb_page && bitmap->storage.sb_page) memcpy(page_address(store.sb_page), page_address(bitmap->storage.sb_page), - PAGE_SIZE); + sizeof(bitmap_super_t)); bitmap_file_unmap(&bitmap->storage); bitmap->storage = store; @@ -2152,6 +2162,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, for (k = 0; k < page; k++) { kfree(new_bp[k].map); } + kfree(new_bp); /* restore some fields from old_counts */ bitmap->counts.bp = old_counts.bp; @@ -2202,6 +2213,14 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, block += old_blocks; } + if (bitmap->counts.bp != old_counts.bp) { + unsigned long k; + for (k = 0; k < old_counts.pages; k++) + if (!old_counts.bp[k].hijacked) + kfree(old_counts.bp[k].map); + kfree(old_counts.bp); + } + if (!init) { int i; while (block < (chunks << chunkshift)) { diff --git a/drivers/md/bitmap.h b/drivers/md/md-bitmap.h similarity index 100% rename from drivers/md/bitmap.h rename to drivers/md/md-bitmap.h diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 03082e17c65c..79bfbc840385 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -15,7 +15,7 @@ #include #include #include "md.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "md-cluster.h" #define LVB_SIZE 64 @@ -442,10 +442,11 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) static void remove_suspend_info(struct mddev *mddev, int slot) { struct md_cluster_info *cinfo = mddev->cluster_info; + mddev->pers->quiesce(mddev, 1); spin_lock_irq(&cinfo->suspend_lock); __remove_suspend_info(cinfo, slot); spin_unlock_irq(&cinfo->suspend_lock); - mddev->pers->quiesce(mddev, 2); + mddev->pers->quiesce(mddev, 0); } @@ -492,13 +493,12 @@ static void process_suspend_info(struct mddev *mddev, s->lo = lo; s->hi = hi; mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); spin_lock_irq(&cinfo->suspend_lock); /* Remove existing entry (if exists) before adding */ __remove_suspend_info(cinfo, slot); list_add(&s->list, &cinfo->suspend_list); spin_unlock_irq(&cinfo->suspend_lock); - mddev->pers->quiesce(mddev, 2); + mddev->pers->quiesce(mddev, 0); } static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) @@ -1094,7 +1094,7 @@ static void metadata_update_cancel(struct mddev *mddev) /* * return 0 if all the bitmaps have the same sync_size */ -int cluster_check_sync_size(struct mddev *mddev) +static int cluster_check_sync_size(struct mddev *mddev) { int i, rv; bitmap_super_t *sb; @@ -1478,7 +1478,7 @@ static struct md_cluster_operations cluster_ops = { static int __init cluster_init(void) { - pr_warn("md-cluster: EXPERIMENTAL. 
Use with caution\n"); + pr_warn("md-cluster: supports raid1 and raid10 (limited support)\n"); pr_info("Registering Cluster MD functions\n"); register_md_cluster_operations(&cluster_ops, THIS_MODULE); return 0; diff --git a/drivers/md/faulty.c b/drivers/md/md-faulty.c similarity index 100% rename from drivers/md/faulty.c rename to drivers/md/md-faulty.c diff --git a/drivers/md/linear.c b/drivers/md/md-linear.c similarity index 99% rename from drivers/md/linear.c rename to drivers/md/md-linear.c index c464fb48039a..773fc70dced7 100644 --- a/drivers/md/linear.c +++ b/drivers/md/md-linear.c @@ -23,7 +23,7 @@ #include #include #include "md.h" -#include "linear.h" +#include "md-linear.h" /* * find which device holds a particular offset diff --git a/drivers/md/linear.h b/drivers/md/md-linear.h similarity index 100% rename from drivers/md/linear.h rename to drivers/md/md-linear.h diff --git a/drivers/md/multipath.c b/drivers/md/md-multipath.c similarity index 99% rename from drivers/md/multipath.c rename to drivers/md/md-multipath.c index b68e0666b9b0..e40065bdbfc8 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/md-multipath.c @@ -25,7 +25,7 @@ #include #include #include "md.h" -#include "multipath.h" +#include "md-multipath.h" #define MAX_WORK_PER_DISK 128 @@ -243,7 +243,6 @@ static void print_multipath_conf (struct mpconf *conf) static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct mpconf *conf = mddev->private; - struct request_queue *q; int err = -EEXIST; int path; struct multipath_info *p; @@ -257,7 +256,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) for (path = first; path <= last; path++) if ((p=conf->multipaths+path)->rdev == NULL) { - q = rdev->bdev->bd_disk->queue; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); diff --git a/drivers/md/multipath.h b/drivers/md/md-multipath.h similarity index 100% rename from drivers/md/multipath.h rename to drivers/md/md-multipath.h diff --git a/drivers/md/md.c b/drivers/md/md.c index 447ddcbc9566..09c3af3dcdca 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -69,7 +69,7 @@ #include #include "md.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "md-cluster.h" #ifndef MODULE @@ -266,16 +266,31 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more.
*/ +static bool is_suspended(struct mddev *mddev, struct bio *bio) +{ + if (mddev->suspended) + return true; + if (bio_data_dir(bio) != WRITE) + return false; + if (mddev->suspend_lo >= mddev->suspend_hi) + return false; + if (bio->bi_iter.bi_sector >= mddev->suspend_hi) + return false; + if (bio_end_sector(bio) < mddev->suspend_lo) + return false; + return true; +} + void md_handle_request(struct mddev *mddev, struct bio *bio) { check_suspended: rcu_read_lock(); - if (mddev->suspended) { + if (is_suspended(mddev, bio)) { DEFINE_WAIT(__wait); for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, TASK_UNINTERRUPTIBLE); - if (!mddev->suspended) + if (!is_suspended(mddev, bio)) break; rcu_read_unlock(); schedule(); @@ -344,12 +359,17 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) void mddev_suspend(struct mddev *mddev) { WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); + lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->suspended++) return; synchronize_rcu(); wake_up(&mddev->sb_wait); + set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); + smp_mb__after_atomic(); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); + clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); + wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); del_timer_sync(&mddev->safemode_timer); } @@ -357,6 +377,7 @@ EXPORT_SYMBOL_GPL(mddev_suspend); void mddev_resume(struct mddev *mddev) { + lockdep_assert_held(&mddev->reconfig_mutex); if (--mddev->suspended) return; wake_up(&mddev->sb_wait); @@ -663,6 +684,7 @@ void mddev_unlock(struct mddev *mddev) */ spin_lock(&pers_lock); md_wakeup_thread(mddev->thread); + wake_up(&mddev->sb_wait); spin_unlock(&pers_lock); } EXPORT_SYMBOL_GPL(mddev_unlock); @@ -2313,7 +2335,7 @@ static void export_array(struct mddev *mddev) static bool set_in_sync(struct mddev *mddev) { - WARN_ON_ONCE(NR_CPUS != 1 && !spin_is_locked(&mddev->lock)); + lockdep_assert_held(&mddev->lock); if (!mddev->in_sync) { mddev->sync_checkers++; spin_unlock(&mddev->lock); @@ -2432,10 +2454,18 @@ repeat: } } - /* First make sure individual recovery_offsets are correct */ + /* + * First make sure individual recovery_offsets are correct + * curr_resync_completed can only be used during recovery. + * During reshape/resync it might use array-addresses rather + * than device addresses.
+ */ rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) @@ -4824,7 +4854,7 @@ suspend_lo_show(struct mddev *mddev, char *page) static ssize_t suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { - unsigned long long old, new; + unsigned long long new; int err; err = kstrtoull(buf, 10, &new); @@ -4840,16 +4870,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers == NULL || mddev->pers->quiesce == NULL) goto unlock; - old = mddev->suspend_lo; + mddev_suspend(mddev); mddev->suspend_lo = new; - if (new >= old) - /* Shrinking suspended region */ - mddev->pers->quiesce(mddev, 2); - else { - /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - } + mddev_resume(mddev); + err = 0; unlock: mddev_unlock(mddev); @@ -4867,7 +4891,7 @@ suspend_hi_show(struct mddev *mddev, char *page) static ssize_t suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { - unsigned long long old, new; + unsigned long long new; int err; err = kstrtoull(buf, 10, &new); @@ -4880,19 +4904,13 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; err = -EINVAL; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) + if (mddev->pers == NULL) goto unlock; - old = mddev->suspend_hi; + + mddev_suspend(mddev); mddev->suspend_hi = new; - if (new <= old) - /* Shrinking suspended region */ - mddev->pers->quiesce(mddev, 2); - else { - /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - } + mddev_resume(mddev); + err = 0; unlock: mddev_unlock(mddev); @@ -5834,8 +5852,14 @@ void md_stop(struct mddev *mddev) * This is called from dm-raid */ __md_stop(mddev); - if (mddev->bio_set) + if (mddev->bio_set) { bioset_free(mddev->bio_set); + mddev->bio_set = NULL; + } + if (mddev->sync_set) { + bioset_free(mddev->sync_set); + mddev->sync_set = NULL; + } } EXPORT_SYMBOL_GPL(md_stop); @@ -6362,7 +6386,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) break; } } - if (has_journal) { + if (has_journal || mddev->bitmap) { export_rdev(rdev); return -EBUSY; } @@ -6618,22 +6642,26 @@ static int set_bitmap_file(struct mddev *mddev, int fd) return -ENOENT; /* cannot remove what isn't there */ err = 0; if (mddev->pers) { - mddev->pers->quiesce(mddev, 1); if (fd >= 0) { struct bitmap *bitmap; bitmap = bitmap_create(mddev, -1); + mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = bitmap_load(mddev); } else err = PTR_ERR(bitmap); - } - if (fd < 0 || err) { + if (err) { + bitmap_destroy(mddev); + fd = -1; + } + mddev_resume(mddev); + } else if (fd < 0) { + mddev_suspend(mddev); bitmap_destroy(mddev); - fd = -1; /* make sure to put the file */ + mddev_resume(mddev); } - mddev->pers->quiesce(mddev, 0); } if (fd < 0) { struct file *f = mddev->bitmap_info.file; @@ -6735,7 +6763,7 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) { - WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); + lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->external_size) return; 
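[Editor's note: the following is an illustration, not part of the patch. The suspend_lo/suspend_hi hunks above move range-based suspension out of the personalities and into md core: the sysfs stores now simply wrap the assignment in mddev_suspend()/mddev_resume(), and each incoming write is tested against the half-open range [suspend_lo, suspend_hi) by the new is_suspended() helper shown earlier. Below is a minimal userspace sketch of just that interval test; the harness, the sector values, and the name write_is_suspended() are invented for illustration.]

```c
/* suspend_range_model.c — userspace model of the is_suspended() range test */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Mirrors is_suspended() for a WRITE bio spanning [start, end), end
 * exclusive, tested against the suspended range [lo, hi). */
static bool write_is_suspended(sector_t lo, sector_t hi,
			       sector_t start, sector_t end)
{
	if (lo >= hi)          /* empty range: nothing is suspended */
		return false;
	if (start >= hi)       /* bio begins at or past the range */
		return false;
	if (end < lo)          /* bio ends below the range... */
		return false;  /* ...note: end == lo is still held */
	return true;
}

int main(void)
{
	static const struct { sector_t s, e; } bios[] = {
		{   0,  50 },  /* entirely below: passes */
		{   0, 100 },  /* ends exactly at lo: conservatively held */
		{ 150, 160 },  /* inside the range: held */
		{ 200, 300 },  /* starts at hi: passes */
	};
	const sector_t lo = 100, hi = 200;

	for (unsigned int i = 0; i < sizeof(bios) / sizeof(bios[0]); i++)
		printf("write [%llu, %llu): %s\n",
		       (unsigned long long)bios[i].s,
		       (unsigned long long)bios[i].e,
		       write_is_suspended(lo, hi, bios[i].s, bios[i].e) ?
		       "wait" : "pass");
	return 0;
}
```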
@@ -6917,8 +6945,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - mddev->pers->quiesce(mddev, 1); bitmap = bitmap_create(mddev, -1); + mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = bitmap_load(mddev); @@ -6926,7 +6954,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = PTR_ERR(bitmap); if (rv) bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } else { /* remove the bitmap */ if (!mddev->bitmap) { @@ -6949,9 +6977,9 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.nodes = 0; md_cluster_ops->leave(mddev); } - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); mddev->bitmap_info.offset = 0; } } @@ -7468,8 +7496,8 @@ void md_wakeup_thread(struct md_thread *thread) { if (thread) { pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); - if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags)) - wake_up(&thread->wqueue); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); } } EXPORT_SYMBOL(md_wakeup_thread); @@ -8039,7 +8067,8 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended); + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || + mddev->suspended); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { percpu_ref_put(&mddev->writes_pending); return false; @@ -8110,7 +8139,6 @@ void md_allow_write(struct mddev *mddev) sysfs_notify_dirent_safe(mddev->sysfs_state); /* wait for the dirty state to be recorded in the metadata */ wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) && !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } else spin_unlock(&mddev->lock); @@ -8477,16 +8505,19 @@ void md_do_sync(struct md_thread *thread) } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - mddev->delta_disks >= 0 && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < mddev->curr_resync) - rdev->recovery_offset = mddev->curr_resync; - rcu_read_unlock(); + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < mddev->curr_resync) + rdev->recovery_offset = mddev->curr_resync; + rcu_read_unlock(); + } } } skip: @@ -8813,6 +8844,16 @@ void md_check_recovery(struct mddev *mddev) unlock: wake_up(&mddev->sb_wait); mddev_unlock(mddev); + } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { + /* Write superblock - thread that called mddev_suspend() + * holds reconfig_mutex for us. 
+ */ + set_bit(MD_UPDATING_SB, &mddev->flags); + smp_mb__after_atomic(); + if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) + md_update_sb(mddev, 0); + clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); + wake_up(&mddev->sb_wait); } } EXPORT_SYMBOL(md_check_recovery); diff --git a/drivers/md/md.h b/drivers/md/md.h index d8287d3cd1bf..7d6bcf0eba0c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -237,6 +237,12 @@ enum mddev_flags { */ MD_HAS_PPL, /* The raid array has PPL feature set */ MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */ + MD_ALLOW_SB_UPDATE, /* md_check_recovery is allowed to update + * the metadata without taking reconfig_mutex. + */ + MD_UPDATING_SB, /* md_check_recovery is updating the metadata + * without explicitly holding reconfig_mutex. + */ }; enum mddev_sb_flags { @@ -494,11 +500,6 @@ static inline void mddev_lock_nointr(struct mddev *mddev) mutex_lock(&mddev->reconfig_mutex); } -static inline int mddev_is_locked(struct mddev *mddev) -{ - return mutex_is_locked(&mddev->reconfig_mutex); -} - static inline int mddev_trylock(struct mddev *mddev) { return mutex_trylock(&mddev->reconfig_mutex); @@ -538,12 +539,11 @@ struct md_personality int (*check_reshape) (struct mddev *mddev); int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); - /* quiesce moves between quiescence states - * 0 - fully active - * 1 - no new requests allowed - * others - reserved + /* quiesce suspends or resumes internal processing. + * 1 - stop new actions and wait for action io to complete + * 0 - return to normal behaviour */ - void (*quiesce) (struct mddev *mddev, int state); + void (*quiesce) (struct mddev *mddev, int quiesce); /* takeover is used to transition an array from one * personality to another. The new personality must be able * to handle the data in the current layout. 
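[Editor's note: the following is an illustration, not part of the patch. The new MD_ALLOW_SB_UPDATE / MD_UPDATING_SB flags in md.h form a two-flag handshake: mddev_suspend() opens a window in which md_check_recovery() may write the superblock without itself taking reconfig_mutex (the suspending thread holds it), then closes the window and waits until no update is still in flight. The updater re-checks MD_ALLOW_SB_UPDATE after raising MD_UPDATING_SB, so it can never complete an update that the suspender has not waited out. Below is a minimal userspace model of that protocol, assuming sequentially consistent C11 atomics in place of the kernel's smp_mb__after_atomic()/clear_bit_unlock() ordering; every name in the harness is invented.]

```c
/* sb_handshake_model.c — build with: cc -pthread sb_handshake_model.c */
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool allow_sb_update; /* models MD_ALLOW_SB_UPDATE */
static atomic_bool updating_sb;     /* models MD_UPDATING_SB */
static atomic_bool sb_dirty = true; /* models mddev->sb_flags being non-zero */

/* Models the new tail of md_check_recovery(): it may "write the
 * superblock" without the mutex while the suspend window is open. */
static void *recovery_thread(void *unused)
{
	(void)unused;
	for (int i = 0; i < 100000; i++) {
		if (atomic_load(&allow_sb_update) && atomic_load(&sb_dirty)) {
			atomic_store(&updating_sb, true);
			/* Re-check after raising updating_sb: the suspender
			 * may have closed the window in between. */
			if (atomic_load(&allow_sb_update))
				atomic_store(&sb_dirty, false); /* "md_update_sb()" */
			atomic_store(&updating_sb, false);      /* then wake sb_wait */
		}
		sched_yield();
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, recovery_thread, NULL);

	/* Models mddev_suspend(): open the window while I/O drains, then
	 * close it and wait out any update still in flight. */
	atomic_store(&allow_sb_update, true);
	sched_yield();                         /* "wait_event(active_io == 0)" */
	atomic_store(&allow_sb_update, false); /* clear_bit_unlock() */
	while (atomic_load(&updating_sb))      /* wait_event(!MD_UPDATING_SB) */
		sched_yield();
	/* Guarantee at this point: no superblock write is in progress,
	 * whether or not one happened while the window was open. */

	pthread_join(t, NULL);
	printf("suspend done, sb still dirty: %s\n",
	       atomic_load(&sb_dirty) ? "yes" : "no");
	return 0;
}
```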
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 5a00fc118470..5ecba9eef441 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -768,7 +768,7 @@ static void *raid0_takeover(struct mddev *mddev) return ERR_PTR(-EINVAL); } -static void raid0_quiesce(struct mddev *mddev, int state) +static void raid0_quiesce(struct mddev *mddev, int quiesce) { } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f3f3e40dc9d8..cc9d337a1ed3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -37,13 +37,12 @@ #include #include #include -#include #include #include "md.h" #include "raid1.h" -#include "bitmap.h" +#include "md-bitmap.h" #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_HAS_JOURNAL) | \ @@ -990,14 +989,6 @@ static void wait_barrier(struct r1conf *conf, sector_t sector_nr) _wait_barrier(conf, idx); } -static void wait_all_barriers(struct r1conf *conf) -{ - int idx; - - for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) - _wait_barrier(conf, idx); -} - static void _allow_barrier(struct r1conf *conf, int idx) { atomic_dec(&conf->nr_pending[idx]); @@ -1011,14 +1002,6 @@ static void allow_barrier(struct r1conf *conf, sector_t sector_nr) _allow_barrier(conf, idx); } -static void allow_all_barriers(struct r1conf *conf) -{ - int idx; - - for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) - _allow_barrier(conf, idx); -} - /* conf->resync_lock should be held */ static int get_unqueued_pending(struct r1conf *conf) { @@ -1303,42 +1286,28 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, int first_clone; int max_sectors; + if (mddev_is_clustered(mddev) && + md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, bio_end_sector(bio))) { + + DEFINE_WAIT(w); + for (;;) { + prepare_to_wait(&conf->wait_barrier, + &w, TASK_IDLE); + if (!md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, + bio_end_sector(bio))) + break; + schedule(); + } + finish_wait(&conf->wait_barrier, &w); + } + /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. * Continue immediately if no resync is active currently. */ - - - if ((bio_end_sector(bio) > mddev->suspend_lo && - bio->bi_iter.bi_sector < mddev->suspend_hi) || - (mddev_is_clustered(mddev) && - md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, bio_end_sector(bio)))) { - - /* - * As the suspend_* range is controlled by userspace, we want - * an interruptible wait. 
- */ - DEFINE_WAIT(w); - for (;;) { - sigset_t full, old; - prepare_to_wait(&conf->wait_barrier, - &w, TASK_INTERRUPTIBLE); - if (bio_end_sector(bio) <= mddev->suspend_lo || - bio->bi_iter.bi_sector >= mddev->suspend_hi || - (mddev_is_clustered(mddev) && - !md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, - bio_end_sector(bio)))) - break; - sigfillset(&full); - sigprocmask(SIG_BLOCK, &full, &old); - schedule(); - sigprocmask(SIG_SETMASK, &old, NULL); - } - finish_wait(&conf->wait_barrier, &w); - } wait_barrier(conf, bio->bi_iter.bi_sector); r1_bio = alloc_r1bio(mddev, bio); @@ -1654,8 +1623,12 @@ static void print_conf(struct r1conf *conf) static void close_sync(struct r1conf *conf) { - wait_all_barriers(conf); - allow_all_barriers(conf); + int idx; + + for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) { + _wait_barrier(conf, idx); + _allow_barrier(conf, idx); + } mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL; @@ -3277,21 +3250,14 @@ static int raid1_reshape(struct mddev *mddev) return 0; } -static void raid1_quiesce(struct mddev *mddev, int state) +static void raid1_quiesce(struct mddev *mddev, int quiesce) { struct r1conf *conf = mddev->private; - switch(state) { - case 2: /* wake for suspend */ - wake_up(&conf->wait_barrier); - break; - case 1: + if (quiesce) freeze_array(conf, 0); - break; - case 0: + else unfreeze_array(conf); - break; - } } static void *raid1_takeover(struct mddev *mddev) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 374df5796649..b9edbc747a95 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -29,7 +29,7 @@ #include "md.h" #include "raid10.h" #include "raid0.h" -#include "bitmap.h" +#include "md-bitmap.h" /* * RAID10 provides a combination of RAID0 and RAID1 functionality. @@ -136,10 +136,13 @@ static void r10bio_pool_free(void *r10_bio, void *data) kfree(r10_bio); } +#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) /* amount of memory to reserve for resync requests */ #define RESYNC_WINDOW (1024*1024) /* maximum number of concurrent requests, memory permitting */ #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) +#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) +#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) /* * When performing a resync, we need to read and compare, so @@ -383,12 +386,11 @@ static void raid10_end_read_request(struct bio *bio) { int uptodate = !bio->bi_status; struct r10bio *r10_bio = bio->bi_private; - int slot, dev; + int slot; struct md_rdev *rdev; struct r10conf *conf = r10_bio->mddev->private; slot = r10_bio->read_slot; - dev = r10_bio->devs[slot].devnum; rdev = r10_bio->devs[slot].rdev; /* * this branch is our 'one mirror IO has finished' event handler: @@ -748,7 +750,6 @@ static struct md_rdev *read_balance(struct r10conf *conf, raid10_find_phys(conf, r10_bio); rcu_read_lock(); - sectors = r10_bio->sectors; best_slot = -1; best_rdev = NULL; best_dist = MaxSector; @@ -761,8 +762,11 @@ static struct md_rdev *read_balance(struct r10conf *conf, * the resync window. We take the first readable disk when * above the resync window. 
*/ - if (conf->mddev->recovery_cp < MaxSector && (this_sector + sectors >= conf->next_resync)) + if ((conf->mddev->recovery_cp < MaxSector && (this_sector + sectors >= conf->next_resync)) || + (mddev_is_clustered(conf->mddev) && + md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, + this_sector + sectors))) do_balance = 0; for (slot = 0; slot < conf->copies ; slot++) { @@ -1293,6 +1297,22 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, sector_t sectors; int max_sectors; + if ((mddev_is_clustered(mddev) && + md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, + bio_end_sector(bio)))) { + DEFINE_WAIT(w); + for (;;) { + prepare_to_wait(&conf->wait_barrier, + &w, TASK_IDLE); + if (!md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, bio_end_sector(bio))) + break; + schedule(); + } + finish_wait(&conf->wait_barrier, &w); + } + /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. @@ -2575,7 +2595,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) struct bio *bio; struct r10conf *conf = mddev->private; struct md_rdev *rdev = r10_bio->devs[slot].rdev; - sector_t bio_last_sector; /* we got a read error. Maybe the drive is bad. Maybe just * the block and we can fix it. @@ -2586,7 +2605,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) * frozen. */ bio = r10_bio->devs[slot].bio; - bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors; bio_put(bio); r10_bio->devs[slot].bio = NULL; @@ -2825,6 +2843,43 @@ static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf) return r10bio; } +/* + * Set cluster_sync_high since we need other nodes to add the + * range [cluster_sync_low, cluster_sync_high] to the suspend list. + */ +static void raid10_set_cluster_sync_high(struct r10conf *conf) +{ + sector_t window_size; + int extra_chunk, chunks; + + /* + * First, define a "stripe" as a unit that crosses all member + * devices once, so the number of chunks in a stripe is + * raid_disks / near_copies. Sizing the window by raid_disks + * instead would make it grow linearly with the number of + * disks, suspending a much larger IO window than necessary. + * If raid_disks is not divisible by near_copies, an extra + * chunk is needed to ensure the whole "stripe" is covered. + */ + + chunks = conf->geo.raid_disks / conf->geo.near_copies; + if (conf->geo.raid_disks % conf->geo.near_copies == 0) + extra_chunk = 0; + else + extra_chunk = 1; + window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors; + + /* + * Use at least a 32M window to align with raid1's resync window + */ + window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ? + CLUSTER_RESYNC_WINDOW_SECTORS : window_size; + + conf->cluster_sync_high = conf->cluster_sync_low + window_size; +} + /* * perform a "sync" on one "block" * @@ -2897,6 +2952,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { + conf->cluster_sync_low = 0; + conf->cluster_sync_high = 0; + /* If we aborted, we need to abort the * sync on the 'current' bitmap chunks (there can * be several when recovering multiple devices). @@ -3251,7 +3309,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* resync.
Schedule a read for every block at this virt offset */ int count = 0; - bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0); + /* + * curr_resync_completed may not be updated in time, and + * cluster_sync_low will be set based on it. Check against + * "sector_nr + 2 * RESYNC_SECTORS" for safety; this ensures + * curr_resync_completed is updated in bitmap_cond_end_sync. + */ + bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev_is_clustered(mddev) && + (sector_nr + 2 * RESYNC_SECTORS > + conf->cluster_sync_high)); if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, mddev->degraded) && @@ -3385,6 +3453,52 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } while (++page_idx < RESYNC_PAGES); r10_bio->sectors = nr_sectors; + if (mddev_is_clustered(mddev) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* It is resync not recovery */ + if (conf->cluster_sync_high < sector_nr + nr_sectors) { + conf->cluster_sync_low = mddev->curr_resync_completed; + raid10_set_cluster_sync_high(conf); + /* Send resync message */ + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + } else if (mddev_is_clustered(mddev)) { + /* This is recovery not resync */ + sector_t sect_va1, sect_va2; + bool broadcast_msg = false; + + for (i = 0; i < conf->geo.raid_disks; i++) { + /* + * sector_nr is a device address for recovery, so we + * need to translate it to an array address before + * comparing it with cluster_sync_high. + */ + sect_va1 = raid10_find_virt(conf, sector_nr, i); + + if (conf->cluster_sync_high < sect_va1 + nr_sectors) { + broadcast_msg = true; + /* + * curr_resync_completed is similar to + * sector_nr, so translate it too. + */ + sect_va2 = raid10_find_virt(conf, + mddev->curr_resync_completed, i); + + if (conf->cluster_sync_low == 0 || + conf->cluster_sync_low > sect_va2) + conf->cluster_sync_low = sect_va2; + } + } + if (broadcast_msg) { + raid10_set_cluster_sync_high(conf); + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + } + while (biolist) { bio = biolist; biolist = biolist->bi_next; @@ -3644,6 +3758,18 @@ static int raid10_run(struct mddev *mddev) if (!conf) goto out; + if (mddev_is_clustered(conf->mddev)) { + int fc, fo; + + fc = (mddev->layout >> 8) & 255; + fo = mddev->layout & (1<<16); + if (fc > 1 || fo > 0) { + pr_err("only near layout is supported by clustered raid10\n"); + goto out; + } + } + mddev->thread = conf->thread; conf->thread = NULL; @@ -3832,18 +3958,14 @@ static void raid10_free(struct mddev *mddev, void *priv) kfree(conf); } -static void raid10_quiesce(struct mddev *mddev, int state) +static void raid10_quiesce(struct mddev *mddev, int quiesce) { struct r10conf *conf = mddev->private; - switch(state) { - case 1: + if (quiesce) raise_barrier(conf, 0); - break; - case 0: + else lower_barrier(conf); - break; - } } static int raid10_resize(struct mddev *mddev, sector_t sectors) @@ -4578,15 +4700,18 @@ static int handle_reshape_read_error(struct mddev *mddev, /* Use sync reads to get the blocks from somewhere else */ int sectors = r10_bio->sectors; struct r10conf *conf = mddev->private; - struct { - struct r10bio r10_bio; - struct r10dev devs[conf->copies]; - } on_stack; - struct r10bio *r10b = &on_stack.r10_bio; + struct r10bio *r10b; int slot = 0; int idx = 0; struct page **pages; + r10b = kmalloc(sizeof(*r10b) + + sizeof(struct r10dev) * conf->copies, GFP_NOIO); + if (!r10b) { +
set_bit(MD_RECOVERY_INTR, &mddev->recovery); + return -ENOMEM; + } + /* reshape IOs share pages from .devs[0].bio */ pages = get_resync_pages(r10_bio->devs[0].bio)->pages; @@ -4635,11 +4760,13 @@ static int handle_reshape_read_error(struct mddev *mddev, /* couldn't read this block, must give up */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); + kfree(r10b); return -EIO; } sectors -= s; idx++; } + kfree(r10b); return 0; } diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index abceccab6671..db2ac22ac1b4 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -89,6 +89,12 @@ struct r10conf { * the new thread here until we fully activate the array. */ struct md_thread *thread; + + /* + * Keep track of cluster resync window to send to other nodes. + */ + sector_t cluster_sync_low; + sector_t cluster_sync_high; }; /* diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 0b7406ac8ce1..f1c86d938502 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -23,7 +23,7 @@ #include #include "md.h" #include "raid5.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "raid5-log.h" /* @@ -539,7 +539,7 @@ static void r5l_log_run_stripes(struct r5l_log *log) { struct r5l_io_unit *io, *next; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { /* don't change list order */ @@ -555,7 +555,7 @@ static void r5l_move_to_end_ios(struct r5l_log *log) { struct r5l_io_unit *io, *next; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { /* don't change list order */ @@ -693,6 +693,8 @@ static void r5c_disable_writeback_async(struct work_struct *work) struct r5l_log *log = container_of(work, struct r5l_log, disable_writeback_work); struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + int locked = 0; if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) return; @@ -701,11 +703,15 @@ static void r5c_disable_writeback_async(struct work_struct *work) /* wait superblock change before suspend */ wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); - - mddev_suspend(mddev); - log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; - mddev_resume(mddev); + conf->log == NULL || + (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && + (locked = mddev_trylock(mddev)))); + if (locked) { + mddev_suspend(mddev); + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + mddev_resume(mddev); + mddev_unlock(mddev); + } } static void r5l_submit_current_io(struct r5l_log *log) @@ -1194,7 +1200,7 @@ static void r5l_run_no_mem_stripe(struct r5l_log *log) { struct stripe_head *sh; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); if (!list_empty(&log->no_mem_stripes)) { sh = list_first_entry(&log->no_mem_stripes, @@ -1210,7 +1216,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) struct r5l_io_unit *io, *next; bool found = false; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { /* don't change list order */ @@ -1382,7 +1388,7 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) * raid5_release_stripe() while holding conf->device_lock */ BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); - assert_spin_locked(&conf->device_lock); + 
lockdep_assert_held(&conf->device_lock); list_del_init(&sh->lru); atomic_inc(&sh->count); @@ -1409,7 +1415,7 @@ void r5c_flush_cache(struct r5conf *conf, int num) int count; struct stripe_head *sh, *next; - assert_spin_locked(&conf->device_lock); + lockdep_assert_held(&conf->device_lock); if (!conf->log) return; @@ -1583,21 +1589,21 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space) md_wakeup_thread(log->reclaim_thread); } -void r5l_quiesce(struct r5l_log *log, int state) +void r5l_quiesce(struct r5l_log *log, int quiesce) { struct mddev *mddev; - if (!log || state == 2) + if (!log) return; - if (state == 0) - kthread_unpark(log->reclaim_thread->tsk); - else if (state == 1) { + + if (quiesce) { /* make sure r5l_write_super_and_discard_space exits */ mddev = log->rdev->mddev; wake_up(&mddev->sb_wait); kthread_park(log->reclaim_thread->tsk); r5l_wake_reclaim(log, MaxSector); r5l_do_reclaim(log); - } + } else + kthread_unpark(log->reclaim_thread->tsk); } bool r5l_log_disk_error(struct r5conf *conf) @@ -3165,6 +3171,8 @@ void r5l_exit_log(struct r5conf *conf) conf->log = NULL; synchronize_rcu(); + /* Ensure disable_writeback_work wakes up and exits */ + wake_up(&conf->mddev->sb_wait); flush_work(&log->disable_writeback_work); md_unregister_thread(&log->reclaim_thread); mempool_destroy(log->meta_pool); diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 7f9ad5f7cda0..284578b0a349 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -9,7 +9,7 @@ extern void r5l_write_stripe_run(struct r5l_log *log); extern void r5l_flush_stripe_to_raid(struct r5l_log *log); extern void r5l_stripe_write_finished(struct stripe_head *sh); extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); -extern void r5l_quiesce(struct r5l_log *log, int state); +extern void r5l_quiesce(struct r5l_log *log, int quiesce); extern bool r5l_log_disk_error(struct r5conf *conf); extern bool r5c_is_writeback(struct r5l_log *log); extern int diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index cd026c88f7ef..628c0bf7b9fd 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -758,7 +758,8 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, (unsigned long long)sector); rdev = conf->disks[dd_idx].rdev; - if (!rdev) { + if (!rdev || (!test_bit(In_sync, &rdev->flags) && + sector >= rdev->recovery_offset)) { pr_debug("%s:%*s data member disk %d missing\n", __func__, indent, "", dd_idx); update_parity = false; @@ -1296,8 +1297,7 @@ int ppl_init_log(struct r5conf *conf) if (ret) { goto err; - } else if (!mddev->pers && - mddev->recovery_cp == 0 && !mddev->degraded && + } else if (!mddev->pers && mddev->recovery_cp == 0 && ppl_conf->recovered_entries > 0 && ppl_conf->mismatch_count == 0) { /* diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7d9a50eed9db..31dc25e2871a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -55,7 +55,6 @@ #include #include #include -#include #include #include @@ -63,7 +62,7 @@ #include "md.h" #include "raid5.h" #include "raid0.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "raid5-log.h" #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) @@ -1818,8 +1817,11 @@ static void ops_complete_reconstruct(void *stripe_head_ref) struct r5dev *dev = &sh->dev[i]; if (dev->written || i == pd_idx || i == qd_idx) { - if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) + if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) { set_bit(R5_UPTODATE, &dev->flags); + if 
(test_bit(STRIPE_EXPAND_READY, &sh->state)) + set_bit(R5_Expanded, &dev->flags); + } if (fua) set_bit(R5_WantFUA, &dev->flags); if (sync) @@ -5682,28 +5684,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) goto retry; } - if (rw == WRITE && - logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) { - raid5_release_stripe(sh); - /* As the suspend_* range is controlled by - * userspace, we want an interruptible - * wait. - */ - prepare_to_wait(&conf->wait_for_overlap, - &w, TASK_INTERRUPTIBLE); - if (logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) { - sigset_t full, old; - sigfillset(&full); - sigprocmask(SIG_BLOCK, &full, &old); - schedule(); - sigprocmask(SIG_SETMASK, &old, NULL); - do_prepare = true; - } - goto retry; - } - if (test_bit(STRIPE_EXPANDING, &sh->state) || !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { /* Stripe is busy expanding or @@ -5758,6 +5738,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk */ struct r5conf *conf = mddev->private; struct stripe_head *sh; + struct md_rdev *rdev; sector_t first_sector, last_sector; int raid_disks = conf->previous_raid_disks; int data_disks = raid_disks - conf->max_degraded; @@ -5880,6 +5861,15 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk return 0; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; + if (!mddev->reshape_backwards) + /* Can update recovery_offset */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sector_nr) + rdev->recovery_offset = sector_nr; + conf->reshape_checkpoint = jiffies; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); @@ -5978,6 +5968,14 @@ finish: goto ret; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; + if (!mddev->reshape_backwards) + /* Can update recovery_offset */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sector_nr) + rdev->recovery_offset = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); @@ -7156,6 +7154,13 @@ static int raid5_run(struct mddev *mddev) min_offset_diff = diff; } + if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) && + (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { + pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", + mdname(mddev)); + return -EINVAL; + } + if (mddev->reshape_position != MaxSector) { /* Check that we can continue the reshape. 
* Difficulties arise if the stripe we would write to @@ -7958,6 +7963,7 @@ static void end_reshape(struct r5conf *conf) { if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { + struct md_rdev *rdev; spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; @@ -7965,6 +7971,11 @@ static void end_reshape(struct r5conf *conf) smp_wmb(); conf->reshape_progress = MaxSector; conf->mddev->reshape_position = MaxSector; + rdev_for_each(rdev, conf->mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags)) + rdev->recovery_offset = MaxSector; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); @@ -8020,16 +8031,12 @@ static void raid5_finish_reshape(struct mddev *mddev) } } -static void raid5_quiesce(struct mddev *mddev, int state) +static void raid5_quiesce(struct mddev *mddev, int quiesce) { struct r5conf *conf = mddev->private; - switch(state) { - case 2: /* resume for a suspend */ - wake_up(&conf->wait_for_overlap); - break; - - case 1: /* stop all writes */ + if (quiesce) { + /* stop all writes */ lock_all_device_hash_locks_irq(conf); /* '2' tells resync/reshape to pause so that all * active stripes can drain @@ -8045,17 +8052,15 @@ static void raid5_quiesce(struct mddev *mddev, int state) unlock_all_device_hash_locks_irq(conf); /* allow reshape to continue */ wake_up(&conf->wait_for_overlap); - break; - - case 0: /* re-enable writes */ + } else { + /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; wake_up(&conf->wait_for_quiescent); wake_up(&conf->wait_for_overlap); unlock_all_device_hash_locks_irq(conf); - break; } - r5l_quiesce(conf->log, state); + r5l_quiesce(conf->log, quiesce); } static void *raid45_takeover_raid0(struct mddev *mddev, int level)
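[Editor's note: the following is an illustration, not part of the patch. A worked example of the clustered raid10 resync window computed by raid10_set_cluster_sync_high() in the raid10.c hunks above: the window covers one whole "stripe" (raid_disks / near_copies chunks, plus one when the division leaves a remainder) and is clamped below by CLUSTER_RESYNC_WINDOW_SECTORS. This standalone userspace sketch reuses the constants exactly as the patch defines them; the geometries in main() are arbitrary sample values.]

```c
/* cluster_window_model.c — worked example of raid10_set_cluster_sync_high() */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Constants as defined in the raid10.c hunks of this patch */
#define RESYNC_WINDOW (1024 * 1024)
#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)

static sector_t cluster_window_sectors(int raid_disks, int near_copies,
				       sector_t chunk_sectors)
{
	/* one "stripe" crosses every member device once */
	int chunks = raid_disks / near_copies;
	int extra_chunk = (raid_disks % near_copies) ? 1 : 0;
	sector_t window_size = (sector_t)(chunks + extra_chunk) * chunk_sectors;

	/* clamp below by the fixed cluster resync window */
	return window_size < CLUSTER_RESYNC_WINDOW_SECTORS ?
	       CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
}

int main(void)
{
	/* 4 disks, near=2, 512KiB (1024-sector) chunks: the stripe is only
	 * 2048 sectors, so the fixed floor of 32768 sectors wins */
	printf("4 disks:  %llu sectors\n",
	       (unsigned long long)cluster_window_sectors(4, 2, 1024));
	/* 10 disks, near=2, 4MiB (8192-sector) chunks: 5 chunks per stripe
	 * gives 40960 sectors, which exceeds the floor */
	printf("10 disks: %llu sectors\n",
	       (unsigned long long)cluster_window_sectors(10, 2, 8192));
	return 0;
}
```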