From f5e70d0fe3ea990cfb3fc8d7f76a719adcb1e0b5 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 13 Jul 2009 11:35:12 +0100 Subject: [PATCH 01/24] md: Factor out RAID6 algorithms into lib/ We'll want to use these in btrfs too. Signed-off-by: David Woodhouse --- drivers/md/Kconfig | 7 +- drivers/md/Makefile | 76 ------------------- lib/Kconfig | 3 + lib/Makefile | 1 + lib/raid6/Makefile | 78 ++++++++++++++++++++ {drivers/md => lib/raid6}/mktables.c | 0 {drivers/md => lib/raid6}/raid6algos.c | 0 {drivers/md => lib/raid6}/raid6altivec.uc | 0 {drivers/md => lib/raid6}/raid6int.uc | 0 {drivers/md => lib/raid6}/raid6mmx.c | 0 {drivers/md => lib/raid6}/raid6recov.c | 0 {drivers/md => lib/raid6}/raid6sse1.c | 0 {drivers/md => lib/raid6}/raid6sse2.c | 0 {drivers/md => lib/raid6}/raid6test/Makefile | 0 {drivers/md => lib/raid6}/raid6test/test.c | 0 {drivers/md => lib/raid6}/raid6x86.h | 0 {drivers/md => lib/raid6}/unroll.pl | 0 17 files changed, 84 insertions(+), 81 deletions(-) create mode 100644 lib/raid6/Makefile rename {drivers/md => lib/raid6}/mktables.c (100%) rename {drivers/md => lib/raid6}/raid6algos.c (100%) rename {drivers/md => lib/raid6}/raid6altivec.uc (100%) rename {drivers/md => lib/raid6}/raid6int.uc (100%) rename {drivers/md => lib/raid6}/raid6mmx.c (100%) rename {drivers/md => lib/raid6}/raid6recov.c (100%) rename {drivers/md => lib/raid6}/raid6sse1.c (100%) rename {drivers/md => lib/raid6}/raid6sse2.c (100%) rename {drivers/md => lib/raid6}/raid6test/Makefile (100%) rename {drivers/md => lib/raid6}/raid6test/test.c (100%) rename {drivers/md => lib/raid6}/raid6x86.h (100%) rename {drivers/md => lib/raid6}/unroll.pl (100%) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2158377a1359..891f7c8490d3 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -121,7 +121,7 @@ config MD_RAID10 config MD_RAID456 tristate "RAID-4/RAID-5/RAID-6 mode" depends on BLK_DEV_MD - select MD_RAID6_PQ + select RAID6_PQ select ASYNC_MEMCPY select ASYNC_XOR select ASYNC_PQ @@ -165,12 +165,9 @@ config MULTICORE_RAID456 If unsure, say N. -config MD_RAID6_PQ - tristate - config ASYNC_RAID6_TEST tristate "Self test for hardware accelerated raid6 recovery" - depends on MD_RAID6_PQ + depends on RAID6_PQ select ASYNC_RAID6_RECOV ---help--- This is a one-shot self test that permutes through the diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 1dc4185bd781..c9b3a7843d83 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -12,13 +12,6 @@ dm-log-userspace-y \ += dm-log-userspace-base.o dm-log-userspace-transfer.o md-mod-y += md.o bitmap.o raid456-y += raid5.o -raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ - raid6int1.o raid6int2.o raid6int4.o \ - raid6int8.o raid6int16.o raid6int32.o \ - raid6altivec1.o raid6altivec2.o raid6altivec4.o \ - raid6altivec8.o \ - raid6mmx.o raid6sse1.o raid6sse2.o -hostprogs-y += mktables # Note: link order is important. 
All raid personalities # and must come before md.o, as they each initialise @@ -29,7 +22,6 @@ obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o obj-$(CONFIG_MD_RAID10) += raid10.o -obj-$(CONFIG_MD_RAID6_PQ) += raid6_pq.o obj-$(CONFIG_MD_RAID456) += raid456.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o @@ -45,75 +37,7 @@ obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o obj-$(CONFIG_DM_ZERO) += dm-zero.o -quiet_cmd_unroll = UNROLL $@ - cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \ - < $< > $@ || ( rm -f $@ && exit 1 ) - -ifeq ($(CONFIG_ALTIVEC),y) -altivec_flags := -maltivec -mabi=altivec -endif - ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o endif -targets += raid6int1.c -$(obj)/raid6int1.c: UNROLL := 1 -$(obj)/raid6int1.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -targets += raid6int2.c -$(obj)/raid6int2.c: UNROLL := 2 -$(obj)/raid6int2.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -targets += raid6int4.c -$(obj)/raid6int4.c: UNROLL := 4 -$(obj)/raid6int4.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -targets += raid6int8.c -$(obj)/raid6int8.c: UNROLL := 8 -$(obj)/raid6int8.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -targets += raid6int16.c -$(obj)/raid6int16.c: UNROLL := 16 -$(obj)/raid6int16.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -targets += raid6int32.c -$(obj)/raid6int32.c: UNROLL := 32 -$(obj)/raid6int32.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -CFLAGS_raid6altivec1.o += $(altivec_flags) -targets += raid6altivec1.c -$(obj)/raid6altivec1.c: UNROLL := 1 -$(obj)/raid6altivec1.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -CFLAGS_raid6altivec2.o += $(altivec_flags) -targets += raid6altivec2.c -$(obj)/raid6altivec2.c: UNROLL := 2 -$(obj)/raid6altivec2.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -CFLAGS_raid6altivec4.o += $(altivec_flags) -targets += raid6altivec4.c -$(obj)/raid6altivec4.c: UNROLL := 4 -$(obj)/raid6altivec4.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -CFLAGS_raid6altivec8.o += $(altivec_flags) -targets += raid6altivec8.c -$(obj)/raid6altivec8.c: UNROLL := 8 -$(obj)/raid6altivec8.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -quiet_cmd_mktable = TABLE $@ - cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) - -targets += raid6tables.c -$(obj)/raid6tables.c: $(obj)/mktables FORCE - $(call if_changed,mktable) diff --git a/lib/Kconfig b/lib/Kconfig index bb1326d3839c..1cc756c4c78c 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -7,6 +7,9 @@ config BINARY_PRINTF menu "Library routines" +config RAID6_PQ + tristate + config BITREVERSE tristate diff --git a/lib/Makefile b/lib/Makefile index 2e78277eff9d..b25faf0d1d5c 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -65,6 +65,7 @@ obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_LZO_COMPRESS) += lzo/ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ +obj-$(CONFIG_RAID6_PQ) += raid6/ lib-$(CONFIG_DECOMPRESS_GZIP) += decompress_inflate.o lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile new file mode 100644 index 
000000000000..b2fe4baa90e0 --- /dev/null +++ b/lib/raid6/Makefile @@ -0,0 +1,78 @@ +obj-$(CONFIG_RAID6_PQ) += raid6_pq.o + +raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ + raid6int1.o raid6int2.o raid6int4.o \ + raid6int8.o raid6int16.o raid6int32.o \ + raid6altivec1.o raid6altivec2.o raid6altivec4.o \ + raid6altivec8.o \ + raid6mmx.o raid6sse1.o raid6sse2.o +hostprogs-y += mktables + +quiet_cmd_unroll = UNROLL $@ + cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \ + < $< > $@ || ( rm -f $@ && exit 1 ) + +ifeq ($(CONFIG_ALTIVEC),y) +altivec_flags := -maltivec -mabi=altivec +endif + +targets += raid6int1.c +$(obj)/raid6int1.c: UNROLL := 1 +$(obj)/raid6int1.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE + $(call if_changed,unroll) + +targets += raid6int2.c +$(obj)/raid6int2.c: UNROLL := 2 +$(obj)/raid6int2.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE + $(call if_changed,unroll) + +targets += raid6int4.c +$(obj)/raid6int4.c: UNROLL := 4 +$(obj)/raid6int4.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE + $(call if_changed,unroll) + +targets += raid6int8.c +$(obj)/raid6int8.c: UNROLL := 8 +$(obj)/raid6int8.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE + $(call if_changed,unroll) + +targets += raid6int16.c +$(obj)/raid6int16.c: UNROLL := 16 +$(obj)/raid6int16.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE + $(call if_changed,unroll) + +targets += raid6int32.c +$(obj)/raid6int32.c: UNROLL := 32 +$(obj)/raid6int32.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE + $(call if_changed,unroll) + +CFLAGS_raid6altivec1.o += $(altivec_flags) +targets += raid6altivec1.c +$(obj)/raid6altivec1.c: UNROLL := 1 +$(obj)/raid6altivec1.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE + $(call if_changed,unroll) + +CFLAGS_raid6altivec2.o += $(altivec_flags) +targets += raid6altivec2.c +$(obj)/raid6altivec2.c: UNROLL := 2 +$(obj)/raid6altivec2.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE + $(call if_changed,unroll) + +CFLAGS_raid6altivec4.o += $(altivec_flags) +targets += raid6altivec4.c +$(obj)/raid6altivec4.c: UNROLL := 4 +$(obj)/raid6altivec4.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE + $(call if_changed,unroll) + +CFLAGS_raid6altivec8.o += $(altivec_flags) +targets += raid6altivec8.c +$(obj)/raid6altivec8.c: UNROLL := 8 +$(obj)/raid6altivec8.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE + $(call if_changed,unroll) + +quiet_cmd_mktable = TABLE $@ + cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) + +targets += raid6tables.c +$(obj)/raid6tables.c: $(obj)/mktables FORCE + $(call if_changed,mktable) diff --git a/drivers/md/mktables.c b/lib/raid6/mktables.c similarity index 100% rename from drivers/md/mktables.c rename to lib/raid6/mktables.c diff --git a/drivers/md/raid6algos.c b/lib/raid6/raid6algos.c similarity index 100% rename from drivers/md/raid6algos.c rename to lib/raid6/raid6algos.c diff --git a/drivers/md/raid6altivec.uc b/lib/raid6/raid6altivec.uc similarity index 100% rename from drivers/md/raid6altivec.uc rename to lib/raid6/raid6altivec.uc diff --git a/drivers/md/raid6int.uc b/lib/raid6/raid6int.uc similarity index 100% rename from drivers/md/raid6int.uc rename to lib/raid6/raid6int.uc diff --git a/drivers/md/raid6mmx.c b/lib/raid6/raid6mmx.c similarity index 100% rename from drivers/md/raid6mmx.c rename to lib/raid6/raid6mmx.c diff --git a/drivers/md/raid6recov.c b/lib/raid6/raid6recov.c similarity index 100% rename from drivers/md/raid6recov.c rename to lib/raid6/raid6recov.c diff --git a/drivers/md/raid6sse1.c b/lib/raid6/raid6sse1.c similarity index 100% rename 
from drivers/md/raid6sse1.c rename to lib/raid6/raid6sse1.c diff --git a/drivers/md/raid6sse2.c b/lib/raid6/raid6sse2.c similarity index 100% rename from drivers/md/raid6sse2.c rename to lib/raid6/raid6sse2.c diff --git a/drivers/md/raid6test/Makefile b/lib/raid6/raid6test/Makefile similarity index 100% rename from drivers/md/raid6test/Makefile rename to lib/raid6/raid6test/Makefile diff --git a/drivers/md/raid6test/test.c b/lib/raid6/raid6test/test.c similarity index 100% rename from drivers/md/raid6test/test.c rename to lib/raid6/raid6test/test.c diff --git a/drivers/md/raid6x86.h b/lib/raid6/raid6x86.h similarity index 100% rename from drivers/md/raid6x86.h rename to lib/raid6/raid6x86.h diff --git a/drivers/md/unroll.pl b/lib/raid6/unroll.pl similarity index 100% rename from drivers/md/unroll.pl rename to lib/raid6/unroll.pl From e5d84970a554d5c0072043a7b9f0f5b88b5fdfe1 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 29 Oct 2009 16:41:49 +0000 Subject: [PATCH 02/24] async_tx: Move ASYNC_RAID6_TEST option to crypto/async_tx/, fix dependencies Signed-off-by: David Woodhouse --- crypto/async_tx/Kconfig | 14 ++++++++++++++ drivers/md/Kconfig | 13 ------------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/crypto/async_tx/Kconfig b/crypto/async_tx/Kconfig index e5aeb2b79e6f..e24aa80087ad 100644 --- a/crypto/async_tx/Kconfig +++ b/crypto/async_tx/Kconfig @@ -22,4 +22,18 @@ config ASYNC_RAID6_RECOV tristate select ASYNC_CORE select ASYNC_PQ + select ASYNC_XOR + +config ASYNC_RAID6_TEST + tristate "Self test for hardware accelerated raid6 recovery" + depends on ASYNC_RAID6_RECOV + select ASYNC_MEMCPY + ---help--- + This is a one-shot self test that permutes through the + recovery of all the possible two disk failure scenarios for a + N-disk array. Recovery is performed with the asynchronous + raid6 recovery routines, and will optionally use an offload + engine if one is available. + + If unsure, say N. diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 891f7c8490d3..e27ae4604cef 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -165,19 +165,6 @@ config MULTICORE_RAID456 If unsure, say N. -config ASYNC_RAID6_TEST - tristate "Self test for hardware accelerated raid6 recovery" - depends on RAID6_PQ - select ASYNC_RAID6_RECOV - ---help--- - This is a one-shot self test that permutes through the - recovery of all the possible two disk failure scenarios for a - N-disk array. Recovery is performed with the asynchronous - raid6 recovery routines, and will optionally use an offload - engine if one is available. - - If unsure, say N. - config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD From 00bcb4ac7ee7e557a491b614219142cea0ef16f4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 1 Jun 2010 19:37:23 +1000 Subject: [PATCH 03/24] md: reduce dependence on sysfs. We will want md devices to live as dm targets where sysfs is not visible. So allow md to not connect to sysfs. 
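For orientation before the large md.c hunk below, here is the core idiom the patch introduces, reproduced (with added comments) from the md.h hunk of this same patch: sysfs accesses become no-ops when the array has no sysfs presence.

/* NULL-tolerant sysfs helpers added to drivers/md/md.h by this patch.
 * When an md array is embedded in a dm target, mddev->kobj.sd and the
 * cached sysfs_dirent pointers may be NULL, so callers use these
 * wrappers instead of calling sysfs directly.
 */
static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name)
{
	if (sd)
		return sysfs_get_dirent(sd, NULL, name);
	return sd;			/* NULL in, NULL out */
}

static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd)
{
	if (sd)				/* silently skip when not connected to sysfs */
		sysfs_notify_dirent(sd);
}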
Signed-off-by: NeilBrown --- drivers/md/md.c | 101 +++++++++++++++++++++------------------------ drivers/md/md.h | 12 ++++++ drivers/md/raid5.c | 8 ++-- 3 files changed, 62 insertions(+), 59 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index cb20d0b0555a..9007651ce175 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -541,14 +541,16 @@ static void mddev_unlock(mddev_t * mddev) mutex_lock(&mddev->open_mutex); mutex_unlock(&mddev->reconfig_mutex); - if (to_remove != &md_redundancy_group) - sysfs_remove_group(&mddev->kobj, to_remove); - if (mddev->pers == NULL || - mddev->pers->sync_request == NULL) { - sysfs_remove_group(&mddev->kobj, &md_redundancy_group); - if (mddev->sysfs_action) - sysfs_put(mddev->sysfs_action); - mddev->sysfs_action = NULL; + if (mddev->kobj.sd) { + if (to_remove != &md_redundancy_group) + sysfs_remove_group(&mddev->kobj, to_remove); + if (mddev->pers == NULL || + mddev->pers->sync_request == NULL) { + sysfs_remove_group(&mddev->kobj, &md_redundancy_group); + if (mddev->sysfs_action) + sysfs_put(mddev->sysfs_action); + mddev->sysfs_action = NULL; + } } mutex_unlock(&mddev->open_mutex); } else @@ -1811,11 +1813,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) goto fail; ko = &part_to_dev(rdev->bdev->bd_part)->kobj; - if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { - kobject_del(&rdev->kobj); - goto fail; - } - rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, NULL, "state"); + if (sysfs_create_link(&rdev->kobj, ko, "block")) + /* failure here is OK */; + rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); list_add_rcu(&rdev->same_set, &mddev->disks); bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); @@ -2334,8 +2334,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) set_bit(In_sync, &rdev->flags); err = 0; } - if (!err && rdev->sysfs_state) - sysfs_notify_dirent(rdev->sysfs_state); + if (!err) + sysfs_notify_dirent_safe(rdev->sysfs_state); return err ? err : len; } static struct rdev_sysfs_entry rdev_state = @@ -2430,14 +2430,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) rdev->raid_disk = -1; return err; } else - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) - printk(KERN_WARNING - "md: cannot register " - "%s for %s\n", - nm, mdname(rdev->mddev)); - + /* failure here is OK */; /* don't wakeup anyone, leave that to userspace. 
*/ } else { if (slot >= rdev->mddev->raid_disks) @@ -2447,7 +2443,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) clear_bit(Faulty, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); set_bit(In_sync, &rdev->flags); - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); } return len; } @@ -3437,7 +3433,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) if (err) return err; else { - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); return len; } } @@ -3735,7 +3731,7 @@ action_store(mddev_t *mddev, const char *page, size_t len) } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); - sysfs_notify_dirent(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_action); return len; } @@ -4281,13 +4277,14 @@ static int md_alloc(dev_t dev, char *name) disk->disk_name); error = 0; } - if (sysfs_create_group(&mddev->kobj, &md_bitmap_group)) + if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &md_bitmap_group)) printk(KERN_DEBUG "pointless warning\n"); abort: mutex_unlock(&disks_mutex); - if (!error) { + if (!error && mddev->kobj.sd) { kobject_uevent(&mddev->kobj, KOBJ_ADD); - mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, NULL, "array_state"); + mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); } mddev_put(mddev); return error; @@ -4325,7 +4322,7 @@ static void md_safemode_timeout(unsigned long data) if (!atomic_read(&mddev->writes_pending)) { mddev->safemode = 1; if (mddev->external) - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); } md_wakeup_thread(mddev->thread); } @@ -4397,7 +4394,7 @@ static int md_run(mddev_t *mddev) return -EINVAL; } } - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); } spin_lock(&pers_lock); @@ -4496,11 +4493,12 @@ static int md_run(mddev_t *mddev) return err; } if (mddev->pers->sync_request) { - if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &md_redundancy_group)) printk(KERN_WARNING "md: cannot register extra attributes for %s\n", mdname(mddev)); - mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); + mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); } else if (mddev->ro == 2) /* auto-readonly not meaningful */ mddev->ro = 0; @@ -4518,8 +4516,7 @@ static int md_run(mddev_t *mddev) char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) - printk("md: cannot register %s for %s\n", - nm, mdname(mddev)); + /* failure here is OK */; } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -4531,9 +4528,8 @@ static int md_run(mddev_t *mddev) md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ md_new_event(mddev); - sysfs_notify_dirent(mddev->sysfs_state); - if (mddev->sysfs_action) - sysfs_notify_dirent(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_action); sysfs_notify(&mddev->kobj, NULL, "degraded"); return 0; } @@ -4573,7 +4569,7 @@ static int restart_array(mddev_t *mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); return 0; } @@ -4697,7 +4693,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open) 
mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); err = 0; } out: @@ -4730,7 +4726,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) mddev->queue->backing_dev_info.congested_fn = NULL; /* tell userspace to handle 'inactive' */ - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0) { @@ -4776,7 +4772,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) err = 0; blk_integrity_unregister(disk); md_new_event(mddev); - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); return err; } @@ -5138,7 +5134,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (err) export_rdev(rdev); else - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); md_update_sb(mddev, 1); if (mddev->degraded) @@ -5813,7 +5809,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { if (mddev->ro == 2) { mddev->ro = 0; - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } else { @@ -6059,7 +6055,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) mddev->pers->error_handler(mddev,rdev); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -6520,7 +6516,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi) spin_unlock_irq(&mddev->write_lock); } if (did_change) - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && !test_bit(MD_CHANGE_PENDING, &mddev->flags)); @@ -6563,7 +6559,7 @@ int md_allow_write(mddev_t *mddev) mddev->safemode = 1; spin_unlock_irq(&mddev->write_lock); md_update_sb(mddev, 0); - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); } else spin_unlock_irq(&mddev->write_lock); @@ -6950,10 +6946,7 @@ static int remove_and_add_spares(mddev_t *mddev) sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) - printk(KERN_WARNING - "md: cannot register " - "%s for %s\n", - nm, mdname(mddev)); + /* failure here is OK */; spares++; md_new_event(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -7046,7 +7039,7 @@ void md_check_recovery(mddev_t *mddev) mddev->safemode = 0; spin_unlock_irq(&mddev->write_lock); if (did_change) - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); } if (mddev->flags) @@ -7085,7 +7078,7 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery = 0; /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - sysfs_notify_dirent(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(mddev); goto unlock; } @@ -7147,7 +7140,7 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery = 0; } else md_wakeup_thread(mddev->sync_thread); - sysfs_notify_dirent(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_action); 
md_new_event(mddev); } unlock: @@ -7156,7 +7149,7 @@ void md_check_recovery(mddev_t *mddev) if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) if (mddev->sysfs_action) - sysfs_notify_dirent(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_action); } mddev_unlock(mddev); } @@ -7164,7 +7157,7 @@ void md_check_recovery(mddev_t *mddev) void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) { - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); wait_event_timeout(rdev->blocked_wait, !test_bit(Blocked, &rdev->flags), msecs_to_jiffies(5000)); diff --git a/drivers/md/md.h b/drivers/md/md.h index 10597bfec000..1e6405918eec 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -382,6 +382,18 @@ struct md_sysfs_entry { }; extern struct attribute_group md_bitmap_group; +static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name) +{ + if (sd) + return sysfs_get_dirent(sd, NULL, name); + return sd; +} +static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd) +{ + if (sd) + sysfs_notify_dirent(sd); +} + static inline char * mdname (mddev_t * mddev) { return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 96c690279fc6..6a7a30113161 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5158,7 +5158,8 @@ static int run(mddev_t *mddev) /* Ok, everything is just fine now */ if (mddev->to_remove == &raid5_attrs_group) mddev->to_remove = NULL; - else if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) + else if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) printk(KERN_WARNING "md/raid:%s: failed to create sysfs attributes.\n", mdname(mddev)); @@ -5545,10 +5546,7 @@ static int raid5_start_reshape(mddev_t *mddev) sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) - printk(KERN_WARNING - "md/raid:%s: failed to create " - " link %s\n", - mdname(mddev), nm); + /* Failure here is OK */; } else break; } From c41d4ac40df0d01bf9c383ff28f194d1df2d4fd9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 1 Jun 2010 19:37:24 +1000 Subject: [PATCH 04/24] md/raid5: factor out code for changing size of stripe cache. Separate the actual 'change' code from the sysfs interface so that it can eventually be called internally. 
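A hypothetical illustration of why the split is useful: an internal (non-sysfs) caller can now resize the stripe cache directly. Only raid5_set_cache_size() comes from this patch; the wrapper below and its name are invented.

/* Hypothetical internal caller (illustration only, not part of the patch):
 * resize the stripe cache without going through the sysfs store routine.
 */
static int example_resize_stripe_cache(mddev_t *mddev, int nr_stripes)
{
	int err = raid5_set_cache_size(mddev, nr_stripes);

	if (err)
		printk(KERN_WARNING "md/raid:%s: cannot resize stripe cache (%d)\n",
		       mdname(mddev), err);
	return err;
}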
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 41 +++++++++++++++++++++++++++-------------- drivers/md/raid5.h | 1 + 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6a7a30113161..bd4067a70834 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4566,6 +4566,32 @@ raid5_show_stripe_cache_size(mddev_t *mddev, char *page) return 0; } +int +raid5_set_cache_size(mddev_t *mddev, int size) +{ + raid5_conf_t *conf = mddev->private; + int err; + + if (size <= 16 || size > 32768) + return -EINVAL; + while (size < conf->max_nr_stripes) { + if (drop_one_stripe(conf)) + conf->max_nr_stripes--; + else + break; + } + err = md_allow_write(mddev); + if (err) + return err; + while (size > conf->max_nr_stripes) { + if (grow_one_stripe(conf)) + conf->max_nr_stripes++; + else break; + } + return 0; +} +EXPORT_SYMBOL(raid5_set_cache_size); + static ssize_t raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) { @@ -4580,22 +4606,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) if (strict_strtoul(page, 10, &new)) return -EINVAL; - if (new <= 16 || new > 32768) - return -EINVAL; - while (new < conf->max_nr_stripes) { - if (drop_one_stripe(conf)) - conf->max_nr_stripes--; - else - break; - } - err = md_allow_write(mddev); + err = raid5_set_cache_size(mddev, new); if (err) return err; - while (new > conf->max_nr_stripes) { - if (grow_one_stripe(conf)) - conf->max_nr_stripes++; - else break; - } return len; } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 0f86f5e36724..cbdbc77695b3 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -497,4 +497,5 @@ static inline int algorithm_is_DDF(int layout) { return layout >= 8 && layout <= 10; } +extern int raid5_set_cache_size(mddev_t *mddev, int size); #endif From f4be6b43f1ac60dff00ef0923ee43b0e08872947 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 1 Jun 2010 19:37:25 +1000 Subject: [PATCH 05/24] md/raid5: ensure we create a unique name for kmem_cache when mddev has no gendisk We will shortly allow md devices with no gendisk (they are attached to a dm-target instead). That will cause mdname() to return 'mdX'. There is one place where mdname really needs to be unique: when creating the name for a slab cache. So in that case, if there is no gendisk, use the address of the mddev formatted in HEX to provide a unique name.
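For illustration, the naming scheme after this patch, with made-up example names. Note that on 64-bit a %p-formatted pointer makes the "-alt" variant roughly 27 characters, which no longer fits the old 20-byte buffer; that is why cache_name[] also grows to 32 bytes in the raid5.h hunk below.

/* Illustration only: how grow_stripes() builds its slab-cache names
 * after this patch.  With a gendisk the array name keeps the cache
 * identifiable; without one the mddev pointer guarantees uniqueness.
 */
static void example_build_cache_names(raid5_conf_t *conf)
{
	if (conf->mddev->gendisk)	/* e.g. "raid6-md0" */
		sprintf(conf->cache_name[0], "raid%d-%s",
			conf->level, mdname(conf->mddev));
	else				/* e.g. "raid6-ffff88007b2d3000" (made-up address) */
		sprintf(conf->cache_name[0], "raid%d-%p",
			conf->level, conf->mddev);
	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
}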
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 12 ++++++++---- drivers/md/raid5.h | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index bd4067a70834..6fa60e416a09 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1337,10 +1337,14 @@ static int grow_stripes(raid5_conf_t *conf, int num) struct kmem_cache *sc; int devs = max(conf->raid_disks, conf->previous_raid_disks); - sprintf(conf->cache_name[0], - "raid%d-%s", conf->level, mdname(conf->mddev)); - sprintf(conf->cache_name[1], - "raid%d-%s-alt", conf->level, mdname(conf->mddev)); + if (conf->mddev->gendisk) + sprintf(conf->cache_name[0], + "raid%d-%s", conf->level, mdname(conf->mddev)); + else + sprintf(conf->cache_name[0], + "raid%d-%p", conf->level, conf->mddev); + sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); + conf->active_name = 0; sc = kmem_cache_create(conf->cache_name[conf->active_name], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index cbdbc77695b3..61b6b25dc5e7 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -388,7 +388,7 @@ struct raid5_private_data { * two caches. */ int active_name; - char cache_name[2][20]; + char cache_name[2][32]; struct kmem_cache *slab_cache; /* for allocating stripes */ int seq_flush, seq_write; From 676e42d896ab6967859fabbb06f4e11b9615cbcf Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 1 Jun 2010 19:37:26 +1000 Subject: [PATCH 06/24] md: be more careful setting MD_CHANGE_CLEAN When MD_CHANGE_CLEAN is set we might block in md_write_start. So we should only set it when fairly sure that something will clear it. There are two places where it is set so as to encourage a metadata update to record the progress of resync/recovery. This should only be done if the internal metadata update mechanisms are in use, which can be tested by inspecting '->persistent'. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 3 ++- drivers/md/md.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 1742435ce3ae..4518994712c7 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1526,7 +1526,8 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) atomic_read(&bitmap->mddev->recovery_active) == 0); bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; - set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); + if (bitmap->mddev->persistent) + set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); s = 0; while (s < sector && s < bitmap->mddev->resync_max_sectors) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 9007651ce175..d636b0a40fac 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6753,7 +6753,8 @@ void md_do_sync(mddev_t *mddev) atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = mddev->curr_resync; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + if (mddev->persistent) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } From e8bb9a839a26f076379e9cb9f46a879d210156f1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 1 Jun 2010 19:37:26 +1000 Subject: [PATCH 07/24] md: split out md_rdev_init This functionality will be needed separately in a subsequent patch, so split it into its own exported function.
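A hypothetical sketch of the intended external use (a later patch in this series uses the helper from a dm wrapper around raid4/5/6); everything below except md_rdev_init() itself is invented for illustration.

/* Hypothetical caller (illustration only): a target that embeds an rdev
 * rather than importing it from an on-disk superblock initialises the
 * generic fields with the exported helper, then wires up what it knows.
 */
static void example_init_embedded_rdev(mdk_rdev_t *rdev,
				       struct block_device *bdev,
				       sector_t sectors)
{
	md_rdev_init(rdev);		/* lists, waitqueue, flags, counters */
	rdev->bdev = bdev;		/* caller-specific wiring */
	rdev->sectors = sectors;
}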
Signed-off-by: NeilBrown --- drivers/md/md.c | 34 +++++++++++++++++++--------------- drivers/md/md.h | 1 + 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index d636b0a40fac..3de623aceed0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2691,6 +2691,24 @@ static struct kobj_type rdev_ktype = { .default_attrs = rdev_default_attrs, }; +void md_rdev_init(mdk_rdev_t *rdev) +{ + rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; + rdev->raid_disk = -1; + rdev->flags = 0; + rdev->data_offset = 0; + rdev->sb_events = 0; + rdev->last_read_error.tv_sec = 0; + rdev->last_read_error.tv_nsec = 0; + atomic_set(&rdev->nr_pending, 0); + atomic_set(&rdev->read_errors, 0); + atomic_set(&rdev->corrected_errors, 0); + + INIT_LIST_HEAD(&rdev->same_set); + init_waitqueue_head(&rdev->blocked_wait); +} +EXPORT_SYMBOL_GPL(md_rdev_init); /* * Import a device. If 'super_format' >= 0, then sanity check the superblock * @@ -2714,6 +2732,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi return ERR_PTR(-ENOMEM); } + md_rdev_init(rdev); if ((err = alloc_disk_sb(rdev))) goto abort_free; @@ -2723,18 +2742,6 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi kobject_init(&rdev->kobj, &rdev_ktype); - rdev->desc_nr = -1; - rdev->saved_raid_disk = -1; - rdev->raid_disk = -1; - rdev->flags = 0; - rdev->data_offset = 0; - rdev->sb_events = 0; - rdev->last_read_error.tv_sec = 0; - rdev->last_read_error.tv_nsec = 0; - atomic_set(&rdev->nr_pending, 0); - atomic_set(&rdev->read_errors, 0); - atomic_set(&rdev->corrected_errors, 0); - size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; if (!size) { printk(KERN_WARNING @@ -2763,9 +2770,6 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi } } - INIT_LIST_HEAD(&rdev->same_set); - init_waitqueue_head(&rdev->blocked_wait); - return rdev; abort_free: diff --git a/drivers/md/md.h b/drivers/md/md.h index 1e6405918eec..cc8030543e82 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -487,4 +487,5 @@ extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern void restore_bitmap_write_access(struct file *file); +extern void md_rdev_init(mdk_rdev_t *rdev); #endif /* _MD_MD_H */ From 390ee602a142a93f2c7eb7bffee8e277058b8e0a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 1 Jun 2010 19:37:27 +1000 Subject: [PATCH 08/24] md: export various start/stop interfaces export entry points for starting and stopping md arrays. This will be used by a module to make md/raid5 work under dm. Also stop calling md_stop_writes from md_stop, as that won't work well with dm - it will want to call the two separately. Signed-off-by: NeilBrown --- drivers/md/md.c | 21 +++++++++++++-------- drivers/md/md.h | 7 +++++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3de623aceed0..012482a900fc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -261,7 +261,7 @@ static int md_make_request(struct request_queue *q, struct bio *bio) * Once ->stop is called and completes, the module will be completely * unused. 
*/ -static void mddev_suspend(mddev_t *mddev) +void mddev_suspend(mddev_t *mddev) { BUG_ON(mddev->suspended); mddev->suspended = 1; @@ -269,13 +269,15 @@ static void mddev_suspend(mddev_t *mddev) wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); } +EXPORT_SYMBOL_GPL(mddev_suspend); -static void mddev_resume(mddev_t *mddev) +void mddev_resume(mddev_t *mddev) { mddev->suspended = 0; wake_up(&mddev->sb_wait); mddev->pers->quiesce(mddev, 0); } +EXPORT_SYMBOL_GPL(mddev_resume); int mddev_congested(mddev_t *mddev, int bits) { @@ -416,7 +418,7 @@ static void mddev_put(mddev_t *mddev) spin_unlock(&all_mddevs_lock); } -static void mddev_init(mddev_t *mddev) +void mddev_init(mddev_t *mddev) { mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); @@ -436,6 +438,7 @@ static void mddev_init(mddev_t *mddev) mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; } +EXPORT_SYMBOL_GPL(mddev_init); static mddev_t * mddev_find(dev_t unit) { @@ -4333,7 +4336,7 @@ static void md_safemode_timeout(unsigned long data) static int start_dirty_degraded; -static int md_run(mddev_t *mddev) +int md_run(mddev_t *mddev) { int err; mdk_rdev_t *rdev; @@ -4537,6 +4540,7 @@ static int md_run(mddev_t *mddev) sysfs_notify(&mddev->kobj, NULL, "degraded"); return 0; } +EXPORT_SYMBOL_GPL(md_run); static int do_md_run(mddev_t *mddev) { @@ -4646,7 +4650,7 @@ static void md_clean(mddev_t *mddev) mddev->bitmap_info.max_write_behind = 0; } -static void md_stop_writes(mddev_t *mddev) +void md_stop_writes(mddev_t *mddev) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -4666,11 +4670,10 @@ static void md_stop_writes(mddev_t *mddev) md_update_sb(mddev, 1); } } +EXPORT_SYMBOL_GPL(md_stop_writes); -static void md_stop(mddev_t *mddev) +void md_stop(mddev_t *mddev) { - md_stop_writes(mddev); - mddev->pers->stop(mddev); if (mddev->pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; @@ -4678,6 +4681,7 @@ static void md_stop(mddev_t *mddev) mddev->pers = NULL; clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } +EXPORT_SYMBOL_GPL(md_stop); static int md_set_readonly(mddev_t *mddev, int is_open) { @@ -4724,6 +4728,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) if (mddev->ro) set_disk_ro(disk, 0); + md_stop_writes(mddev); md_stop(mddev); mddev->queue->merge_bvec_fn = NULL; mddev->queue->unplug_fn = NULL; diff --git a/drivers/md/md.h b/drivers/md/md.h index cc8030543e82..6e7e3495f6e4 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -487,5 +487,12 @@ extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern void restore_bitmap_write_access(struct file *file); +extern void mddev_init(mddev_t *mddev); +extern int md_run(mddev_t *mddev); +extern void md_stop(mddev_t *mddev); +extern void md_stop_writes(mddev_t *mddev); extern void md_rdev_init(mdk_rdev_t *rdev); + +extern void mddev_suspend(mddev_t *mddev); +extern void mddev_resume(mddev_t *mddev); #endif /* _MD_MD_H */ From 768a418db102bb6aa6064e6090892b5c21ff1f9e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 26 Jul 2010 11:49:55 +1000 Subject: [PATCH 09/24] md: add support for raising dm events. dm uses scheduled work to raise events to user-space. So allow md device to have work_structs and schedule them on an error. 
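A hypothetical sketch of the dm-target side; only mddev->event_work and the behaviour of md_error() (which schedules the work only when a handler is installed) come from this patch, the rest is invented for illustration.

/* Hypothetical dm-target code (not part of this patch): install a
 * handler before starting the array; md_error() will schedule it on a
 * device failure because event_work.func is then non-NULL.
 */
static void example_dm_event_fn(struct work_struct *ws)
{
	mddev_t *mddev = container_of(ws, mddev_t, event_work);

	printk(KERN_INFO "md/%s: device failure, raising event\n",
	       mdname(mddev));
	/* a real target would notify user-space here, e.g. via dm_table_event() */
}

static void example_attach_event_handler(mddev_t *mddev)
{
	INIT_WORK(&mddev->event_work, example_dm_event_fn);
}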
Signed-off-by: NeilBrown --- drivers/md/md.c | 2 ++ drivers/md/md.h | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 012482a900fc..f8775699e15a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6068,6 +6068,8 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); + if (mddev->event_work.func) + schedule_work(&mddev->event_work); md_new_event_inintr(mddev); } diff --git a/drivers/md/md.h b/drivers/md/md.h index 6e7e3495f6e4..c88b04745e85 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -318,6 +318,7 @@ struct mddev_s struct bio *barrier; atomic_t flush_pending; struct work_struct barrier_work; + struct work_struct event_work; /* used by dm to report failure event */ }; From 4a5add49951e698073011855d1a8a7306bc9308d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 1 Jun 2010 19:37:28 +1000 Subject: [PATCH 10/24] raid5: Don't set read-ahead when there is no queue dm-raid456 does not provide a 'queue' for raid5 to use, so we must make raid5 stop depending on the queue. First: read_ahead dm handles read-ahead adjustment fully in userspace, so simply don't do any readahead adjustments if there is no queue. Also re-arrange code slightly so all the accesses to ->queue are together. Finally, move the blk_queue_merge_bvec function into the 'if' as the ->split_io setting in dm-raid456 has the same effect. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6fa60e416a09..9c462f6659c3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5161,16 +5161,6 @@ static int run(mddev_t *mddev) "reshape"); } - /* read-ahead size must cover two whole stripes, which is - * 2 * (datadisks) * chunksize where 'n' is the number of raid devices - */ - { - int data_disks = conf->previous_raid_disks - conf->max_degraded; - int stripe = data_disks * - ((mddev->chunk_sectors << 9) / PAGE_SIZE); - if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - } /* Ok, everything is just fine now */ if (mddev->to_remove == &raid5_attrs_group) @@ -5178,8 +5168,23 @@ static int run(mddev_t *mddev) else if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) printk(KERN_WARNING - "md/raid:%s: failed to create sysfs attributes.\n", + "raid5: failed to create sysfs attributes for %s\n", mdname(mddev)); + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); + + if (mddev->queue) { + /* read-ahead size must cover two whole stripes, which + * is 2 * (datadisks) * chunksize where 'n' is the + * number of raid devices + */ + int data_disks = conf->previous_raid_disks - conf->max_degraded; + int stripe = data_disks * + ((mddev->chunk_sectors << 9) / PAGE_SIZE); + if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) + mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + + blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); + } mddev->queue->queue_lock = &conf->device_lock; @@ -5187,9 +5192,6 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; - md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); - - blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, 
chunk_size); blk_queue_io_opt(mddev->queue, chunk_size * @@ -5618,7 +5620,7 @@ static void end_reshape(raid5_conf_t *conf) /* read-ahead size must cover two whole stripes, which is * 2 * (datadisks) * chunksize where 'n' is the number of raid devices */ - { + if (conf->mddev->queue) { int data_disks = conf->raid_disks - conf->max_degraded; int stripe = data_disks * ((conf->chunk_sectors << 9) / PAGE_SIZE); From 11d8a6e3719519fbc0e2c9d61b6fa931b84bf813 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 26 Jul 2010 11:57:07 +1000 Subject: [PATCH 11/24] md/raid5: export is_congested test the dm module will need this for dm-raid45. Also only access ->queue->backing_dev_info->congested_fn if ->queue actually exists. It won't in a dm target. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 22 +++++++++++++++------- drivers/md/raid5.h | 2 ++ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9c462f6659c3..ad6694f8a3a8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3678,17 +3678,14 @@ static void raid5_unplug_device(struct request_queue *q) unplug_slaves(mddev); } -static int raid5_congested(void *data, int bits) +int md_raid5_congested(mddev_t *mddev, int bits) { - mddev_t *mddev = data; raid5_conf_t *conf = mddev->private; /* No difference between reads and writes. Just check * how busy the stripe_cache is */ - if (mddev_congested(mddev, bits)) - return 1; if (conf->inactive_blocked) return 1; if (conf->quiesce) @@ -3698,6 +3695,15 @@ static int raid5_congested(void *data, int bits) return 0; } +EXPORT_SYMBOL_GPL(md_raid5_congested); + +static int raid5_congested(void *data, int bits) +{ + mddev_t *mddev = data; + + return mddev_congested(mddev, bits) || + md_raid5_congested(mddev, bits); +} /* We want read requests to align with chunks where possible, * but write requests don't need to. @@ -5184,13 +5190,14 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); + + mddev->queue->backing_dev_info.congested_data = mddev; + mddev->queue->backing_dev_info.congested_fn = raid5_congested; } mddev->queue->queue_lock = &conf->device_lock; mddev->queue->unplug_fn = raid5_unplug_device; - mddev->queue->backing_dev_info.congested_data = mddev; - mddev->queue->backing_dev_info.congested_fn = raid5_congested; chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); @@ -5220,7 +5227,8 @@ static int stop(mddev_t *mddev) md_unregister_thread(mddev->thread); mddev->thread = NULL; - mddev->queue->backing_dev_info.congested_fn = NULL; + if (mddev->queue) + mddev->queue->backing_dev_info.congested_fn = NULL; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ free_conf(conf); mddev->private = NULL; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 61b6b25dc5e7..d6470dec667a 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -497,5 +497,7 @@ static inline int algorithm_is_DDF(int layout) { return layout >= 8 && layout <= 10; } + +extern int md_raid5_congested(mddev_t *mddev, int bits); extern int raid5_set_cache_size(mddev_t *mddev, int size); #endif From 2ac8740151b082f045e58010eb92560c3a23a0e9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 1 Jun 2010 19:37:29 +1000 Subject: [PATCH 12/24] md/raid5: add simple plugging infrastructure. md/raid5 uses the plugging infrastructure provided by the block layer and 'struct request_queue'. 
However when we plug raid5 under dm there is no request queue so we cannot use that. So create a similar infrastructure that is much lighter weight and use it for raid5. Signed-off-by: NeilBrown --- drivers/md/md.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ drivers/md/md.h | 20 ++++++++++++++++++++ drivers/md/raid5.c | 39 +++++++++++++++++++++++++-------------- drivers/md/raid5.h | 3 +++ 4 files changed, 93 insertions(+), 14 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index f8775699e15a..eec75f130708 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -386,6 +386,51 @@ void md_barrier_request(mddev_t *mddev, struct bio *bio) } EXPORT_SYMBOL(md_barrier_request); +/* Support for plugging. + * This mirrors the plugging support in request_queue, but does not + * require having a whole queue + */ +static void plugger_work(struct work_struct *work) +{ + struct plug_handle *plug = + container_of(work, struct plug_handle, unplug_work); + plug->unplug_fn(plug); +} +static void plugger_timeout(unsigned long data) +{ + struct plug_handle *plug = (void *)data; + kblockd_schedule_work(NULL, &plug->unplug_work); +} +void plugger_init(struct plug_handle *plug, + void (*unplug_fn)(struct plug_handle *)) +{ + plug->unplug_flag = 0; + plug->unplug_fn = unplug_fn; + init_timer(&plug->unplug_timer); + plug->unplug_timer.function = plugger_timeout; + plug->unplug_timer.data = (unsigned long)plug; + INIT_WORK(&plug->unplug_work, plugger_work); +} +EXPORT_SYMBOL_GPL(plugger_init); + +void plugger_set_plug(struct plug_handle *plug) +{ + if (!test_and_set_bit(PLUGGED_FLAG, &plug->unplug_flag)) + mod_timer(&plug->unplug_timer, jiffies + msecs_to_jiffies(3)+1); +} +EXPORT_SYMBOL_GPL(plugger_set_plug); + +int plugger_remove_plug(struct plug_handle *plug) +{ + if (test_and_clear_bit(PLUGGED_FLAG, &plug->unplug_flag)) { + del_timer(&plug->unplug_timer); + return 1; + } else + return 0; +} +EXPORT_SYMBOL_GPL(plugger_remove_plug); + + static inline mddev_t *mddev_get(mddev_t *mddev) { atomic_inc(&mddev->active); diff --git a/drivers/md/md.h b/drivers/md/md.h index c88b04745e85..5be0d6921b9d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -29,6 +29,26 @@ typedef struct mddev_s mddev_t; typedef struct mdk_rdev_s mdk_rdev_t; +/* generic plugging support - like that provided with request_queue, + * but does not require a request_queue + */ +struct plug_handle { + void (*unplug_fn)(struct plug_handle *); + struct timer_list unplug_timer; + struct work_struct unplug_work; + unsigned long unplug_flag; +}; +#define PLUGGED_FLAG 1 +void plugger_init(struct plug_handle *plug, + void (*unplug_fn)(struct plug_handle *)); +void plugger_set_plug(struct plug_handle *plug); +int plugger_remove_plug(struct plug_handle *plug); +static inline void plugger_flush(struct plug_handle *plug) +{ + del_timer_sync(&plug->unplug_timer); + cancel_work_sync(&plug->unplug_work); +} + /* * MD's 'extended' device */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ad6694f8a3a8..84bb9aec2211 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -201,11 +201,11 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state)) { list_add_tail(&sh->lru, &conf->delayed_list); - blk_plug_device(conf->mddev->queue); + plugger_set_plug(&conf->plug); } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && sh->bm_seq - conf->seq_write > 0) { list_add_tail(&sh->lru, &conf->bitmap_list); - blk_plug_device(conf->mddev->queue); + 
plugger_set_plug(&conf->plug); } else { clear_bit(STRIPE_BIT_DELAY, &sh->state); list_add_tail(&sh->lru, &conf->handle_list); @@ -434,7 +434,7 @@ static int has_failed(raid5_conf_t *conf) } static void unplug_slaves(mddev_t *mddev); -static void raid5_unplug_device(struct request_queue *q); +static void raid5_unplug_device(raid5_conf_t *conf); static struct stripe_head * get_active_stripe(raid5_conf_t *conf, sector_t sector, @@ -464,7 +464,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, < (conf->max_nr_stripes *3/4) || !conf->inactive_blocked), conf->device_lock, - raid5_unplug_device(conf->mddev->queue) + raid5_unplug_device(conf) ); conf->inactive_blocked = 0; } else @@ -3618,7 +3618,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf) list_add_tail(&sh->lru, &conf->hold_list); } } else - blk_plug_device(conf->mddev->queue); + plugger_set_plug(&conf->plug); } static void activate_bit_delay(raid5_conf_t *conf) @@ -3659,23 +3659,33 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_unlock(); } -static void raid5_unplug_device(struct request_queue *q) +static void raid5_unplug_device(raid5_conf_t *conf) { - mddev_t *mddev = q->queuedata; - raid5_conf_t *conf = mddev->private; unsigned long flags; spin_lock_irqsave(&conf->device_lock, flags); - if (blk_remove_plug(q)) { + if (plugger_remove_plug(&conf->plug)) { conf->seq_flush++; raid5_activate_delayed(conf); } - md_wakeup_thread(mddev->thread); + md_wakeup_thread(conf->mddev->thread); spin_unlock_irqrestore(&conf->device_lock, flags); - unplug_slaves(mddev); + unplug_slaves(conf->mddev); +} + +static void raid5_unplug(struct plug_handle *plug) +{ + raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug); + raid5_unplug_device(conf); +} + +static void raid5_unplug_queue(struct request_queue *q) +{ + mddev_t *mddev = q->queuedata; + raid5_unplug_device(mddev->private); } int md_raid5_congested(mddev_t *mddev, int bits) @@ -4085,7 +4095,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) * add failed due to overlap. Flush everything * and wait a while */ - raid5_unplug_device(mddev->queue); + raid5_unplug_device(conf); release_stripe(sh); schedule(); goto retry; @@ -5178,6 +5188,7 @@ static int run(mddev_t *mddev) mdname(mddev)); md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); + plugger_init(&conf->plug, raid5_unplug); if (mddev->queue) { /* read-ahead size must cover two whole stripes, which * is 2 * (datadisks) * chunksize where 'n' is the @@ -5197,7 +5208,7 @@ static int run(mddev_t *mddev) mddev->queue->queue_lock = &conf->device_lock; - mddev->queue->unplug_fn = raid5_unplug_device; + mddev->queue->unplug_fn = raid5_unplug_queue; chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); @@ -5229,7 +5240,7 @@ static int stop(mddev_t *mddev) mddev->thread = NULL; if (mddev->queue) mddev->queue->backing_dev_info.congested_fn = NULL; - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ + plugger_flush(&conf->plug); /* the unplug fn references 'conf'*/ free_conf(conf); mddev->private = NULL; mddev->to_remove = &raid5_attrs_group; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index d6470dec667a..6acd458f239d 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -398,6 +398,9 @@ struct raid5_private_data { * (fresh device added). * Cleared when a sync completes. 
*/ + + struct plug_handle plug; + /* per cpu variables */ struct raid5_percpu { struct page *spare_page; /* Used when checking P/Q in raid6 */ From 252ac5221a71be72b7e7c7b7482af91e9c962e8c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 1 Jun 2010 19:37:29 +1000 Subject: [PATCH 13/24] md/plug: optionally use plugger to unplug an array during resync/recovery. If an array doesn't have a 'queue' then md_do_sync cannot unplug it. In that case it will have a 'plugger', so make that available to the mddev, and use it to unplug the array if needed. Signed-off-by: NeilBrown --- drivers/md/md.c | 15 ++++++++++++--- drivers/md/md.h | 2 ++ drivers/md/raid5.c | 1 + 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index eec75f130708..03c64e9735fb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4693,6 +4693,7 @@ static void md_clean(mddev_t *mddev) mddev->bitmap_info.chunksize = 0; mddev->bitmap_info.daemon_sleep = 0; mddev->bitmap_info.max_write_behind = 0; + mddev->plug = NULL; } void md_stop_writes(mddev_t *mddev) @@ -6626,6 +6627,14 @@ int md_allow_write(mddev_t *mddev) } EXPORT_SYMBOL_GPL(md_allow_write); +static void md_unplug(mddev_t *mddev) +{ + if (mddev->queue) + blk_unplug(mddev->queue); + if (mddev->plug) + mddev->plug->unplug_fn(mddev->plug); +} + #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) void md_do_sync(mddev_t *mddev) @@ -6804,7 +6813,7 @@ void md_do_sync(mddev_t *mddev) >= mddev->resync_max - mddev->curr_resync_completed )) { /* time to update curr_resync_completed */ - blk_unplug(mddev->queue); + md_unplug(mddev); wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = @@ -6882,7 +6891,7 @@ void md_do_sync(mddev_t *mddev) * about not overloading the IO subsystem. (things like an * e2fsck being done on the RAID array should execute fast) */ - blk_unplug(mddev->queue); + md_unplug(mddev); cond_resched(); currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 @@ -6901,7 +6910,7 @@ void md_do_sync(mddev_t *mddev) * this also signals 'finished resyncing' to md_stop */ out: - blk_unplug(mddev->queue); + md_unplug(mddev); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); diff --git a/drivers/md/md.h b/drivers/md/md.h index 5be0d6921b9d..57eb864a8249 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -328,6 +328,8 @@ struct mddev_s struct list_head all_mddevs; struct attribute_group *to_remove; + struct plug_handle *plug; /* if used by personality */ + /* Generic barrier handling. * If there is a pending barrier request, all other * writes are blocked while the devices are flushed. diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 84bb9aec2211..0a8173e650bb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5189,6 +5189,7 @@ static int run(mddev_t *mddev) md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); plugger_init(&conf->plug, raid5_unplug); + mddev->plug = &conf->plug; if (mddev->queue) { /* read-ahead size must cover two whole stripes, which * is 2 * (datadisks) * chunksize where 'n' is the From 9f7c2220017771253d7d10b3cc017cb79eeac0fb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 26 Jul 2010 12:04:13 +1000 Subject: [PATCH 14/24] md/raid5: export raid5 unplugging interface. Also remove remaining accesses to ->queue and ->gendisk when ->queue is NULL (As it is in a DM target). 
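A hypothetical caller on the dm side; only md_raid5_unplug_device() is exported by this patch, the wrapper below is invented for illustration.

/* Hypothetical dm-side helper (illustration only): with no request_queue
 * there is nothing for blk_unplug() to act on, so the target kicks the
 * raid5 conf directly through the exported entry point.
 */
static void example_dm_unplug_array(mddev_t *mddev)
{
	raid5_conf_t *conf = mddev->private;

	md_raid5_unplug_device(conf);	/* flush delayed and bitmap-delayed stripes */
}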
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 39 +++++++++++++++++++-------------------- drivers/md/raid5.h | 1 + 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0a8173e650bb..e30a809cbea0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -434,7 +434,6 @@ static int has_failed(raid5_conf_t *conf) } static void unplug_slaves(mddev_t *mddev); -static void raid5_unplug_device(raid5_conf_t *conf); static struct stripe_head * get_active_stripe(raid5_conf_t *conf, sector_t sector, @@ -464,7 +463,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, < (conf->max_nr_stripes *3/4) || !conf->inactive_blocked), conf->device_lock, - raid5_unplug_device(conf) + md_raid5_unplug_device(conf) ); conf->inactive_blocked = 0; } else @@ -3659,7 +3658,7 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_unlock(); } -static void raid5_unplug_device(raid5_conf_t *conf) +void md_raid5_unplug_device(raid5_conf_t *conf) { unsigned long flags; @@ -3675,17 +3674,18 @@ static void raid5_unplug_device(raid5_conf_t *conf) unplug_slaves(conf->mddev); } +EXPORT_SYMBOL_GPL(md_raid5_unplug_device); static void raid5_unplug(struct plug_handle *plug) { raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug); - raid5_unplug_device(conf); + md_raid5_unplug_device(conf); } static void raid5_unplug_queue(struct request_queue *q) { mddev_t *mddev = q->queuedata; - raid5_unplug_device(mddev->private); + md_raid5_unplug_device(mddev->private); } int md_raid5_congested(mddev_t *mddev, int bits) @@ -4095,7 +4095,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) * add failed due to overlap. Flush everything * and wait a while */ - raid5_unplug_device(conf); + md_raid5_unplug_device(conf); release_stripe(sh); schedule(); goto retry; @@ -4991,7 +4991,7 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded static int run(mddev_t *mddev) { raid5_conf_t *conf; - int working_disks = 0, chunk_size; + int working_disks = 0; int dirty_parity_disks = 0; mdk_rdev_t *rdev; sector_t reshape_offset = 0; @@ -5191,6 +5191,7 @@ static int run(mddev_t *mddev) plugger_init(&conf->plug, raid5_unplug); mddev->plug = &conf->plug; if (mddev->queue) { + int chunk_size; /* read-ahead size must cover two whole stripes, which * is 2 * (datadisks) * chunksize where 'n' is the * number of raid devices @@ -5205,21 +5206,19 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; + mddev->queue->queue_lock = &conf->device_lock; + mddev->queue->unplug_fn = raid5_unplug_queue; + + chunk_size = mddev->chunk_sectors << 9; + blk_queue_io_min(mddev->queue, chunk_size); + blk_queue_io_opt(mddev->queue, chunk_size * + (conf->raid_disks - conf->max_degraded)); + + list_for_each_entry(rdev, &mddev->disks, same_set) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); } - mddev->queue->queue_lock = &conf->device_lock; - - mddev->queue->unplug_fn = raid5_unplug_queue; - - chunk_size = mddev->chunk_sectors << 9; - blk_queue_io_min(mddev->queue, chunk_size); - blk_queue_io_opt(mddev->queue, chunk_size * - (conf->raid_disks - conf->max_degraded)); - - list_for_each_entry(rdev, &mddev->disks, same_set) - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - return 0; abort: md_unregister_thread(mddev->thread); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 6acd458f239d..36eaed5dfd6e 100644 --- 
a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -502,5 +502,6 @@ static inline int algorithm_is_DDF(int layout) } extern int md_raid5_congested(mddev_t *mddev, int bits); +extern void md_raid5_unplug_device(raid5_conf_t *conf); extern int raid5_set_cache_size(mddev_t *mddev, int size); #endif From ac2f40be46ce6ab3bec4c8c297d6923f941741ce Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 1 Jun 2010 19:37:31 +1000 Subject: [PATCH 15/24] md/bitmap: white space clean up and similar. Fixes some whitespace problems Fixed some checkpatch.pl complaints. Replaced kmalloc ... memset(0), with kzalloc Fixed an unlikely memory leak on an error path. Reformatted a number of 'if/else' sets, sometimes replacing goto with an else clause. Removed some old comments and commented-out code. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 303 ++++++++++++++++++++------------------------ drivers/md/md.h | 2 +- 2 files changed, 137 insertions(+), 168 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 4518994712c7..67fb32d1124d 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -13,7 +13,6 @@ * Still to do: * * flush after percent set rather than just time based. (maybe both). - * wait if count gets too high, wake when it drops to half. */ #include @@ -51,9 +50,6 @@ #define INJECT_FATAL_FAULT_3 0 /* undef */ #endif -//#define DPRINTK PRINTK /* set this NULL to avoid verbose debug output */ -#define DPRINTK(x...) do { } while(0) - #ifndef PRINTK # if DEBUG > 0 # define PRINTK(x...) printk(KERN_DEBUG x) @@ -62,12 +58,11 @@ # endif #endif -static inline char * bmname(struct bitmap *bitmap) +static inline char *bmname(struct bitmap *bitmap) { return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; } - /* * just a placeholder - calls kmalloc for bitmap pages */ @@ -78,7 +73,7 @@ static unsigned char *bitmap_alloc_page(struct bitmap *bitmap) #ifdef INJECT_FAULTS_1 page = NULL; #else - page = kmalloc(PAGE_SIZE, GFP_NOIO); + page = kzalloc(PAGE_SIZE, GFP_NOIO); #endif if (!page) printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap)); @@ -107,7 +102,8 @@ static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page) * if we find our page, we increment the page's refcount so that it stays * allocated while we're using it */ -static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create) +static int bitmap_checkpage(struct bitmap *bitmap, + unsigned long page, int create) __releases(bitmap->lock) __acquires(bitmap->lock) { @@ -121,7 +117,6 @@ __acquires(bitmap->lock) return -EINVAL; } - if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ return 0; @@ -131,43 +126,34 @@ __acquires(bitmap->lock) if (!create) return -ENOENT; - spin_unlock_irq(&bitmap->lock); - /* this page has not been allocated yet */ - if ((mappage = bitmap_alloc_page(bitmap)) == NULL) { + spin_unlock_irq(&bitmap->lock); + mappage = bitmap_alloc_page(bitmap); + spin_lock_irq(&bitmap->lock); + + if (mappage == NULL) { PRINTK("%s: bitmap map page allocation failed, hijacking\n", bmname(bitmap)); /* failed - set the hijacked flag so that we can use the * pointer as a counter */ - spin_lock_irq(&bitmap->lock); if (!bitmap->bp[page].map) bitmap->bp[page].hijacked = 1; - goto out; - } - - /* got a page */ - - spin_lock_irq(&bitmap->lock); - - /* recheck the page */ - - if (bitmap->bp[page].map || bitmap->bp[page].hijacked) { + } else if (bitmap->bp[page].map || + bitmap->bp[page].hijacked) { /* somebody beat us to getting the page */ bitmap_free_page(bitmap, mappage); return 0; 
+ } else { + + /* no page was in place and we have one, so install it */ + + bitmap->bp[page].map = mappage; + bitmap->missing_pages--; } - - /* no page was in place and we have one, so install it */ - - memset(mappage, 0, PAGE_SIZE); - bitmap->bp[page].map = mappage; - bitmap->missing_pages--; -out: return 0; } - /* if page is completely empty, put it back on the free list, or dealloc it */ /* if page was hijacked, unmark the flag so it might get alloced next time */ /* Note: lock should be held when calling this */ @@ -183,26 +169,15 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */ bitmap->bp[page].hijacked = 0; bitmap->bp[page].map = NULL; - return; + } else { + /* normal case, free the page */ + ptr = bitmap->bp[page].map; + bitmap->bp[page].map = NULL; + bitmap->missing_pages++; + bitmap_free_page(bitmap, ptr); } - - /* normal case, free the page */ - -#if 0 -/* actually ... let's not. We will probably need the page again exactly when - * memory is tight and we are flusing to disk - */ - return; -#else - ptr = bitmap->bp[page].map; - bitmap->bp[page].map = NULL; - bitmap->missing_pages++; - bitmap_free_page(bitmap, ptr); - return; -#endif } - /* * bitmap file handling - read and write the bitmap file and its superblock */ @@ -220,11 +195,14 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset, mdk_rdev_t *rdev; sector_t target; + int did_alloc = 0; - if (!page) + if (!page) { page = alloc_page(GFP_KERNEL); - if (!page) - return ERR_PTR(-ENOMEM); + if (!page) + return ERR_PTR(-ENOMEM); + did_alloc = 1; + } list_for_each_entry(rdev, &mddev->disks, same_set) { if (! test_bit(In_sync, &rdev->flags) @@ -242,6 +220,8 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset, return page; } } + if (did_alloc) + put_page(page); return ERR_PTR(-EIO); } @@ -286,49 +266,51 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) mddev_t *mddev = bitmap->mddev; while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { - int size = PAGE_SIZE; - loff_t offset = mddev->bitmap_info.offset; - if (page->index == bitmap->file_pages-1) - size = roundup(bitmap->last_page_size, - bdev_logical_block_size(rdev->bdev)); - /* Just make sure we aren't corrupting data or - * metadata - */ - if (mddev->external) { - /* Bitmap could be anywhere. 
*/ - if (rdev->sb_start + offset + (page->index *(PAGE_SIZE/512)) > - rdev->data_offset && - rdev->sb_start + offset < - rdev->data_offset + mddev->dev_sectors + - (PAGE_SIZE/512)) - goto bad_alignment; - } else if (offset < 0) { - /* DATA BITMAP METADATA */ - if (offset - + (long)(page->index * (PAGE_SIZE/512)) - + size/512 > 0) - /* bitmap runs in to metadata */ - goto bad_alignment; - if (rdev->data_offset + mddev->dev_sectors - > rdev->sb_start + offset) - /* data runs in to bitmap */ - goto bad_alignment; - } else if (rdev->sb_start < rdev->data_offset) { - /* METADATA BITMAP DATA */ - if (rdev->sb_start - + offset - + page->index*(PAGE_SIZE/512) + size/512 - > rdev->data_offset) - /* bitmap runs in to data */ - goto bad_alignment; - } else { - /* DATA METADATA BITMAP - no problems */ - } - md_super_write(mddev, rdev, - rdev->sb_start + offset - + page->index * (PAGE_SIZE/512), - size, - page); + int size = PAGE_SIZE; + loff_t offset = mddev->bitmap_info.offset; + if (page->index == bitmap->file_pages-1) + size = roundup(bitmap->last_page_size, + bdev_logical_block_size(rdev->bdev)); + /* Just make sure we aren't corrupting data or + * metadata + */ + if (mddev->external) { + /* Bitmap could be anywhere. */ + if (rdev->sb_start + offset + (page->index + * (PAGE_SIZE/512)) + > rdev->data_offset + && + rdev->sb_start + offset + < (rdev->data_offset + mddev->dev_sectors + + (PAGE_SIZE/512))) + goto bad_alignment; + } else if (offset < 0) { + /* DATA BITMAP METADATA */ + if (offset + + (long)(page->index * (PAGE_SIZE/512)) + + size/512 > 0) + /* bitmap runs in to metadata */ + goto bad_alignment; + if (rdev->data_offset + mddev->dev_sectors + > rdev->sb_start + offset) + /* data runs in to bitmap */ + goto bad_alignment; + } else if (rdev->sb_start < rdev->data_offset) { + /* METADATA BITMAP DATA */ + if (rdev->sb_start + + offset + + page->index*(PAGE_SIZE/512) + size/512 + > rdev->data_offset) + /* bitmap runs in to data */ + goto bad_alignment; + } else { + /* DATA METADATA BITMAP - no problems */ + } + md_super_write(mddev, rdev, + rdev->sb_start + offset + + page->index * (PAGE_SIZE/512), + size, + page); } if (wait) @@ -364,10 +346,9 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) bh = bh->b_this_page; } - if (wait) { + if (wait) wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes)==0); - } } if (bitmap->flags & BITMAP_WRITE_ERROR) bitmap_file_kick(bitmap); @@ -424,7 +405,7 @@ static struct page *read_page(struct file *file, unsigned long index, struct buffer_head *bh; sector_t block; - PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_SIZE, + PRINTK("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT); page = alloc_page(GFP_KERNEL); @@ -478,7 +459,7 @@ static struct page *read_page(struct file *file, unsigned long index, } out: if (IS_ERR(page)) - printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n", + printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %ld\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT, PTR_ERR(page)); @@ -664,11 +645,14 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, sb = kmap_atomic(bitmap->sb_page, KM_USER0); old = le32_to_cpu(sb->state) & bits; switch (op) { - case MASK_SET: sb->state |= cpu_to_le32(bits); - break; - case MASK_UNSET: sb->state &= cpu_to_le32(~bits); - break; - default: BUG(); + case MASK_SET: + sb->state |= cpu_to_le32(bits); + break; + case MASK_UNSET: + sb->state &= cpu_to_le32(~bits); + break; + 
default: + BUG(); } kunmap_atomic(sb, KM_USER0); return old; @@ -710,12 +694,12 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon static inline struct page *filemap_get_page(struct bitmap *bitmap, unsigned long chunk) { - if (file_page_index(bitmap, chunk) >= bitmap->file_pages) return NULL; + if (file_page_index(bitmap, chunk) >= bitmap->file_pages) + return NULL; return bitmap->filemap[file_page_index(bitmap, chunk) - file_page_index(bitmap, 0)]; } - static void bitmap_file_unmap(struct bitmap *bitmap) { struct page **map, *sb_page; @@ -766,7 +750,6 @@ static void bitmap_file_put(struct bitmap *bitmap) } } - /* * bitmap_file_kick - if an error occurs while manipulating the bitmap file * then it is no longer reliable, so we stop using it and we mark the file @@ -785,7 +768,6 @@ static void bitmap_file_kick(struct bitmap *bitmap) ptr = d_path(&bitmap->file->f_path, path, PAGE_SIZE); - printk(KERN_ALERT "%s: kicking failed bitmap file %s from array!\n", bmname(bitmap), IS_ERR(ptr) ? "" : ptr); @@ -803,9 +785,9 @@ static void bitmap_file_kick(struct bitmap *bitmap) } enum bitmap_page_attr { - BITMAP_PAGE_DIRTY = 0, // there are set bits that need to be synced - BITMAP_PAGE_CLEAN = 1, // there are bits that might need to be cleared - BITMAP_PAGE_NEEDWRITE=2, // there are cleared bits that need to be synced + BITMAP_PAGE_DIRTY = 0, /* there are set bits that need to be synced */ + BITMAP_PAGE_CLEAN = 1, /* there are bits that might need to be cleared */ + BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ }; static inline void set_page_attr(struct bitmap *bitmap, struct page *page, @@ -840,15 +822,15 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) void *kaddr; unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); - if (!bitmap->filemap) { + if (!bitmap->filemap) return; - } page = filemap_get_page(bitmap, chunk); - if (!page) return; + if (!page) + return; bit = file_page_offset(bitmap, chunk); - /* set the bit */ + /* set the bit */ kaddr = kmap_atomic(page, KM_USER0); if (bitmap->flags & BITMAP_HOSTENDIAN) set_bit(bit, kaddr); @@ -859,7 +841,6 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) /* record page number so it gets flushed to disk when unplug occurs */ set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); - } /* this gets called when the md device is ready to unplug its underlying @@ -892,7 +873,7 @@ void bitmap_unplug(struct bitmap *bitmap) wait = 1; spin_unlock_irqrestore(&bitmap->lock, flags); - if (dirty | need_write) + if (dirty || need_write) write_page(bitmap, page, 0); } if (wait) { /* if any writes were performed, we need to wait on them */ @@ -905,6 +886,7 @@ void bitmap_unplug(struct bitmap *bitmap) if (bitmap->flags & BITMAP_WRITE_ERROR) bitmap_file_kick(bitmap); } +EXPORT_SYMBOL(bitmap_unplug); static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); /* * bitmap_init_from_disk -- called at bitmap_create time to initialize @@ -947,7 +929,6 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) if (!bitmap->mddev->bitmap_info.external) bytes += sizeof(bitmap_super_t); - num_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE; if (file && i_size_read(file->f_mapping->host) < bytes) { @@ -966,7 +947,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) /* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */ bitmap->filemap_attr = kzalloc( - roundup( DIV_ROUND_UP(num_pages*4, 
8), sizeof(unsigned long)), + roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), GFP_KERNEL); if (!bitmap->filemap_attr) goto err; @@ -1021,7 +1002,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) if (outofdate) { /* * if bitmap is out of date, dirty the - * whole page and write it out + * whole page and write it out */ paddr = kmap_atomic(page, KM_USER0); memset(paddr + offset, 0xff, @@ -1052,7 +1033,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) } } - /* everything went OK */ + /* everything went OK */ ret = 0; bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET); @@ -1080,21 +1061,16 @@ void bitmap_write_all(struct bitmap *bitmap) */ int i; - for (i=0; i < bitmap->file_pages; i++) + for (i = 0; i < bitmap->file_pages; i++) set_page_attr(bitmap, bitmap->filemap[i], BITMAP_PAGE_NEEDWRITE); } - static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) { sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); unsigned long page = chunk >> PAGE_COUNTER_SHIFT; bitmap->bp[page].count += inc; -/* - if (page == 0) printk("count page 0, offset %llu: %d gives %d\n", - (unsigned long long)offset, inc, bitmap->bp[page].count); -*/ bitmap_checkfree(bitmap, page); } static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, @@ -1197,14 +1173,11 @@ void bitmap_daemon_work(mddev_t *mddev) (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), &blocks, 0); if (bmc) { -/* - if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc); -*/ if (*bmc) bitmap->allclean = 0; if (*bmc == 2) { - *bmc=1; /* maybe clear the bit next time */ + *bmc = 1; /* maybe clear the bit next time */ set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); } else if (*bmc == 1 && !bitmap->need_sync) { /* we can clear the bit */ @@ -1243,7 +1216,7 @@ void bitmap_daemon_work(mddev_t *mddev) done: if (bitmap->allclean == 0) - bitmap->mddev->thread->timeout = + bitmap->mddev->thread->timeout = bitmap->mddev->bitmap_info.daemon_sleep; mutex_unlock(&mddev->bitmap_info.mutex); } @@ -1265,7 +1238,7 @@ __acquires(bitmap->lock) if (bitmap_checkpage(bitmap, page, create) < 0) { csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); - *blocks = csize - (offset & (csize- 1)); + *blocks = csize - (offset & (csize - 1)); return NULL; } /* now locked ... 
*/ @@ -1276,12 +1249,12 @@ __acquires(bitmap->lock) int hi = (pageoff > PAGE_COUNTER_MASK); csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1); - *blocks = csize - (offset & (csize- 1)); + *blocks = csize - (offset & (csize - 1)); return &((bitmap_counter_t *) &bitmap->bp[page].map)[hi]; } else { /* page is allocated */ csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); - *blocks = csize - (offset & (csize- 1)); + *blocks = csize - (offset & (csize - 1)); return (bitmap_counter_t *) &(bitmap->bp[page].map[pageoff]); } @@ -1289,7 +1262,8 @@ __acquires(bitmap->lock) int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) { - if (!bitmap) return 0; + if (!bitmap) + return 0; if (behind) { int bw; @@ -1328,10 +1302,10 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect continue; } - switch(*bmc) { + switch (*bmc) { case 0: bitmap_file_set_bit(bitmap, offset); - bitmap_count_page(bitmap,offset, 1); + bitmap_count_page(bitmap, offset, 1); blk_plug_device_unlocked(bitmap->mddev->queue); /* fall through */ case 1: @@ -1345,16 +1319,19 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect offset += blocks; if (sectors > blocks) sectors -= blocks; - else sectors = 0; + else + sectors = 0; } bitmap->allclean = 0; return 0; } +EXPORT_SYMBOL(bitmap_startwrite); void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int success, int behind) { - if (!bitmap) return; + if (!bitmap) + return; if (behind) { if (atomic_dec_and_test(&bitmap->behind_writes)) wake_up(&bitmap->behind_wait); @@ -1391,18 +1368,20 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto wake_up(&bitmap->overflow_wait); (*bmc)--; - if (*bmc <= 2) { + if (*bmc <= 2) set_page_attr(bitmap, filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), BITMAP_PAGE_CLEAN); - } + spin_unlock_irqrestore(&bitmap->lock, flags); offset += blocks; if (sectors > blocks) sectors -= blocks; - else sectors = 0; + else + sectors = 0; } } +EXPORT_SYMBOL(bitmap_endwrite); static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded) @@ -1455,14 +1434,14 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, } return rv; } +EXPORT_SYMBOL(bitmap_start_sync); void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted) { bitmap_counter_t *bmc; unsigned long flags; -/* - if (offset == 0) printk("bitmap_end_sync 0 (%d)\n", aborted); -*/ if (bitmap == NULL) { + + if (bitmap == NULL) { *blocks = 1024; return; } @@ -1471,26 +1450,23 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int ab if (bmc == NULL) goto unlock; /* locked */ -/* - if (offset == 0) printk("bitmap_end sync found 0x%x, blocks %d\n", *bmc, *blocks); -*/ if (RESYNC(*bmc)) { *bmc &= ~RESYNC_MASK; if (!NEEDED(*bmc) && aborted) *bmc |= NEEDED_MASK; else { - if (*bmc <= 2) { + if (*bmc <= 2) set_page_attr(bitmap, filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), BITMAP_PAGE_CLEAN); - } } } unlock: spin_unlock_irqrestore(&bitmap->lock, flags); bitmap->allclean = 0; } +EXPORT_SYMBOL(bitmap_end_sync); void bitmap_close_sync(struct bitmap *bitmap) { @@ -1507,6 +1483,7 @@ void bitmap_close_sync(struct bitmap *bitmap) sector += blocks; } } +EXPORT_SYMBOL(bitmap_close_sync); void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) { @@ -1537,6 +1514,7 @@ void 
bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) bitmap->last_end_sync = jiffies; sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed"); } +EXPORT_SYMBOL(bitmap_cond_end_sync); static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) { @@ -1553,9 +1531,9 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n spin_unlock_irq(&bitmap->lock); return; } - if (! *bmc) { + if (!*bmc) { struct page *page; - *bmc = 1 | (needed?NEEDED_MASK:0); + *bmc = 1 | (needed ? NEEDED_MASK : 0); bitmap_count_page(bitmap, offset, 1); page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); @@ -1720,9 +1698,9 @@ int bitmap_create(mddev_t *mddev) bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize); /* now that chunksize and chunkshift are set, we can use these macros */ - chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> + chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> CHUNK_BLOCK_SHIFT(bitmap); - pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; + pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; BUG_ON(!pages); @@ -1775,11 +1753,11 @@ static ssize_t location_show(mddev_t *mddev, char *page) { ssize_t len; - if (mddev->bitmap_info.file) { + if (mddev->bitmap_info.file) len = sprintf(page, "file"); - } else if (mddev->bitmap_info.offset) { + else if (mddev->bitmap_info.offset) len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset); - } else + else len = sprintf(page, "none"); len += sprintf(page+len, "\n"); return len; @@ -1868,7 +1846,7 @@ timeout_show(mddev_t *mddev, char *page) ssize_t len; unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ; unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ; - + len = sprintf(page, "%lu", secs); if (jifs) len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs)); @@ -2050,12 +2028,3 @@ struct attribute_group md_bitmap_group = { .attrs = md_bitmap_attrs, }; - -/* the bitmap API -- for raid personalities */ -EXPORT_SYMBOL(bitmap_startwrite); -EXPORT_SYMBOL(bitmap_endwrite); -EXPORT_SYMBOL(bitmap_start_sync); -EXPORT_SYMBOL(bitmap_end_sync); -EXPORT_SYMBOL(bitmap_unplug); -EXPORT_SYMBOL(bitmap_close_sync); -EXPORT_SYMBOL(bitmap_cond_end_sync); diff --git a/drivers/md/md.h b/drivers/md/md.h index 57eb864a8249..209993207a55 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -319,7 +319,7 @@ struct mddev_s */ struct mutex mutex; unsigned long chunksize; - unsigned long daemon_sleep; /* how many seconds between updates? */ + unsigned long daemon_sleep; /* how many jiffies between updates? */ unsigned long max_write_behind; /* write-behind mode */ int external; } bitmap_info; From 5ff5afffe6527543866a47ffab12769427283917 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 1 Jun 2010 19:37:32 +1000 Subject: [PATCH 16/24] md/bitmap: reduce dependence on sysfs. For dm-raid45 we will want to use bitmaps in dm-targets which don't have entries in sysfs, so cope with the mddev not living in sysfs. 
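The convention being relied on here, sketched as a tiny userspace model (invented names, not the kernel sysfs API): keep a possibly-NULL handle for the sysfs entry and make notification a no-op when it is absent, so callers never need to know whether the mddev is represented in sysfs.

/* Illustration only; the kernel analogue is sysfs_notify_dirent_safe(). */
#include <stdio.h>
#include <stddef.h>

struct dirent_handle { const char *name; };

static void notify_dirent_safe(struct dirent_handle *h)
{
	if (!h)
		return;		/* object not represented in sysfs */
	printf("notify %s\n", h->name);
}

int main(void)
{
	struct dirent_handle can_clear = { "can_clear" };
	struct dirent_handle *with_sysfs = &can_clear;	/* ordinary md array */
	struct dirent_handle *no_sysfs = NULL;		/* dm-driven bitmap  */

	notify_dirent_safe(with_sysfs);
	notify_dirent_safe(no_sysfs);	/* silently skipped */
	return 0;
}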
Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 67fb32d1124d..8af4d655b2d3 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1358,7 +1358,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto bitmap->events_cleared < bitmap->mddev->events) { bitmap->events_cleared = bitmap->mddev->events; bitmap->need_sync = 1; - sysfs_notify_dirent(bitmap->sysfs_can_clear); + sysfs_notify_dirent_safe(bitmap->sysfs_can_clear); } if (!success && ! (*bmc & NEEDED_MASK)) @@ -1643,7 +1643,7 @@ int bitmap_create(mddev_t *mddev) struct file *file = mddev->bitmap_info.file; int err; sector_t start; - struct sysfs_dirent *bm; + struct sysfs_dirent *bm = NULL; BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); @@ -1664,7 +1664,8 @@ int bitmap_create(mddev_t *mddev) bitmap->mddev = mddev; - bm = sysfs_get_dirent(mddev->kobj.sd, NULL, "bitmap"); + if (mddev->kobj.sd) + bm = sysfs_get_dirent(mddev->kobj.sd, NULL, "bitmap"); if (bm) { bitmap->sysfs_can_clear = sysfs_get_dirent(bm, NULL, "can_clear"); sysfs_put(bm); From b63d7c2e29bf9cc94989806f2df0cfca4976b830 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 1 Jun 2010 19:37:33 +1000 Subject: [PATCH 17/24] md/bitmap: clean up plugging calls. 1/ use md_unplug in bitmap.c as we will soon be using bitmaps under arrays with no queue attached. 2/ Don't bother plugging the queue when we set a bit in the bitmap. The reason for this was to encourage as many bits as possible to get set before we unplug and write stuff out. However every personality already plugs the queue after bitmap_startwrite either directly (raid1/raid10) or by setting STRIPE_BIT_DELAY which causes the queue to be plugged later (raid5).
Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 3 +-- drivers/md/md.c | 2 +- drivers/md/md.h | 1 + 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 8af4d655b2d3..3f04699725db 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1296,7 +1296,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect prepare_to_wait(&bitmap->overflow_wait, &__wait, TASK_UNINTERRUPTIBLE); spin_unlock_irq(&bitmap->lock); - blk_unplug(bitmap->mddev->queue); + md_unplug(bitmap->mddev); schedule(); finish_wait(&bitmap->overflow_wait, &__wait); continue; @@ -1306,7 +1306,6 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect case 0: bitmap_file_set_bit(bitmap, offset); bitmap_count_page(bitmap, offset, 1); - blk_plug_device_unlocked(bitmap->mddev->queue); /* fall through */ case 1: *bmc = 2; diff --git a/drivers/md/md.c b/drivers/md/md.c index 03c64e9735fb..9d4e44e460e9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6627,7 +6627,7 @@ int md_allow_write(mddev_t *mddev) } EXPORT_SYMBOL_GPL(md_allow_write); -static void md_unplug(mddev_t *mddev) +void md_unplug(mddev_t *mddev) { if (mddev->queue) blk_unplug(mddev->queue); diff --git a/drivers/md/md.h b/drivers/md/md.h index 209993207a55..0a850780b5d1 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -509,6 +509,7 @@ extern int md_integrity_register(mddev_t *mddev); extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern void restore_bitmap_write_access(struct file *file); +extern void md_unplug(mddev_t *mddev); extern void mddev_init(mddev_t *mddev); extern int md_run(mddev_t *mddev); From ef4256733506f2459a0c436b62267d22a3f0cec6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 1 Jun 2010 19:37:33 +1000 Subject: [PATCH 18/24] md/bitmap: optimise scanning of empty bitmaps. A bitmap is stored as one page per 2048 bits. If none of the bits are set, the page is not allocated. When bitmap_get_counter finds that a page isn't allocated, it just reports that one bit worth of space isn't flagged, rather than reporting that 2048 bits worth of space are unflagged. This can cause searches for flagged bits (e.g. bitmap_close_sync) to do more work than is really necessary. So change bitmap_get_counter (when creating) to report a number of blocks that more accurately reports the range of the device for which no counter currently exists. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 3f04699725db..29a3c864e6b0 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1235,29 +1235,32 @@ __acquires(bitmap->lock) unsigned long page = chunk >> PAGE_COUNTER_SHIFT; unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; sector_t csize; + int err; - if (bitmap_checkpage(bitmap, page, create) < 0) { + err = bitmap_checkpage(bitmap, page, create); + + if (bitmap->bp[page].hijacked || + bitmap->bp[page].map == NULL) + csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) + + PAGE_COUNTER_SHIFT - 1); + else csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); - *blocks = csize - (offset & (csize - 1)); + *blocks = csize - (offset & (csize - 1)); + + if (err < 0) return NULL; - } + /* now locked ...
*/ if (bitmap->bp[page].hijacked) { /* hijacked pointer */ /* should we use the first or second counter field * of the hijacked pointer? */ int hi = (pageoff > PAGE_COUNTER_MASK); - csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) + - PAGE_COUNTER_SHIFT - 1); - *blocks = csize - (offset & (csize - 1)); return &((bitmap_counter_t *) &bitmap->bp[page].map)[hi]; - } else { /* page is allocated */ - csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); - *blocks = csize - (offset & (csize - 1)); + } else /* page is allocated */ return (bitmap_counter_t *) &(bitmap->bp[page].map[pageoff]); - } } int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) From e384e58549a2e9a83071ad80280c1a9053cfd84c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 1 Jun 2010 19:37:34 +1000 Subject: [PATCH 19/24] md/bitmap: prepare for storing write-intent-bitmap via dm-dirty-log. This allows md/raid5 to fully work as a dm target. Normally md uses a 'filemap' which contains a list of pages of bits each of which may be written separately. dm-log uses an all-or-nothing approach to writing the log, so when using a dm-log, ->filemap is NULL and the flags normally stored in filemap_attr are stored in ->logattrs instead. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 128 ++++++++++++++++++++++++++++++-------------- drivers/md/bitmap.h | 5 ++ drivers/md/md.h | 5 ++ 3 files changed, 99 insertions(+), 39 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 29a3c864e6b0..93765261c363 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -29,6 +29,7 @@ #include "md.h" #include "bitmap.h" +#include <linux/dm-dirty-log.h> /* debug macros */ #define DEBUG 0 @@ -694,6 +695,8 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon static inline struct page *filemap_get_page(struct bitmap *bitmap, unsigned long chunk) { + if (bitmap->filemap == NULL) + return NULL; if (file_page_index(bitmap, chunk) >= bitmap->file_pages) return NULL; return bitmap->filemap[file_page_index(bitmap, chunk) @@ -793,19 +796,28 @@ enum bitmap_page_attr { static inline void set_page_attr(struct bitmap *bitmap, struct page *page, enum bitmap_page_attr attr) { - __set_bit((page->index<<2) + attr, bitmap->filemap_attr); + if (page) + __set_bit((page->index<<2) + attr, bitmap->filemap_attr); + else + __set_bit(attr, &bitmap->logattrs); } static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, enum bitmap_page_attr attr) { - __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); + if (page) + __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); + else + __clear_bit(attr, &bitmap->logattrs); } static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, enum bitmap_page_attr attr) { - return test_bit((page->index<<2) + attr, bitmap->filemap_attr); + if (page) + return test_bit((page->index<<2) + attr, bitmap->filemap_attr); + else + return test_bit(attr, &bitmap->logattrs); } /* @@ -818,27 +830,30 @@ static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *p static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) { unsigned long bit; - struct page *page; + struct page *page = NULL; void *kaddr; unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); - if (!bitmap->filemap) - return; + if (!bitmap->filemap) { + struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; + if (log) + log->type->mark_region(log, chunk); + } else { - page = filemap_get_page(bitmap, chunk); - if (!page) -
return; - bit = file_page_offset(bitmap, chunk); - - /* set the bit */ - kaddr = kmap_atomic(page, KM_USER0); - if (bitmap->flags & BITMAP_HOSTENDIAN) - set_bit(bit, kaddr); - else - ext2_set_bit(bit, kaddr); - kunmap_atomic(kaddr, KM_USER0); - PRINTK("set file bit %lu page %lu\n", bit, page->index); + page = filemap_get_page(bitmap, chunk); + if (!page) + return; + bit = file_page_offset(bitmap, chunk); + /* set the bit */ + kaddr = kmap_atomic(page, KM_USER0); + if (bitmap->flags & BITMAP_HOSTENDIAN) + set_bit(bit, kaddr); + else + ext2_set_bit(bit, kaddr); + kunmap_atomic(kaddr, KM_USER0); + PRINTK("set file bit %lu page %lu\n", bit, page->index); + } /* record page number so it gets flushed to disk when unplug occurs */ set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); } @@ -855,6 +870,16 @@ void bitmap_unplug(struct bitmap *bitmap) if (!bitmap) return; + if (!bitmap->filemap) { + /* Must be using a dirty_log */ + struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; + dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs); + need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs); + if (dirty || need_write) + if (log->type->flush(log)) + bitmap->flags |= BITMAP_WRITE_ERROR; + goto out; + } /* look at each page to see if there are any set bits that need to be * flushed out to disk */ @@ -883,6 +908,7 @@ void bitmap_unplug(struct bitmap *bitmap) else md_super_wait(bitmap->mddev); } +out: if (bitmap->flags & BITMAP_WRITE_ERROR) bitmap_file_kick(bitmap); } @@ -925,11 +951,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) printk(KERN_INFO "%s: bitmap file is out of date, doing full " "recovery\n", bmname(bitmap)); - bytes = (chunks + 7) / 8; + bytes = DIV_ROUND_UP(bitmap->chunks, 8); if (!bitmap->mddev->bitmap_info.external) bytes += sizeof(bitmap_super_t); - num_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); if (file && i_size_read(file->f_mapping->host) < bytes) { printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", @@ -1090,6 +1116,7 @@ void bitmap_daemon_work(mddev_t *mddev) struct page *page = NULL, *lastpage = NULL; int blocks; void *paddr; + struct dm_dirty_log *log = mddev->bitmap_info.log; /* Use a mutex to guard daemon_work against * bitmap_destroy. 
@@ -1114,11 +1141,12 @@ void bitmap_daemon_work(mddev_t *mddev) spin_lock_irqsave(&bitmap->lock, flags); for (j = 0; j < bitmap->chunks; j++) { bitmap_counter_t *bmc; - if (!bitmap->filemap) - /* error or shutdown */ - break; - - page = filemap_get_page(bitmap, j); + if (!bitmap->filemap) { + if (!log) + /* error or shutdown */ + break; + } else + page = filemap_get_page(bitmap, j); if (page != lastpage) { /* skip this page unless it's marked as needing cleaning */ @@ -1187,14 +1215,17 @@ void bitmap_daemon_work(mddev_t *mddev) -1); /* clear the bit */ - paddr = kmap_atomic(page, KM_USER0); - if (bitmap->flags & BITMAP_HOSTENDIAN) - clear_bit(file_page_offset(bitmap, j), - paddr); - else - ext2_clear_bit(file_page_offset(bitmap, j), - paddr); - kunmap_atomic(paddr, KM_USER0); + if (page) { + paddr = kmap_atomic(page, KM_USER0); + if (bitmap->flags & BITMAP_HOSTENDIAN) + clear_bit(file_page_offset(bitmap, j), + paddr); + else + ext2_clear_bit(file_page_offset(bitmap, j), + paddr); + kunmap_atomic(paddr, KM_USER0); + } else + log->type->clear_region(log, j); } } else j |= PAGE_COUNTER_MASK; @@ -1202,12 +1233,16 @@ void bitmap_daemon_work(mddev_t *mddev) spin_unlock_irqrestore(&bitmap->lock, flags); /* now sync the final page */ - if (lastpage != NULL) { + if (lastpage != NULL || log != NULL) { spin_lock_irqsave(&bitmap->lock, flags); if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); - write_page(bitmap, lastpage, 0); + if (lastpage) + write_page(bitmap, lastpage, 0); + else + if (log->type->flush(log)) + bitmap->flags |= BITMAP_WRITE_ERROR; } else { set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); @@ -1372,7 +1407,9 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto (*bmc)--; if (*bmc <= 2) set_page_attr(bitmap, - filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), + filemap_get_page( + bitmap, + offset >> CHUNK_BLOCK_SHIFT(bitmap)), BITMAP_PAGE_CLEAN); spin_unlock_irqrestore(&bitmap->lock, flags); @@ -1649,10 +1686,13 @@ int bitmap_create(mddev_t *mddev) BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); - if (!file && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ + if (!file + && !mddev->bitmap_info.offset + && !mddev->bitmap_info.log) /* bitmap disabled, nothing to do */ return 0; BUG_ON(file && mddev->bitmap_info.offset); + BUG_ON(mddev->bitmap_info.offset && mddev->bitmap_info.log); bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) @@ -1730,7 +1770,17 @@ int bitmap_create(mddev_t *mddev) || bitmap->events_cleared == mddev->events) /* no need to keep dirty bits to optimise a re-add of a missing device */ start = mddev->recovery_cp; - err = bitmap_init_from_disk(bitmap, start); + if (mddev->bitmap_info.log) { + unsigned long i; + struct dm_dirty_log *log = mddev->bitmap_info.log; + for (i = 0; i < bitmap->chunks; i++) + if (!log->type->in_sync(log, i, 1)) + bitmap_set_memory_bits(bitmap, + (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), + 1); + err = 0; + } else + err = bitmap_init_from_disk(bitmap, start); if (err) goto error; diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index 3797dea4723a..a7a11134268d 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -222,6 +222,10 @@ struct bitmap { unsigned long file_pages; /* number of pages in the file */ int last_page_size; /* bytes in the last page */ + unsigned long logattrs; /* used when 
filemap_attr doesn't exist + * because we are working with a dirty_log + */ + unsigned long flags; int allclean; @@ -243,6 +247,7 @@ struct bitmap { wait_queue_head_t behind_wait; struct sysfs_dirent *sysfs_can_clear; + }; /* the bitmap API */ diff --git a/drivers/md/md.h b/drivers/md/md.h index 0a850780b5d1..cccbadb31bae 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -317,6 +317,11 @@ struct mddev_s * hot-adding a bitmap. It should * eventually be settable by sysfs. */ + /* When md is serving under dm, it might use a + * dirty_log to store the bits. + */ + struct dm_dirty_log *log; + struct mutex mutex; unsigned long chunksize; unsigned long daemon_sleep; /* how many jiffies between updates? */ From 69e51b449d383e97b1b9f890f8378c96e9e17346 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 1 Jun 2010 19:37:35 +1000 Subject: [PATCH 20/24] md/bitmap: separate out loading a bitmap from initialising the structures. dm makes this distinction between ->ctr and ->resume, so we need to too. Also get the new bitmap_load to clear out the bitmap first, as this is most consistent with the dm suspend/resume approach Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 69 +++++++++++++++++++++++++++++++-------------- drivers/md/bitmap.h | 1 + drivers/md/md.c | 13 +++++++-- 3 files changed, 60 insertions(+), 23 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 93765261c363..1ba1e122e948 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1681,7 +1681,6 @@ int bitmap_create(mddev_t *mddev) unsigned long pages; struct file *file = mddev->bitmap_info.file; int err; - sector_t start; struct sysfs_dirent *bm = NULL; BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); @@ -1763,13 +1762,40 @@ int bitmap_create(mddev_t *mddev) if (!bitmap->bp) goto error; - /* now that we have some pages available, initialize the in-memory - * bitmap from the on-disk bitmap */ - start = 0; - if (mddev->degraded == 0 - || bitmap->events_cleared == mddev->events) - /* no need to keep dirty bits to optimise a re-add of a missing device */ - start = mddev->recovery_cp; + printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", + pages, bmname(bitmap)); + + mddev->bitmap = bitmap; + + + return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0; + + error: + bitmap_free(bitmap); + return err; +} + +int bitmap_load(mddev_t *mddev) +{ + int err = 0; + sector_t sector = 0; + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + goto out; + + /* Clear out old bitmap info first: Either there is none, or we + * are resuming after someone else has possibly changed things, + * so we should forget old cached info. + * All chunks should be clean, but some might need_sync. 
+ */ while (sector < mddev->resync_max_sectors) { int blocks; bitmap_start_sync(bitmap, sector, &blocks, 0); sector += blocks; } bitmap_close_sync(bitmap); + if (mddev->bitmap_info.log) { unsigned long i; struct dm_dirty_log *log = mddev->bitmap_info.log; for (i = 0; i < bitmap->chunks; i++) if (!log->type->in_sync(log, i, 1)) bitmap_set_memory_bits(bitmap, (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), 1); - err = 0; - } else + } else { + sector_t start = 0; + if (mddev->degraded == 0 + || bitmap->events_cleared == mddev->events) + /* no need to keep dirty bits to optimise a + * re-add of a missing device */ + start = mddev->recovery_cp; + err = bitmap_init_from_disk(bitmap, start); - + } if (err) - goto error; - - printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", - pages, bmname(bitmap)); - - mddev->bitmap = bitmap; + goto out; mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; md_wakeup_thread(mddev->thread); bitmap_update_sb(bitmap); - return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0; - - error: - bitmap_free(bitmap); + if (bitmap->flags & BITMAP_WRITE_ERROR) + err = -EIO; +out: return err; } +EXPORT_SYMBOL_GPL(bitmap_load); static ssize_t location_show(mddev_t *mddev, char *page) diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index a7a11134268d..e872a7bad6b8 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -254,6 +254,7 @@ struct bitmap { /* these are used only by md/bitmap */ int bitmap_create(mddev_t *mddev); +int bitmap_load(mddev_t *mddev); void bitmap_flush(mddev_t *mddev); void bitmap_destroy(mddev_t *mddev); diff --git a/drivers/md/md.c b/drivers/md/md.c index 9d4e44e460e9..40b7ca0294ac 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4594,7 +4594,11 @@ static int do_md_run(mddev_t *mddev) err = md_run(mddev); if (err) goto out; - + err = bitmap_load(mddev); + if (err) { + bitmap_destroy(mddev); + goto out; + } set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); @@ -5382,8 +5386,11 @@ static int set_bitmap_file(mddev_t *mddev, int fd) err = 0; if (mddev->pers) { mddev->pers->quiesce(mddev, 1); - if (fd >= 0) + if (fd >= 0) { err = bitmap_create(mddev); + if (!err) + err = bitmap_load(mddev); + } if (fd < 0 || err) { bitmap_destroy(mddev); fd = -1; /* make sure to put the file */ @@ -5632,6 +5639,8 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->pers->quiesce(mddev, 1); rv = bitmap_create(mddev); + if (!rv) + rv = bitmap_load(mddev); if (rv) bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); From 51e9ac77035a3dfcb6fc0a88a0d80b6f99b5edb1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 7 Aug 2010 21:17:00 +1000 Subject: [PATCH 21/24] md/raid10: fix deadlock with unaligned read during resync If the 'bio_split' path in raid10-read is used while resync/recovery is happening it is possible to deadlock. Fix this by elevating ->nr_waiting for the duration of both parts of the split request. This fixes a bug that has been present since 2.6.22 but has only started manifesting recently for unknown reasons. It is suitable for any -stable since then.
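To make the ordering easier to follow, here is a small self-contained userspace model of the idea (pthreads, invented names, heavily simplified: nr_pending handling and unplugging are omitted). raise_barrier() may not proceed while nr_waiting is non-zero, so holding an extra nr_waiting count across both halves of the split request guarantees the second wait_barrier() never meets a freshly raised barrier. Build with -pthread.

/* Simplified illustration only, not the raid10 code itself. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wait_barrier_cv = PTHREAD_COND_INITIALIZER;
static int barrier, nr_waiting, nr_pending;

static void wait_barrier(void)
{
	pthread_mutex_lock(&lock);
	while (barrier) {		/* would block the submitting task */
		nr_waiting++;
		pthread_cond_wait(&wait_barrier_cv, &lock);
		nr_waiting--;
	}
	nr_pending++;
	pthread_mutex_unlock(&lock);
}

static void *resync_thread(void *unused)
{
	(void)unused;
	pthread_mutex_lock(&lock);
	while (nr_waiting)		/* may not raise while anyone is waiting */
		pthread_cond_wait(&wait_barrier_cv, &lock);
	barrier++;			/* raise_barrier() succeeds only now */
	pthread_mutex_unlock(&lock);
	printf("resync: barrier raised after the split was submitted\n");
	return NULL;
}

int main(void)
{
	pthread_t resync;

	pthread_mutex_lock(&lock);	/* the fix: count as waiting ...   */
	nr_waiting++;
	pthread_mutex_unlock(&lock);

	pthread_create(&resync, NULL, resync_thread, NULL);

	wait_barrier();			/* ... submit first half of split  */
	wait_barrier();			/* ... second half cannot block    */

	pthread_mutex_lock(&lock);	/* ... then drop the extra count   */
	nr_waiting--;
	pthread_cond_broadcast(&wait_barrier_cv);
	pthread_mutex_unlock(&lock);

	pthread_join(resync, NULL);
	printf("submitted both halves, nr_pending=%d\n", nr_pending);
	return 0;
}

Without the extra nr_waiting count, the resync thread could raise the barrier between the two submissions; the second wait_barrier() would then sleep while the first half sat unsubmitted in generic_make_request, which is exactly the deadlock described above.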
Reported-by: Justin Bronder Tested-by: Justin Bronder Signed-off-by: NeilBrown Cc: stable@kernel.org --- drivers/md/raid10.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 42e64e4e5e25..d1d689126346 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -825,11 +825,29 @@ static int make_request(mddev_t *mddev, struct bio * bio) */ bp = bio_split(bio, chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); + + /* Each of these 'make_request' calls will call 'wait_barrier'. + * If the first succeeds but the second blocks due to the resync + * thread raising the barrier, we will deadlock because the + * IO to the underlying device will be queued in generic_make_request + * and will never complete, so will never reduce nr_pending. + * So increment nr_waiting here so no new raise_barriers will + * succeed, and so the second wait_barrier cannot block. + */ + spin_lock_irq(&conf->resync_lock); + conf->nr_waiting++; + spin_unlock_irq(&conf->resync_lock); + if (make_request(mddev, &bp->bio1)) generic_make_request(&bp->bio1); if (make_request(mddev, &bp->bio2)) generic_make_request(&bp->bio2); + spin_lock_irq(&conf->resync_lock); + conf->nr_waiting--; + wake_up(&conf->wait_barrier); + spin_unlock_irq(&conf->resync_lock); + bio_pair_release(bp); return 0; bad_map: From 147e0b6a639ac581ca3bf627bedc3f4a6d3eca66 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 6 Aug 2010 18:01:59 -0700 Subject: [PATCH 22/24] md: move revalidate_disk() back outside open_mutex Commit b821eaa5 "md: remove ->changed and related code" moved revalidate_disk() under open_mutex, and lockdep noticed. [ INFO: possible circular locking dependency detected ] 2.6.32-mdadm-locking #1 ------------------------------------------------------- mdadm/3640 is trying to acquire lock: (&bdev->bd_mutex){+.+.+.}, at: [] revalidate_disk+0x5b/0x90 but task is already holding lock: (&mddev->open_mutex){+.+...}, at: [] do_md_stop+0x4a/0x4d0 [md_mod] which lock already depends on the new lock. It is suitable for 2.6.35.x Cc: Reported-by: Przemyslaw Czarnowski Signed-off-by: Dan Williams Signed-off-by: NeilBrown --- drivers/md/md.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 40b7ca0294ac..00c3fde39a12 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4765,7 +4765,7 @@ out: */ static int do_md_stop(mddev_t * mddev, int mode, int is_open) { - int err = 0; + int err = 0, revalidate = 0; struct gendisk *disk = mddev->gendisk; mdk_rdev_t *rdev; @@ -4795,7 +4795,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) } set_capacity(disk, 0); - revalidate_disk(disk); + revalidate = 1; if (mddev->ro) mddev->ro = 0; @@ -4803,6 +4803,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) err = 0; } mutex_unlock(&mddev->open_mutex); + if (revalidate) + revalidate_disk(disk); if (err) return err; /* From bb4f1e9d0e2ef93de8e36ca0f5f26625fcd70b7d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sun, 8 Aug 2010 21:18:03 +1000 Subject: [PATCH 23/24] md: fix another deadlock with removing sysfs attributes. Moving the deletion of sysfs attributes from reconfig_mutex to open_mutex didn't really help as a process can try to take open_mutex while holding reconfig_mutex, so the same deadlock can happen, just requiring one more process to be involved in the chain. It looks like I cannot easily use locking to wait for the sysfs deletion to complete, so don't.
The only things that we cannot do while the deletions are still pending are other things which can change the sysfs namespace: run, takeover, stop. Each of these can fail with -EBUSY. So set a flag while doing a sysfs deletion, and fail run, takeover, stop if that flag is set. This is suitable for 2.6.35.x Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/md.c | 31 +++++++++++++++++-------------- drivers/md/md.h | 4 ++++ 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 00c3fde39a12..03dcbfbe250f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -580,13 +580,17 @@ static void mddev_unlock(mddev_t * mddev) * an access to the files will try to take reconfig_mutex * while holding the file unremovable, which leads to * a deadlock. - * So hold open_mutex instead - we are allowed to take - * it while holding reconfig_mutex, and md_run can - * use it to wait for the remove to complete. + * So instead set sysfs_active while the remove is happening, + * and anything else which might set ->to_remove or may + * otherwise change the sysfs namespace will fail with + * -EBUSY if sysfs_active is still set. + * We set sysfs_active under reconfig_mutex and elsewhere + * test it under the same mutex to ensure its correct value + * is seen. */ struct attribute_group *to_remove = mddev->to_remove; mddev->to_remove = NULL; - mutex_lock(&mddev->open_mutex); + mddev->sysfs_active = 1; mutex_unlock(&mddev->reconfig_mutex); if (mddev->kobj.sd) { @@ -600,7 +604,7 @@ static void mddev_unlock(mddev_t * mddev) mddev->sysfs_action = NULL; } } - mutex_unlock(&mddev->open_mutex); + mddev->sysfs_active = 0; } else mutex_unlock(&mddev->reconfig_mutex); @@ -3008,7 +3012,9 @@ level_store(mddev_t *mddev, const char *buf, size_t len) * - new personality will access other array. */ - if (mddev->sync_thread || mddev->reshape_position != MaxSector) + if (mddev->sync_thread || + mddev->reshape_position != MaxSector || + mddev->sysfs_active) return -EBUSY; if (!mddev->pers->quiesce) { @@ -4393,13 +4399,9 @@ int md_run(mddev_t *mddev) if (mddev->pers) return -EBUSY; - - /* These two calls synchronise us with the - * sysfs_remove_group calls in mddev_unlock, - * so they must have completed. - */ - mutex_lock(&mddev->open_mutex); - mutex_unlock(&mddev->open_mutex); + /* Cannot run until previous stop completes properly */ + if (mddev->sysfs_active) + return -EBUSY; /* * Analyze all RAID superblock(s) @@ -4770,7 +4772,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) mdk_rdev_t *rdev; mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > is_open) { + if (atomic_read(&mddev->openers) > is_open || + mddev->sysfs_active) { printk("md: %s still in use.\n",mdname(mddev)); err = -EBUSY; } else if (mddev->pers) { diff --git a/drivers/md/md.h b/drivers/md/md.h index cccbadb31bae..6f797eceae31 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -145,6 +145,10 @@ struct mddev_s int suspended; atomic_t active_io; int ro; + int sysfs_active; /* set when sysfs deletes + * are happening, so run/ + * takeover/stop are not safe + */ struct gendisk *gendisk; From 6e17b0276452912cb13445e5ea552b599984675f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 7 Aug 2010 21:41:19 +1000 Subject: [PATCH 24/24] md: clean up do_md_stop There is only one error exit from do_md_stop, so make that more explicit and discard the 'err' variable. Also drop the 'revalidate' variable by moving the unlock calls around.
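The resulting control flow, sketched below as a self-contained userspace program with invented names (illustration only, not the md code itself): one explicit unlock-and-return error exit, and the heavyweight revalidate step run only after the open lock has been dropped.

/* Userspace sketch; dev_stop() stands in for the reshaped do_md_stop(). */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct dev {
	pthread_mutex_t open_lock;
	int openers;
	int running;
};

static void revalidate(struct dev *d)	/* stands in for revalidate_disk() */
{
	printf("revalidate with no lock held (running=%d)\n", d->running);
}

static int dev_stop(struct dev *d, int allowed_openers)
{
	pthread_mutex_lock(&d->open_lock);
	if (d->openers > allowed_openers) {
		pthread_mutex_unlock(&d->open_lock);
		return -EBUSY;			/* the single error exit */
	}
	d->running = 0;
	pthread_mutex_unlock(&d->open_lock);	/* drop the lock first ...  */

	revalidate(d);				/* ... then do the slow work */
	return 0;
}

int main(void)
{
	struct dev d = { PTHREAD_MUTEX_INITIALIZER, 1, 1 };

	printf("stop with an extra opener: %d\n", dev_stop(&d, 0)); /* -EBUSY */
	printf("stop normally:             %d\n", dev_stop(&d, 1)); /* 0 */
	return 0;
}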
Signed-off-by: NeilBrown --- drivers/md/md.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 03dcbfbe250f..d44efb267a69 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4767,7 +4767,6 @@ out: */ static int do_md_stop(mddev_t * mddev, int mode, int is_open) { - int err = 0, revalidate = 0; struct gendisk *disk = mddev->gendisk; mdk_rdev_t *rdev; @@ -4775,9 +4774,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) if (atomic_read(&mddev->openers) > is_open || mddev->sysfs_active) { printk("md: %s still in use.\n",mdname(mddev)); - err = -EBUSY; - } else if (mddev->pers) { + mutex_unlock(&mddev->open_mutex); + return -EBUSY; + } + if (mddev->pers) { if (mddev->ro) set_disk_ro(disk, 0); @@ -4798,23 +4799,17 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) } set_capacity(disk, 0); - revalidate = 1; + mutex_unlock(&mddev->open_mutex); + revalidate_disk(disk); if (mddev->ro) mddev->ro = 0; - - err = 0; - } - mutex_unlock(&mddev->open_mutex); - if (revalidate) - revalidate_disk(disk); - if (err) - return err; + } else + mutex_unlock(&mddev->open_mutex); /* * Free resources if final stop */ if (mode == 0) { - printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); bitmap_destroy(mddev); @@ -4831,13 +4826,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); if (mddev->hold_active == UNTIL_STOP) mddev->hold_active = 0; - } - err = 0; blk_integrity_unregister(disk); md_new_event(mddev); sysfs_notify_dirent_safe(mddev->sysfs_state); - return err; + return 0; } #ifndef MODULE