diff --git a/drivers/md/md.c b/drivers/md/md.c
index 22203eba1e6e..29b0cd9ec951 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -132,6 +132,24 @@ static inline int speed_max(struct mddev *mddev)
 		mddev->sync_speed_max : sysctl_speed_limit_max;
 }
 
+static void * flush_info_alloc(gfp_t gfp_flags, void *data)
+{
+	return kzalloc(sizeof(struct flush_info), gfp_flags);
+}
+static void flush_info_free(void *flush_info, void *data)
+{
+	kfree(flush_info);
+}
+
+static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
+{
+	return kzalloc(sizeof(struct flush_bio), gfp_flags);
+}
+static void flush_bio_free(void *flush_bio, void *data)
+{
+	kfree(flush_bio);
+}
+
 static struct ctl_table_header *raid_table_header;
 
 static struct ctl_table raid_table[] = {
@@ -414,30 +432,53 @@ static int md_congested(void *data, int bits)
 /*
  * Generic flush handling for md
  */
-
-static void md_end_flush(struct bio *bio)
+static void submit_flushes(struct work_struct *ws)
 {
-	struct md_rdev *rdev = bio->bi_private;
-	struct mddev *mddev = rdev->mddev;
+	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
+	struct mddev *mddev = fi->mddev;
+	struct bio *bio = fi->bio;
+
+	bio->bi_opf &= ~REQ_PREFLUSH;
+	md_handle_request(mddev, bio);
+
+	mempool_free(fi, mddev->flush_pool);
+}
+
+static void md_end_flush(struct bio *fbio)
+{
+	struct flush_bio *fb = fbio->bi_private;
+	struct md_rdev *rdev = fb->rdev;
+	struct flush_info *fi = fb->fi;
+	struct bio *bio = fi->bio;
+	struct mddev *mddev = fi->mddev;
 
 	rdev_dec_pending(rdev, mddev);
 
-	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		/* The pre-request flush has finished */
-		queue_work(md_wq, &mddev->flush_work);
+	if (atomic_dec_and_test(&fi->flush_pending)) {
+		if (bio->bi_iter.bi_size == 0)
+			/* an empty barrier - all done */
+			bio_endio(bio);
+		else {
+			INIT_WORK(&fi->flush_work, submit_flushes);
+			queue_work(md_wq, &fi->flush_work);
+		}
 	}
-	bio_put(bio);
+
+	mempool_free(fb, mddev->flush_bio_pool);
+	bio_put(fbio);
 }
 
-static void md_submit_flush_data(struct work_struct *ws);
-
-static void submit_flushes(struct work_struct *ws)
+void md_flush_request(struct mddev *mddev, struct bio *bio)
 {
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 	struct md_rdev *rdev;
+	struct flush_info *fi;
+
+	fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
+
+	fi->bio = bio;
+	fi->mddev = mddev;
+	atomic_set(&fi->flush_pending, 1);
 
-	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-	atomic_set(&mddev->flush_pending, 1);
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev)
 		if (rdev->raid_disk >= 0 &&
@@ -447,59 +488,39 @@ static void submit_flushes(struct work_struct *ws)
 			 * we reclaim rcu_read_lock
 			 */
 			struct bio *bi;
+			struct flush_bio *fb;
 			atomic_inc(&rdev->nr_pending);
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
+
+			fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
+			fb->fi = fi;
+			fb->rdev = rdev;
+
 			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
-			bi->bi_end_io = md_end_flush;
-			bi->bi_private = rdev;
 			bio_set_dev(bi, rdev->bdev);
+			bi->bi_end_io = md_end_flush;
+			bi->bi_private = fb;
 			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-			atomic_inc(&mddev->flush_pending);
+
+			atomic_inc(&fi->flush_pending);
 			submit_bio(bi);
+
 			rcu_read_lock();
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
 
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws)
-{
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
-	struct bio *bio = mddev->flush_bio;
-
-	/*
-	 * must reset flush_bio before calling into md_handle_request to avoid a
-	 * deadlock, because other bios passed md_handle_request suspend check
-	 * could wait for this and below md_handle_request could wait for those
-	 * bios because of suspend check
-	 */
-	mddev->flush_bio = NULL;
-	wake_up(&mddev->sb_wait);
-
-	if (bio->bi_iter.bi_size == 0)
-		/* an empty barrier - all done */
-		bio_endio(bio);
-	else {
-		bio->bi_opf &= ~REQ_PREFLUSH;
-		md_handle_request(mddev, bio);
+	if (atomic_dec_and_test(&fi->flush_pending)) {
+		if (bio->bi_iter.bi_size == 0)
+			/* an empty barrier - all done */
+			bio_endio(bio);
+		else {
+			INIT_WORK(&fi->flush_work, submit_flushes);
+			queue_work(md_wq, &fi->flush_work);
+		}
 	}
 }
-
-void md_flush_request(struct mddev *mddev, struct bio *bio)
-{
-	spin_lock_irq(&mddev->lock);
-	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->flush_bio,
-			    mddev->lock);
-	mddev->flush_bio = bio;
-	spin_unlock_irq(&mddev->lock);
-
-	INIT_WORK(&mddev->flush_work, submit_flushes);
-	queue_work(md_wq, &mddev->flush_work);
-}
 EXPORT_SYMBOL(md_flush_request);
 
 static inline struct mddev *mddev_get(struct mddev *mddev)
@@ -546,7 +567,6 @@ void mddev_init(struct mddev *mddev)
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->active_io, 0);
 	spin_lock_init(&mddev->lock);
-	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
 	init_waitqueue_head(&mddev->recovery_wait);
 	mddev->reshape_position = MaxSector;
@@ -2844,7 +2864,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 			err = 0;
 		}
 	} else if (cmd_match(buf, "re-add")) {
-		if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) {
+		if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
+			rdev->saved_raid_disk >= 0) {
 			/* clear_bit is performed _after_ all the devices
 			 * have their local Faulty bit cleared. If any writes
 			 * happen in the meantime in the local node, they
@@ -5499,6 +5520,22 @@ int md_run(struct mddev *mddev)
 		if (err)
 			return err;
 	}
+	if (mddev->flush_pool == NULL) {
+		mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
+						flush_info_free, mddev);
+		if (!mddev->flush_pool) {
+			err = -ENOMEM;
+			goto abort;
+		}
+	}
+	if (mddev->flush_bio_pool == NULL) {
+		mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
+						flush_bio_free, mddev);
+		if (!mddev->flush_bio_pool) {
+			err = -ENOMEM;
+			goto abort;
+		}
+	}
 
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
@@ -5654,6 +5691,18 @@ int md_run(struct mddev *mddev)
 	sysfs_notify_dirent_safe(mddev->sysfs_action);
 	sysfs_notify(&mddev->kobj, NULL, "degraded");
 	return 0;
+
+abort:
+	if (mddev->flush_bio_pool) {
+		mempool_destroy(mddev->flush_bio_pool);
+		mddev->flush_bio_pool = NULL;
+	}
+	if (mddev->flush_pool) {
+		mempool_destroy(mddev->flush_pool);
+		mddev->flush_pool = NULL;
+	}
+
+	return err;
 }
 EXPORT_SYMBOL_GPL(md_run);
 
@@ -5864,6 +5913,14 @@ void md_stop(struct mddev *mddev)
 	 * This is called from dm-raid
 	 */
 	__md_stop(mddev);
+	if (mddev->flush_bio_pool) {
+		mempool_destroy(mddev->flush_bio_pool);
+		mddev->flush_bio_pool = NULL;
+	}
+	if (mddev->flush_pool) {
+		mempool_destroy(mddev->flush_pool);
+		mddev->flush_pool = NULL;
+	}
 	bioset_exit(&mddev->bio_set);
 	bioset_exit(&mddev->sync_set);
 }
@@ -6494,6 +6551,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 	char b[BDEVNAME_SIZE];
 	struct md_rdev *rdev;
 
+	if (!mddev->pers)
+		return -ENODEV;
+
 	rdev = find_rdev(mddev, dev);
 	if (!rdev)
 		return -ENXIO;
@@ -8611,6 +8671,7 @@ static int remove_and_add_spares(struct mddev *mddev,
 			if (mddev->pers->hot_remove_disk(
 				    mddev, rdev) == 0) {
 				sysfs_unlink_rdev(mddev, rdev);
+				rdev->saved_raid_disk = rdev->raid_disk;
 				rdev->raid_disk = -1;
 				removed++;
 			}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 3507cab22cb6..2d148bdaba74 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -252,6 +252,19 @@ enum mddev_sb_flags {
 	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
 };
 
+#define NR_FLUSH_INFOS 8
+#define NR_FLUSH_BIOS 64
+struct flush_info {
+	struct bio		*bio;
+	struct mddev		*mddev;
+	struct work_struct	flush_work;
+	atomic_t		flush_pending;
+};
+struct flush_bio {
+	struct flush_info *fi;
+	struct md_rdev *rdev;
+};
+
 struct mddev {
 	void				*private;
 	struct md_personality		*pers;
@@ -457,13 +470,8 @@ struct mddev {
 					 * metadata and bitmap writes
 					 */
 
-	/* Generic flush handling.
-	 * The last to finish preflush schedules a worker to submit
-	 * the rest of the request (without the REQ_PREFLUSH flag).
-	 */
-	struct bio *flush_bio;
-	atomic_t flush_pending;
-	struct work_struct flush_work;
+	mempool_t			*flush_pool;
+	mempool_t			*flush_bio_pool;
 	struct work_struct event_work;	/* used by dm to report failure event */
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
 	struct md_cluster_info		*cluster_info;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index bad28520719b..0b344d087581 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2449,7 +2449,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 	struct mddev *mddev = conf->mddev;
 	struct bio *bio;
 	struct md_rdev *rdev;
-	sector_t bio_sector;
 
 	clear_bit(R1BIO_ReadError, &r1_bio->state);
 	/* we got a read error. Maybe the drive is bad.  Maybe just
@@ -2462,7 +2461,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 	 */
 
 	bio = r1_bio->bios[r1_bio->read_disk];
-	bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector;
 	bio_put(bio);
 	r1_bio->bios[r1_bio->read_disk] = NULL;
 
@@ -2473,6 +2471,8 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 		fix_read_error(conf, r1_bio->read_disk,
 			       r1_bio->sector, r1_bio->sectors);
 		unfreeze_array(conf);
+	} else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) {
+		md_error(mddev, rdev);
 	} else {
 		r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
 	}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 37d4b236b81b..1147ae59e3b6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -255,9 +255,11 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
 	for (j = conf->copies; j--; ) {
 		struct bio *bio = r10bio->devs[j].bio;
 
-		rp = get_resync_pages(bio);
-		resync_free_pages(rp);
-		bio_put(bio);
+		if (bio) {
+			rp = get_resync_pages(bio);
+			resync_free_pages(rp);
+			bio_put(bio);
+		}
 
 		bio = r10bio->devs[j].repl_bio;
 		if (bio)
@@ -2362,7 +2364,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 {
 	int sect = 0; /* Offset from r10_bio->sector */
 	int sectors = r10_bio->sectors;
-	struct md_rdev*rdev;
+	struct md_rdev *rdev;
 	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
 	int d = r10_bio->devs[r10_bio->read_slot].devnum;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a2e64989b01f..73489446bbcb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1139,6 +1139,9 @@ again:
 			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			bi->bi_io_vec[0].bv_offset = 0;
 			bi->bi_iter.bi_size = STRIPE_SIZE;
+			bi->bi_write_hint = sh->dev[i].write_hint;
+			if (!rrdev)
+				sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
 			/*
 			 * If this is discard request, set bi_vcnt 0. We don't
 			 * want to confuse SCSI because SCSI will replace payload
@@ -1190,6 +1193,8 @@ again:
 			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			rbi->bi_io_vec[0].bv_offset = 0;
 			rbi->bi_iter.bi_size = STRIPE_SIZE;
+			rbi->bi_write_hint = sh->dev[i].write_hint;
+			sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
 			/*
 			 * If this is discard request, set bi_vcnt 0. We don't
 			 * want to confuse SCSI because SCSI will replace payload
@@ -3204,6 +3209,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 		(unsigned long long)sh->sector);
 
 	spin_lock_irq(&sh->stripe_lock);
+	sh->dev[dd_idx].write_hint = bi->bi_write_hint;
 	/* Don't allow new IO added to stripes in batch list */
 	if (sh->batch_head)
 		goto overlap;
@@ -4614,15 +4620,15 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
 
 		sh->check_state = head_sh->check_state;
 		sh->reconstruct_state = head_sh->reconstruct_state;
+		spin_lock_irq(&sh->stripe_lock);
+		sh->batch_head = NULL;
+		spin_unlock_irq(&sh->stripe_lock);
 		for (i = 0; i < sh->disks; i++) {
 			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
 				do_wakeup = 1;
 			sh->dev[i].flags = head_sh->dev[i].flags &
 				(~((1 << R5_WriteError) | (1 << R5_Overlap)));
 		}
-		spin_lock_irq(&sh->stripe_lock);
-		sh->batch_head = NULL;
-		spin_unlock_irq(&sh->stripe_lock);
 		if (handle_flags == 0 ||
 		    sh->state & handle_flags)
 			set_bit(STRIPE_HANDLE, &sh->state);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 72e75ba6abf0..8474c224127b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -257,6 +257,7 @@ struct stripe_head {
 		sector_t	sector;		/* sector of this page */
 		unsigned long	flags;
 		u32		log_checksum;
+		unsigned short	write_hint;
 	} dev[1]; /* allocated with extra space depending of RAID geometry */
 };
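
The heart of the md.c rework above is a completion-counting idiom: each flush request now carries its own `flush_info`, whose `flush_pending` count is biased to 1 before any per-device flush bio is submitted, and whoever drops the count to zero — the submitter at the end of `md_flush_request()` or the last `md_end_flush()` callback — finishes the request. The sketch below reproduces just that idiom in standalone userspace C with C11 atomics so it can be compiled and run; the names (`flush_info`, `child_done`, `complete_request`, `nr_disks`) loosely mirror the patch but are illustrative, not the kernel API.

```c
/* Userspace sketch of the completion-counting pattern used by the
 * reworked md flush path (illustrative only, not kernel code). */
#include <stdatomic.h>
#include <stdio.h>

struct flush_info {
	atomic_int flush_pending;	/* biased to 1 by the submitter */
	const char *name;
};

static void complete_request(struct flush_info *fi)
{
	printf("%s: all device flushes done, handling data part\n", fi->name);
}

/* Called once per finished child flush; the final decrement completes
 * the parent.  In the patch this role is played by md_end_flush(). */
static void child_done(struct flush_info *fi)
{
	if (atomic_fetch_sub(&fi->flush_pending, 1) == 1)
		complete_request(fi);
}

int main(void)
{
	struct flush_info fi = { .name = "flush#1" };
	int nr_disks = 4, i;

	/* Bias the counter so the parent cannot complete while children
	 * are still being issued (md_flush_request() does the same). */
	atomic_init(&fi.flush_pending, 1);

	for (i = 0; i < nr_disks; i++) {
		atomic_fetch_add(&fi.flush_pending, 1);
		/* In the kernel each child completes asynchronously via
		 * bi_end_io; here we "complete" it immediately. */
		child_done(&fi);
	}

	/* Drop the submitter's bias; if every child already finished,
	 * this is the final decrement and completes the request. */
	child_done(&fi);
	return 0;
}
```

The initial bias is what makes the submission loop safe: without it, an early-completing child could drop the count to zero while the submitter is still issuing siblings and complete the parent prematurely. Because each request owns its counter, multiple flushes can be in flight at once — the serialization through the single `mddev->flush_bio` slot that the old `md_flush_request()` enforced is gone.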