diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 14878a9ae09d..516baf49a1fa 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -503,6 +503,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) for (i = disks; i--; ) { int rw; + int replace_only = 0; struct bio *bi, *rbi; struct md_rdev *rdev, *rrdev = NULL; if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { @@ -512,7 +513,11 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rw = WRITE; } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = READ; - else + else if (test_and_clear_bit(R5_WantReplace, + &sh->dev[i].flags)) { + rw = WRITE; + replace_only = 1; + } else continue; bi = &sh->dev[i].req; @@ -528,10 +533,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rcu_read_lock(); rdev = rcu_dereference(conf->disks[i].rdev); - if (rw & WRITE) - rrdev = rcu_dereference(conf->disks[i].replacement); - else if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) - rdev = rcu_dereference(conf->disks[i].replacement); + rrdev = rcu_dereference(conf->disks[i].replacement); + if (rw & WRITE) { + if (replace_only) + rdev = NULL; + } else { + if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) + rdev = rrdev; + rrdev = NULL; + } if (rdev && test_bit(Faulty, &rdev->flags)) rdev = NULL; @@ -575,7 +585,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) } if (rdev) { - if (s->syncing || s->expanding || s->expanded) + if (s->syncing || s->expanding || s->expanded + || s->replacing) md_sync_acct(rdev->bdev, STRIPE_SECTORS); set_bit(STRIPE_IO_STARTED, &sh->state); @@ -597,7 +608,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) generic_make_request(bi); } if (rrdev) { - if (s->syncing || s->expanding || s->expanded) + if (s->syncing || s->expanding || s->expanded + || s->replacing) md_sync_acct(rrdev->bdev, STRIPE_SECTORS); set_bit(STRIPE_IO_STARTED, &sh->state); @@ -2440,8 +2452,9 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, md_done_sync(conf->mddev, STRIPE_SECTORS, 0); clear_bit(STRIPE_SYNCING, &sh->state); s->syncing = 0; + s->replacing = 0; /* There is nothing more to do for sync/check/repair. - * For recover we need to record a bad block on all + * For recover/replace we need to record a bad block on all * non-sync devices, or abort the recovery */ if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) @@ -2451,12 +2464,18 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, */ for (i = 0; i < conf->raid_disks; i++) { struct md_rdev *rdev = conf->disks[i].rdev; - if (!rdev - || test_bit(Faulty, &rdev->flags) - || test_bit(In_sync, &rdev->flags)) - continue; - if (!rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && !rdev_set_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0)) + abort = 1; + rdev = conf->disks[i].replacement; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && !rdev_set_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0)) abort = 1; } if (abort) { @@ -2465,6 +2484,22 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, } } +static int want_replace(struct stripe_head *sh, int disk_idx) +{ + struct md_rdev *rdev; + int rv = 0; + /* Doing recovery so rcu locking not required */ + rdev = sh->raid_conf->disks[disk_idx].replacement; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && (rdev->recovery_offset <= sh->sector + || rdev->mddev->recovery_cp <= sh->sector)) + rv = 1; + + return rv; +} + /* fetch_block - checks the given member device to see if its data needs * to be read or computed to satisfy a request. * @@ -2484,6 +2519,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, (dev->toread || (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || s->syncing || s->expanding || + (s->replacing && want_replace(sh, disk_idx)) || (s->failed >= 1 && fdev[0]->toread) || (s->failed >= 2 && fdev[1]->toread) || (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && @@ -3037,22 +3073,18 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) } } - /* * handle_stripe - do things to a stripe. * - * We lock the stripe and then examine the state of various bits - * to see what needs to be done. + * We lock the stripe by setting STRIPE_ACTIVE and then examine the + * state of various bits to see what needs to be done. * Possible results: - * return some read request which now have data - * return some write requests which are safely on disc + * return some read requests which now have data + * return some write requests which are safely on storage * schedule a read on some buffers * schedule a write of some buffers * return confirmation of parity correctness * - * buffers are taken off read_list or write_list, and bh_cache buffers - * get BH_Lock set before the stripe lock is released. - * */ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) @@ -3061,10 +3093,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) int disks = sh->disks; struct r5dev *dev; int i; + int do_recovery = 0; memset(s, 0, sizeof(*s)); - s->syncing = test_bit(STRIPE_SYNCING, &sh->state); s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); s->failed_num[0] = -1; @@ -3082,7 +3114,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) dev = &sh->dev[i]; pr_debug("check %d: state 0x%lx read %p write %p written %p\n", - i, dev->flags, dev->toread, dev->towrite, dev->written); + i, dev->flags, + dev->toread, dev->towrite, dev->written); /* maybe we can reply to a read * * new wantfill requests are only permitted while @@ -3123,6 +3156,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) &first_bad, &bad_sectors)) set_bit(R5_ReadRepl, &dev->flags); else { + if (rdev) + set_bit(R5_NeedReplace, &dev->flags); rdev = rcu_dereference(conf->disks[i].rdev); clear_bit(R5_ReadRepl, &dev->flags); } @@ -3210,9 +3245,25 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (s->failed < 2) s->failed_num[s->failed] = i; s->failed++; + if (rdev && !test_bit(Faulty, &rdev->flags)) + do_recovery = 1; } } spin_unlock_irq(&conf->device_lock); + if (test_bit(STRIPE_SYNCING, &sh->state)) { + /* If there is a failed device being replaced, + * we must be recovering. + * else if we are after recovery_cp, we must be syncing + * else we can only be replacing + * sync and recovery both need to read all devices, and so + * use the same flag. + */ + if (do_recovery || + sh->sector >= conf->mddev->recovery_cp) + s->syncing = 1; + else + s->replacing = 1; + } rcu_read_unlock(); } @@ -3254,7 +3305,7 @@ static void handle_stripe(struct stripe_head *sh) if (unlikely(s.blocked_rdev)) { if (s.syncing || s.expanding || s.expanded || - s.to_write || s.written) { + s.replacing || s.to_write || s.written) { set_bit(STRIPE_HANDLE, &sh->state); goto finish; } @@ -3280,7 +3331,7 @@ static void handle_stripe(struct stripe_head *sh) sh->reconstruct_state = 0; if (s.to_read+s.to_write+s.written) handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); - if (s.syncing) + if (s.syncing + s.replacing) handle_failed_sync(conf, sh, &s); } @@ -3311,7 +3362,9 @@ static void handle_stripe(struct stripe_head *sh) */ if (s.to_read || s.non_overwrite || (conf->level == 6 && s.to_write && s.failed) - || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) + || (s.syncing && (s.uptodate + s.compute < disks)) + || s.replacing + || s.expanding) handle_stripe_fill(sh, &s, disks); /* Now we check to see if any write operations have recently @@ -3373,7 +3426,20 @@ static void handle_stripe(struct stripe_head *sh) handle_parity_checks5(conf, sh, &s, disks); } - if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { + if (s.replacing && s.locked == 0 + && !test_bit(STRIPE_INSYNC, &sh->state)) { + /* Write out to replacement devices where possible */ + for (i = 0; i < conf->raid_disks; i++) + if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && + test_bit(R5_NeedReplace, &sh->dev[i].flags)) { + set_bit(R5_WantReplace, &sh->dev[i].flags); + set_bit(R5_LOCKED, &sh->dev[i].flags); + s.locked++; + } + set_bit(STRIPE_INSYNC, &sh->state); + } + if ((s.syncing || s.replacing) && s.locked == 0 && + test_bit(STRIPE_INSYNC, &sh->state)) { md_done_sync(conf->mddev, STRIPE_SECTORS, 1); clear_bit(STRIPE_SYNCING, &sh->state); } @@ -4262,7 +4328,6 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ } - bitmap_cond_end_sync(mddev->bitmap, sector_nr); sh = get_active_stripe(conf, sector_nr, 0, 1, 0); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index f6faaa16a565..8d8e13934a48 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -242,7 +242,13 @@ struct stripe_head { * for handle_stripe. */ struct stripe_head_state { - int syncing, expanding, expanded; + /* 'syncing' means that we need to read all devices, either + * to check/correct parity, or to reconstruct a missing device. + * 'replacing' means we are replacing one or more drives and + * the source is valid at this point so we don't need to + * read all devices, just the replacement targets. + */ + int syncing, expanding, expanded, replacing; int locked, uptodate, to_read, to_write, failed, written; int to_fill, compute, req_compute, non_overwrite; int failed_num[2]; @@ -284,6 +290,11 @@ enum r5dev_flags { R5_ReadRepl, /* Will/did read from replacement rather than orig */ R5_MadeGoodRepl,/* A bad block on the replacement device has been * fixed by writing to it */ + R5_NeedReplace, /* This device has a replacement which is not + * up-to-date at this stripe. */ + R5_WantReplace, /* We need to update the replacement, we have read + * data in, and now is a good time to write it out. + */ }; /*