diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 4ad8447fa868..05773ee1b5ba 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1516,6 +1516,256 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
         raid10_write_request(mddev, bio, r10_bio);
 }
 
+static struct bio *raid10_split_bio(struct r10conf *conf,
+                struct bio *bio, sector_t sectors, bool want_first)
+{
+        struct bio *split;
+
+        split = bio_split(bio, sectors, GFP_NOIO, &conf->bio_split);
+        bio_chain(split, bio);
+        allow_barrier(conf);
+        if (want_first) {
+                submit_bio_noacct(bio);
+                bio = split;
+        } else
+                submit_bio_noacct(split);
+        wait_barrier(conf);
+
+        return bio;
+}
+
+static void raid10_end_discard_request(struct bio *bio)
+{
+        struct r10bio *r10_bio = bio->bi_private;
+        struct r10conf *conf = r10_bio->mddev->private;
+        struct md_rdev *rdev = NULL;
+        int dev;
+        int slot, repl;
+
+        /*
+         * We don't care about the return value of the discard bio
+         */
+        if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+                set_bit(R10BIO_Uptodate, &r10_bio->state);
+
+        dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+        if (repl)
+                rdev = conf->mirrors[dev].replacement;
+        if (!rdev) {
+                /* raid10_remove_disk uses smp_mb to make sure rdev is set to
+                 * replacement before setting replacement to NULL. It can read
+                 * rdev first without barrier protection even if replacement is NULL
+                 */
+                smp_rmb();
+                rdev = conf->mirrors[dev].rdev;
+        }
+
+        if (atomic_dec_and_test(&r10_bio->remaining)) {
+                md_write_end(r10_bio->mddev);
+                raid_end_bio_io(r10_bio);
+        }
+
+        rdev_dec_pending(rdev, conf->mddev);
+}
+
+/* There are some limitations when handling discard bios:
+ * 1st, the discard size must be bigger than stripe_size*2.
+ * 2nd, if the discard bio spans reshape progress, we use the old way to
+ * handle the discard bio
+ */
+static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
+{
+        struct r10conf *conf = mddev->private;
+        struct geom *geo = &conf->geo;
+        struct r10bio *r10_bio;
+
+        int disk;
+        sector_t chunk;
+        unsigned int stripe_size;
+        sector_t split_size;
+
+        sector_t bio_start, bio_end;
+        sector_t first_stripe_index, last_stripe_index;
+        sector_t start_disk_offset;
+        unsigned int start_disk_index;
+        sector_t end_disk_offset;
+        unsigned int end_disk_index;
+        unsigned int remainder;
+
+        if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+                return -EAGAIN;
+
+        wait_barrier(conf);
+
+        /* Check reshape again to avoid a reshape that happens after checking
+         * MD_RECOVERY_RESHAPE and before wait_barrier
+         */
+        if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+                goto out;
+
+        stripe_size = geo->raid_disks << geo->chunk_shift;
+        bio_start = bio->bi_iter.bi_sector;
+        bio_end = bio_end_sector(bio);
+
+        /* A discard bio may be smaller than a stripe, or it may cross a stripe
+         * boundary while its region is larger than one stripe. For the far
+         * offset layout, if the discard region is not aligned with the stripe
+         * size, there are holes when we submit the discard bios to the member
+         * disks. For simplicity, we only handle discard bios whose region is
+         * bigger than stripe_size*2
+         */
+        if (bio_sectors(bio) < stripe_size*2)
+                goto out;
+
+        /* For the far offset layout, if the bio is not aligned with the stripe
+         * size, split off the parts that are not stripe aligned.
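+         * The part that is split off is resubmitted through the regular
+         * request path, so in the far offset case the bio handled below
+         * always starts and ends on a stripe boundary.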
+         */
+        div_u64_rem(bio_start, stripe_size, &remainder);
+        if (geo->far_offset && remainder) {
+                split_size = stripe_size - remainder;
+                bio = raid10_split_bio(conf, bio, split_size, false);
+        }
+        div_u64_rem(bio_end, stripe_size, &remainder);
+        if (geo->far_offset && remainder) {
+                split_size = bio_sectors(bio) - remainder;
+                bio = raid10_split_bio(conf, bio, split_size, true);
+        }
+
+        r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
+        r10_bio->mddev = mddev;
+        r10_bio->state = 0;
+        r10_bio->sectors = 0;
+        memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
+
+        wait_blocked_dev(mddev, r10_bio);
+
+        r10_bio->master_bio = bio;
+
+        bio_start = bio->bi_iter.bi_sector;
+        bio_end = bio_end_sector(bio);
+
+        /* raid10 uses a chunk as the unit to store data, similar to raid0.
+         * One stripe contains the chunks from all member disks (one chunk
+         * from each disk, at the same device offset). For layout details,
+         * see 'man 4 md'.
+         */
+        chunk = bio_start >> geo->chunk_shift;
+        chunk *= geo->near_copies;
+        first_stripe_index = chunk;
+        start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
+        if (geo->far_offset)
+                first_stripe_index *= geo->far_copies;
+        start_disk_offset = (bio_start & geo->chunk_mask) +
+                                (first_stripe_index << geo->chunk_shift);
+
+        chunk = bio_end >> geo->chunk_shift;
+        chunk *= geo->near_copies;
+        last_stripe_index = chunk;
+        end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
+        if (geo->far_offset)
+                last_stripe_index *= geo->far_copies;
+        end_disk_offset = (bio_end & geo->chunk_mask) +
+                                (last_stripe_index << geo->chunk_shift);
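+        /*
+         * Worked example (hypothetical geometry): with raid_disks=4,
+         * near_copies=2, 64-sector chunks (chunk_shift=6) and bio_start=256:
+         * chunk = (256 >> 6) * 2 = 8, so start_disk_index = 8 % 4 = 0,
+         * first_stripe_index = 8 / 4 = 2 and
+         * start_disk_offset = (256 & 63) + (2 << 6) = 128.
+         */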
+        rcu_read_lock();
+        for (disk = 0; disk < geo->raid_disks; disk++) {
+                struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
+                struct md_rdev *rrdev = rcu_dereference(
+                        conf->mirrors[disk].replacement);
+
+                r10_bio->devs[disk].bio = NULL;
+                r10_bio->devs[disk].repl_bio = NULL;
+
+                if (rdev && (test_bit(Faulty, &rdev->flags)))
+                        rdev = NULL;
+                if (rrdev && (test_bit(Faulty, &rrdev->flags)))
+                        rrdev = NULL;
+                if (!rdev && !rrdev)
+                        continue;
+
+                if (rdev) {
+                        r10_bio->devs[disk].bio = bio;
+                        atomic_inc(&rdev->nr_pending);
+                }
+                if (rrdev) {
+                        r10_bio->devs[disk].repl_bio = bio;
+                        atomic_inc(&rrdev->nr_pending);
+                }
+        }
+        rcu_read_unlock();
+
+        atomic_set(&r10_bio->remaining, 1);
+        for (disk = 0; disk < geo->raid_disks; disk++) {
+                sector_t dev_start, dev_end;
+                struct bio *mbio, *rbio = NULL;
+                struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
+                struct md_rdev *rrdev = rcu_dereference(
+                        conf->mirrors[disk].replacement);
+
+                /*
+                 * Now calculate the start and end address for each disk.
+                 * The space between dev_start and dev_end is the discard region.
+                 *
+                 * For dev_start, three conditions need to be considered:
+                 * 1st, the disk is before start_disk_index; you can imagine the
+                 * disk in the next stripe, so dev_start is the start address of
+                 * the next stripe.
+                 * 2nd, the disk is after start_disk_index, which means the disk
+                 * is at the same stripe as the first disk.
+                 * 3rd, the disk is start_disk_index itself, so start_disk_offset
+                 * can be used directly.
+                 */
+                if (disk < start_disk_index)
+                        dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
+                else if (disk > start_disk_index)
+                        dev_start = first_stripe_index * mddev->chunk_sectors;
+                else
+                        dev_start = start_disk_offset;
+
+                if (disk < end_disk_index)
+                        dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
+                else if (disk > end_disk_index)
+                        dev_end = last_stripe_index * mddev->chunk_sectors;
+                else
+                        dev_end = end_disk_offset;
+
+                /* We only handle discard bios whose size is >= stripe size, so
+                 * dev_end > dev_start all the time
+                 */
+                if (r10_bio->devs[disk].bio) {
+                        mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+                        mbio->bi_end_io = raid10_end_discard_request;
+                        mbio->bi_private = r10_bio;
+                        r10_bio->devs[disk].bio = mbio;
+                        r10_bio->devs[disk].devnum = disk;
+                        atomic_inc(&r10_bio->remaining);
+                        md_submit_discard_bio(mddev, rdev, mbio,
+                                        dev_start + choose_data_offset(r10_bio, rdev),
+                                        dev_end - dev_start);
+                        bio_endio(mbio);
+                }
+                if (r10_bio->devs[disk].repl_bio) {
+                        rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+                        rbio->bi_end_io = raid10_end_discard_request;
+                        rbio->bi_private = r10_bio;
+                        r10_bio->devs[disk].repl_bio = rbio;
+                        r10_bio->devs[disk].devnum = disk;
+                        atomic_inc(&r10_bio->remaining);
+                        md_submit_discard_bio(mddev, rrdev, rbio,
+                                        dev_start + choose_data_offset(r10_bio, rrdev),
+                                        dev_end - dev_start);
+                        bio_endio(rbio);
+                }
+        }
+
+        if (atomic_dec_and_test(&r10_bio->remaining)) {
+                md_write_end(r10_bio->mddev);
+                raid_end_bio_io(r10_bio);
+        }
+
+        return 0;
+out:
+        allow_barrier(conf);
+        return -EAGAIN;
+}
+
 static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
 {
         struct r10conf *conf = mddev->private;
@@ -1530,6 +1780,10 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
         if (!md_write_start(mddev, bio))
                 return false;
 
+        if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
+                if (!raid10_handle_discard(mddev, bio))
+                        return true;
+
         /*
          * If this request crosses a chunk boundary, we need to split
          * it.
@@ -3769,7 +4023,7 @@ static int raid10_run(struct mddev *mddev)
 
         if (mddev->queue) {
                 blk_queue_max_discard_sectors(mddev->queue,
-                                                mddev->chunk_sectors);
+                                                UINT_MAX);
                 blk_queue_max_write_same_sectors(mddev->queue, 0);
                 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
                 blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
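
For reference (not part of the patch): a minimal userspace sketch of the per-disk address math above, so the dev_start/dev_end values can be checked by hand. The geometry (4 raid disks, 2 near copies, 64-sector chunks, no far offset) and the discard range are hypothetical; with these numbers every member disk is asked to discard sectors [0, 512).

/*
 * Standalone sketch mirroring raid10_handle_discard()'s address math for a
 * near layout. All parameters below are hypothetical examples.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const uint64_t raid_disks = 4, near_copies = 2;
        const uint64_t chunk_sectors = 64;      /* 32 KiB chunks */
        const unsigned int chunk_shift = 6;     /* log2(chunk_sectors) */
        const uint64_t chunk_mask = chunk_sectors - 1;

        /* A discard range already aligned to stripe_size = raid_disks * chunk */
        uint64_t bio_start = 0, bio_end = 1024;

        /* Same derivation as the patch (near layout, far_offset not set) */
        uint64_t chunk = (bio_start >> chunk_shift) * near_copies;
        uint64_t first_stripe_index = chunk / raid_disks;
        uint64_t start_disk_index = chunk % raid_disks;
        uint64_t start_disk_offset = (bio_start & chunk_mask) +
                                     (first_stripe_index << chunk_shift);

        chunk = (bio_end >> chunk_shift) * near_copies;
        uint64_t last_stripe_index = chunk / raid_disks;
        uint64_t end_disk_index = chunk % raid_disks;
        uint64_t end_disk_offset = (bio_end & chunk_mask) +
                                   (last_stripe_index << chunk_shift);

        for (uint64_t disk = 0; disk < raid_disks; disk++) {
                uint64_t dev_start, dev_end;

                if (disk < start_disk_index)
                        dev_start = (first_stripe_index + 1) * chunk_sectors;
                else if (disk > start_disk_index)
                        dev_start = first_stripe_index * chunk_sectors;
                else
                        dev_start = start_disk_offset;

                if (disk < end_disk_index)
                        dev_end = (last_stripe_index + 1) * chunk_sectors;
                else if (disk > end_disk_index)
                        dev_end = last_stripe_index * chunk_sectors;
                else
                        dev_end = end_disk_offset;

                printf("disk %llu: discard sectors [%llu, %llu)\n",
                       (unsigned long long)disk,
                       (unsigned long long)dev_start,
                       (unsigned long long)dev_end);
        }
        return 0;
}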