diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d87a2de667ea..32fa1131cafc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -53,6 +53,7 @@
 #include <linux/cpu.h>
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
+#include <linux/nodemask.h>
 #include <trace/events/block.h>
 
 #include "md.h"
@@ -60,6 +61,10 @@
 #include "raid0.h"
 #include "bitmap.h"
 
+#define cpu_to_group(cpu) cpu_to_node(cpu)
+#define ANY_GROUP NUMA_NO_NODE
+
+static struct workqueue_struct *raid5_wq;
 /*
  * Stripe cache
  */
@@ -200,6 +205,34 @@ static int stripe_operations_active(struct stripe_head *sh)
 	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 }
 
+static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
+{
+	struct r5conf *conf = sh->raid_conf;
+	struct r5worker_group *group;
+	int i, cpu = sh->cpu;
+
+	if (!cpu_online(cpu)) {
+		cpu = cpumask_any(cpu_online_mask);
+		sh->cpu = cpu;
+	}
+
+	if (list_empty(&sh->lru)) {
+		struct r5worker_group *group;
+		group = conf->worker_groups + cpu_to_group(cpu);
+		list_add_tail(&sh->lru, &group->handle_list);
+	}
+
+	if (conf->worker_cnt_per_group == 0) {
+		md_wakeup_thread(conf->mddev->thread);
+		return;
+	}
+
+	group = conf->worker_groups + cpu_to_group(sh->cpu);
+
+	for (i = 0; i < conf->worker_cnt_per_group; i++)
+		queue_work_on(sh->cpu, raid5_wq, &group->workers[i].work);
+}
+
 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 {
 	BUG_ON(!list_empty(&sh->lru));
@@ -214,7 +247,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 		else {
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			clear_bit(STRIPE_BIT_DELAY, &sh->state);
-			list_add_tail(&sh->lru, &conf->handle_list);
+			if (conf->worker_cnt_per_group == 0) {
+				list_add_tail(&sh->lru, &conf->handle_list);
+			} else {
+				raid5_wakeup_stripe_thread(sh);
+				return;
+			}
 		}
 		md_wakeup_thread(conf->mddev->thread);
 	} else {
@@ -409,6 +447,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 		raid5_build_block(sh, i, previous);
 	}
 	insert_hash(conf, sh);
+	sh->cpu = smp_processor_id();
 }
 
 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
@@ -3830,6 +3869,7 @@ static void raid5_activate_delayed(struct r5conf *conf)
 			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
 			list_add_tail(&sh->lru, &conf->hold_list);
+			raid5_wakeup_stripe_thread(sh);
 		}
 	}
 }
@@ -4109,18 +4149,32 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
  * head of the hold_list has changed, i.e. the head was promoted to the
  * handle_list.
  */
-static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
+static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 {
-	struct stripe_head *sh;
+	struct stripe_head *sh = NULL, *tmp;
+	struct list_head *handle_list = NULL;
+
+	if (conf->worker_cnt_per_group == 0) {
+		handle_list = &conf->handle_list;
+	} else if (group != ANY_GROUP) {
+		handle_list = &conf->worker_groups[group].handle_list;
+	} else {
+		int i;
+		for (i = 0; i < conf->group_cnt; i++) {
+			handle_list = &conf->worker_groups[i].handle_list;
+			if (!list_empty(handle_list))
+				break;
+		}
+	}
 
 	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
 		  __func__,
-		  list_empty(&conf->handle_list) ? "empty" : "busy",
+		  list_empty(handle_list) ? "empty" : "busy",
 		  list_empty(&conf->hold_list) ? "empty" : "busy",
"empty" : "busy", atomic_read(&conf->pending_full_writes), conf->bypass_count); - if (!list_empty(&conf->handle_list)) { - sh = list_entry(conf->handle_list.next, typeof(*sh), lru); + if (!list_empty(handle_list)) { + sh = list_entry(handle_list->next, typeof(*sh), lru); if (list_empty(&conf->hold_list)) conf->bypass_count = 0; @@ -4138,12 +4192,25 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf) ((conf->bypass_threshold && conf->bypass_count > conf->bypass_threshold) || atomic_read(&conf->pending_full_writes) == 0)) { - sh = list_entry(conf->hold_list.next, - typeof(*sh), lru); - conf->bypass_count -= conf->bypass_threshold; - if (conf->bypass_count < 0) - conf->bypass_count = 0; - } else + + list_for_each_entry(tmp, &conf->hold_list, lru) { + if (conf->worker_cnt_per_group == 0 || + group == ANY_GROUP || + !cpu_online(tmp->cpu) || + cpu_to_group(tmp->cpu) == group) { + sh = tmp; + break; + } + } + + if (sh) { + conf->bypass_count -= conf->bypass_threshold; + if (conf->bypass_count < 0) + conf->bypass_count = 0; + } + } + + if (!sh) return NULL; list_del_init(&sh->lru); @@ -4844,13 +4911,13 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) } #define MAX_STRIPE_BATCH 8 -static int handle_active_stripes(struct r5conf *conf) +static int handle_active_stripes(struct r5conf *conf, int group) { struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; int i, batch_size = 0; while (batch_size < MAX_STRIPE_BATCH && - (sh = __get_priority_stripe(conf)) != NULL) + (sh = __get_priority_stripe(conf, group)) != NULL) batch[batch_size++] = sh; if (batch_size == 0) @@ -4868,6 +4935,38 @@ static int handle_active_stripes(struct r5conf *conf) return batch_size; } +static void raid5_do_work(struct work_struct *work) +{ + struct r5worker *worker = container_of(work, struct r5worker, work); + struct r5worker_group *group = worker->group; + struct r5conf *conf = group->conf; + int group_id = group - conf->worker_groups; + int handled; + struct blk_plug plug; + + pr_debug("+++ raid5worker active\n"); + + blk_start_plug(&plug); + handled = 0; + spin_lock_irq(&conf->device_lock); + while (1) { + int batch_size, released; + + released = release_stripe_list(conf); + + batch_size = handle_active_stripes(conf, group_id); + if (!batch_size && !released) + break; + handled += batch_size; + } + pr_debug("%d stripes handled\n", handled); + + spin_unlock_irq(&conf->device_lock); + blk_finish_plug(&plug); + + pr_debug("--- raid5worker inactive\n"); +} + /* * This is our raid5 kernel thread. 
  *
@@ -4917,7 +5016,7 @@ static void raid5d(struct md_thread *thread)
 			handled++;
 		}
 
-		batch_size = handle_active_stripes(conf);
+		batch_size = handle_active_stripes(conf, ANY_GROUP);
 		if (!batch_size && !released)
 			break;
 		handled += batch_size;
@@ -5057,6 +5156,54 @@ static struct attribute_group raid5_attrs_group = {
 	.attrs = raid5_attrs,
 };
 
+static int alloc_thread_groups(struct r5conf *conf, int cnt)
+{
+	int i, j;
+	ssize_t size;
+	struct r5worker *workers;
+
+	conf->worker_cnt_per_group = cnt;
+	if (cnt == 0) {
+		conf->worker_groups = NULL;
+		return 0;
+	}
+	conf->group_cnt = num_possible_nodes();
+	size = sizeof(struct r5worker) * cnt;
+	workers = kzalloc(size * conf->group_cnt, GFP_NOIO);
+	conf->worker_groups = kzalloc(sizeof(struct r5worker_group) *
+				conf->group_cnt, GFP_NOIO);
+	if (!conf->worker_groups || !workers) {
+		kfree(workers);
+		kfree(conf->worker_groups);
+		conf->worker_groups = NULL;
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < conf->group_cnt; i++) {
+		struct r5worker_group *group;
+
+		group = &conf->worker_groups[i];
+		INIT_LIST_HEAD(&group->handle_list);
+		group->conf = conf;
+		group->workers = workers + i * cnt;
+
+		for (j = 0; j < cnt; j++) {
+			group->workers[j].group = group;
+			INIT_WORK(&group->workers[j].work, raid5_do_work);
+		}
+	}
+
+	return 0;
+}
+
+static void free_thread_groups(struct r5conf *conf)
+{
+	if (conf->worker_groups)
+		kfree(conf->worker_groups[0].workers);
+	kfree(conf->worker_groups);
+	conf->worker_groups = NULL;
+}
+
 static sector_t
 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 {
@@ -5097,6 +5244,7 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+	free_thread_groups(conf);
 	shrink_stripes(conf);
 	raid5_free_percpu(conf);
 	kfree(conf->disks);
@@ -5225,6 +5373,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
 	if (conf == NULL)
 		goto abort;
+	/* Don't enable multi-threading by default */
+	if (alloc_thread_groups(conf, 0))
+		goto abort;
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
@@ -6530,6 +6681,10 @@ static struct md_personality raid4_personality =
 
 static int __init raid5_init(void)
 {
+	raid5_wq = alloc_workqueue("raid5wq",
+		WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
+	if (!raid5_wq)
+		return -ENOMEM;
 	register_md_personality(&raid6_personality);
 	register_md_personality(&raid5_personality);
 	register_md_personality(&raid4_personality);
@@ -6541,6 +6696,7 @@ static void raid5_exit(void)
 	unregister_md_personality(&raid6_personality);
 	unregister_md_personality(&raid5_personality);
 	unregister_md_personality(&raid4_personality);
+	destroy_workqueue(raid5_wq);
 }
 
 module_init(raid5_init);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index a98f99d2a58f..105366371fbf 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -212,6 +212,7 @@ struct stripe_head {
 	enum check_states	check_state;
 	enum reconstruct_states	reconstruct_state;
 	spinlock_t		stripe_lock;
+	int			cpu;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -365,6 +366,17 @@ struct disk_info {
 	struct md_rdev	*rdev, *replacement;
 };
 
+struct r5worker {
+	struct work_struct	work;
+	struct r5worker_group	*group;
+};
+
+struct r5worker_group {
+	struct list_head	handle_list;
+	struct r5conf		*conf;
+	struct r5worker		*workers;
+};
+
 struct r5conf {
 	struct hlist_head	*stripe_hashtbl;
 	struct mddev		*mddev;
@@ -461,6 +473,9 @@ struct r5conf {
 	 * the new thread here until we fully activate the array.
 	 */
 	struct md_thread	*thread;
+	struct r5worker_group	*worker_groups;
+	int			group_cnt;
+	int			worker_cnt_per_group;
 };
 
 /*
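
A note on the routing in the patch above: every stripe_head records the CPU it
was submitted from (sh->cpu, set in init_stripe()), and on release
raid5_wakeup_stripe_thread() appends it to the handle_list of the worker group
for that CPU's NUMA node via cpu_to_group(). What follows is a minimal
userspace sketch of that routing, not part of the patch; the 2-node,
4-CPUs-per-node topology and every name in it are illustrative only.

/* toy model of the stripe -> per-NUMA-node worker group routing */
#include <stdio.h>

#define NR_CPUS		8
#define NR_NODES	2

/* stands in for the kernel's cpu_to_node(), aliased as cpu_to_group() */
static int cpu_to_group(int cpu)
{
	return cpu / (NR_CPUS / NR_NODES);
}

struct stripe {
	int cpu;		/* like stripe_head->cpu */
	struct stripe *next;
};

/* one FIFO per group, standing in for r5worker_group->handle_list */
static struct stripe *handle_list[NR_NODES];

static void wakeup_stripe_thread(struct stripe *sh)
{
	int group = cpu_to_group(sh->cpu);
	struct stripe **p = &handle_list[group];

	/* models list_add_tail(&sh->lru, &group->handle_list) */
	while (*p)
		p = &(*p)->next;
	sh->next = NULL;
	*p = sh;
	printf("stripe from cpu %d -> group %d\n", sh->cpu, group);
}

int main(void)
{
	struct stripe a = { .cpu = 1 }, b = { .cpu = 5 }, c = { .cpu = 6 };

	wakeup_stripe_thread(&a);	/* cpu 1 -> node 0 */
	wakeup_stripe_thread(&b);	/* cpu 5 -> node 1 */
	wakeup_stripe_thread(&c);	/* cpu 6 -> node 1 */
	return 0;
}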
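
On why raid5_wakeup_stripe_thread() can afford to call queue_work_on() for
every worker in the group: the workqueue core keeps a per-work-item PENDING
bit, and queue_work_on() returns false without queueing anything while that
bit is still set, so kicking an already-queued worker is a cheap no-op and the
loop merely ensures that all idle workers get a chance to drain the shared
handle_list. Below is a toy single-threaded model of that pending-bit
behaviour; it reuses the queue_work_on name for clarity but is illustrative
only, not kernel code.

#include <stdbool.h>
#include <stdio.h>

struct work { bool pending; const char *name; };

/* models queue_work_on(): succeeds only if the work is not already pending */
static bool queue_work_on(struct work *w)
{
	if (w->pending)
		return false;	/* already queued: nothing to do */
	w->pending = true;
	return true;
}

int main(void)
{
	struct work workers[3] = {
		{ false, "w0" },
		{ true,  "w1" },	/* still queued from an earlier wakeup */
		{ false, "w2" },
	};

	for (int i = 0; i < 3; i++)
		printf("%s: %s\n", workers[i].name,
		       queue_work_on(&workers[i]) ? "queued" : "already pending");
	return 0;
}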