From 51f2176d74ace4c3f58579a605ef5a9720befb00 Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Mon, 19 May 2014 15:49:45 -0700 Subject: [PATCH 01/18] sched/fair: Fix unlocked reads of some cfs_b->quota/period sched_cfs_period_timer() reads cfs_b->period without locks before calling do_sched_cfs_period_timer(), and similarly unthrottle_offline_cfs_rqs() would read cfs_b->period without the right lock. Thus a simultaneous change of bandwidth could cause corruption on any platform where ktime_t or u64 writes/reads are not atomic. Extend cfs_b->lock from do_sched_cfs_period_timer() to include the read of cfs_b->period to solve that issue; unthrottle_offline_cfs_rqs() can just use 1 rather than the exact quota, much like distribute_cfs_runtime() does. There is also an unlocked read of cfs_b->runtime_expires, but a race there would only delay runtime expiry by a tick. Still, the comparison should just be != anyway, which clarifies even that problem. Signed-off-by: Ben Segall Tested-by: Roman Gushchin [peterz: Fix compile warn] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140519224945.20303.93530.stgit@sword-of-the-dawn.mtv.corp.google.com Cc: pjt@google.com Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c9617b73bcc0..b71d8c39f1fd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3224,10 +3224,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) * has not truly expired. * * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. + * whether the global deadline has advanced. It is valid to compare + * cfs_b->runtime_expires without any locks since we only care about + * exact equality, so a partial write will still work. */ - if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { + if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; } else { @@ -3456,21 +3458,21 @@ next: static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { u64 runtime, runtime_expires; - int idle = 1, throttled; + int throttled; - raw_spin_lock(&cfs_b->lock); /* no need to continue the timer with no bandwidth constraint */ if (cfs_b->quota == RUNTIME_INF) - goto out_unlock; + goto out_deactivate; throttled = !list_empty(&cfs_b->throttled_cfs_rq); - /* idle depends on !throttled (for the case of a large deficit) */ - idle = cfs_b->idle && !throttled; cfs_b->nr_periods += overrun; - /* if we're going inactive then everything else can be deferred */ - if (idle) - goto out_unlock; + /* + * idle depends on !throttled (for the case of a large deficit), and if + * we're going inactive then everything else can be deferred + */ + if (cfs_b->idle && !throttled) + goto out_deactivate; /* * if we have relooped after returning idle once, we need to update our @@ -3484,7 +3486,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) if (!throttled) { /* mark as potentially idle for the upcoming period */ cfs_b->idle = 1; - goto out_unlock; + return 0; } /* account preceding periods in which throttling occurred */ @@ -3524,12 +3526,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) * timer to remain active while there are any throttled entities.) */ cfs_b->idle = 0; -out_unlock: - if (idle) - cfs_b->timer_active = 0; - raw_spin_unlock(&cfs_b->lock); - return idle; + return 0; + +out_deactivate: + cfs_b->timer_active = 0; + return 1; } /* a cfs_rq won't donate quota below this amount */ @@ -3706,6 +3708,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) int overrun; int idle = 0; + raw_spin_lock(&cfs_b->lock); for (;;) { now = hrtimer_cb_get_time(timer); overrun = hrtimer_forward(timer, now, cfs_b->period); @@ -3715,6 +3718,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) idle = do_sched_cfs_period_timer(cfs_b, overrun); } + raw_spin_unlock(&cfs_b->lock); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -3774,8 +3778,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) struct cfs_rq *cfs_rq; for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - if (!cfs_rq->runtime_enabled) continue; @@ -3783,7 +3785,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) * clock_task is not advancing so we just need to make sure * there's some valid quota amount */ - cfs_rq->runtime_remaining = cfs_b->quota; + cfs_rq->runtime_remaining = 1; if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } From ed61bbc69c773465782476c7e5869fa5607fa73a Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Tue, 20 May 2014 14:39:27 -0700 Subject: [PATCH 02/18] sched/balancing: Reduce the rate of needless idle load balancing The current no_hz idle load balancer do load balancing for *all* idle cpus, even though the time due to load balance for a particular idle cpu could be still a while in the future. This introduces a much higher load balancing rate than what is necessary. The patch changes the behavior by only doing idle load balancing on behalf of an idle cpu only when it is due for load balancing. On SGI's systems with over 3000 cores, the cpu responsible for idle balancing got overwhelmed with idle balancing, and introduces a lot of OS noise to workloads. This patch fixes the issue. Signed-off-by: Tim Chen Acked-by: Russ Anderson Reviewed-by: Rik van Riel Reviewed-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Len Brown Cc: Dimitri Sivanich Cc: Hedi Berriche Cc: Andi Kleen Cc: MichelLespinasse Cc: Peter Hurley Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1400621967.2970.280.camel@schen9-DESK Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b71d8c39f1fd..7a0c000b6005 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7193,12 +7193,17 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) rq = cpu_rq(balance_cpu); - raw_spin_lock_irq(&rq->lock); - update_rq_clock(rq); - update_idle_cpu_load(rq); - raw_spin_unlock_irq(&rq->lock); - - rebalance_domains(rq, CPU_IDLE); + /* + * If time for next balance is due, + * do the balance. + */ + if (time_after_eq(jiffies, rq->next_balance)) { + raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); + update_idle_cpu_load(rq); + raw_spin_unlock_irq(&rq->lock); + rebalance_domains(rq, CPU_IDLE); + } if (time_after(this_rq->next_balance, rq->next_balance)) this_rq->next_balance = rq->next_balance; From 2538d960d0c74cdc639f05723e04a67aed1efdf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Sch=C3=B6lling?= Date: Thu, 22 May 2014 19:45:23 +0200 Subject: [PATCH 03/18] sched/fair: Use time_after() in record_wakee() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be future-proof and for better readability the time comparisons are modified to use time_after() instead of plain, error-prone math. Signed-off-by: Manuel Schölling Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1400780723-24626-1-git-send-email-manuel.schoelling@gmx.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a0c000b6005..c63dde984956 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4066,7 +4066,7 @@ static void record_wakee(struct task_struct *p) * about the boundary, really active task won't care * about the loss. */ - if (jiffies > current->wakee_flip_decay_ts + HZ) { + if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { current->wakee_flips >>= 1; current->wakee_flip_decay_ts = jiffies; } From fa93384f40deeb294fd29f2fdcadbd0ebc2dedf1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 23 May 2014 13:20:42 +0300 Subject: [PATCH 04/18] sched: Fix signedness bug in yield_to() yield_to() is supposed to return -ESRCH if there is no task to yield to, but because the type is bool that is the same as returning true. The only place I see which cares is kvm_vcpu_on_spin(). Signed-off-by: Dan Carpenter Reviewed-by: Raghavendra Signed-off-by: Peter Zijlstra Cc: Gleb Natapov Cc: Linus Torvalds Cc: Paolo Bonzini Cc: kvm@vger.kernel.org Link: http://lkml.kernel.org/r/20140523102042.GA7267@mwanda Signed-off-by: Ingo Molnar --- include/linux/kvm_host.h | 2 +- include/linux/sched.h | 2 +- kernel/sched/core.c | 2 +- virt/kvm/kvm_main.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7d21cf9f4380..3c4bcf146159 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -584,7 +584,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); -bool kvm_vcpu_yield_to(struct kvm_vcpu *target); +int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); diff --git a/include/linux/sched.h b/include/linux/sched.h index 0f91d00efd87..6790c3b42072 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2180,7 +2180,7 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { } static inline void sched_autogroup_exit(struct signal_struct *sig) { } #endif -extern bool yield_to(struct task_struct *p, bool preempt); +extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 321d800e4baa..afcc84234a3e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4195,7 +4195,7 @@ EXPORT_SYMBOL(yield); * false (0) if we failed to boost the target. * -ESRCH if there's no task to yield to. */ -bool __sched yield_to(struct task_struct *p, bool preempt) +int __sched yield_to(struct task_struct *p, bool preempt) { struct task_struct *curr = current; struct rq *rq, *p_rq; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 56baae8c2f56..86d1c457458d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1708,11 +1708,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvm_vcpu_kick); #endif /* !CONFIG_S390 */ -bool kvm_vcpu_yield_to(struct kvm_vcpu *target) +int kvm_vcpu_yield_to(struct kvm_vcpu *target) { struct pid *pid; struct task_struct *task = NULL; - bool ret = false; + int ret = 0; rcu_read_lock(); pid = rcu_dereference(target->pid); From 5ef20ca181ec592e4684a45f4d5f1385f6055534 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:34 -0400 Subject: [PATCH 05/18] sched/fair: Remove "power" from 'struct numa_stats' It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. To make things explicit and not create more confusion with the existing "capacity" member, let's rename things as follows: power -> compute_capacity capacity -> task_capacity Note: none of those fields are actually used outside update_numa_stats(). Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-2e2ndymj5gyshyjq8am79f20@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c63dde984956..1cfe5a25086d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1026,10 +1026,10 @@ struct numa_stats { unsigned long load; /* Total compute capacity of CPUs on a node */ - unsigned long power; + unsigned long compute_capacity; /* Approximate capacity in terms of runnable tasks on a node */ - unsigned long capacity; + unsigned long task_capacity; int has_capacity; }; @@ -1046,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(cpu); - ns->power += power_of(cpu); + ns->compute_capacity += power_of(cpu); cpus++; } @@ -1062,9 +1062,10 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; - ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); - ns->has_capacity = (ns->nr_running < ns->capacity); + ns->load = (ns->load * SCHED_POWER_SCALE) / ns->compute_capacity; + ns->task_capacity = + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_POWER_SCALE); + ns->has_capacity = (ns->nr_running < ns->task_capacity); } struct task_numa_env { From 1b6a7495d343fcfe22ff3a8285544bb8e40f1920 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:35 -0400 Subject: [PATCH 06/18] sched/fair: Change "has_capacity" to "has_free_capacity" The capacity of a CPU/group should be some intrinsic value that doesn't change with task placement. It is like a container which capacity is stable regardless of the amount of liquid in it (its "utilization")... unless the container itself is crushed that is, but that's another story. Therefore let's rename "has_capacity" to "has_free_capacity" in order to better convey the wanted meaning. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-djzkk027jm0e8x8jxy70opzh@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1cfe5a25086d..8993dfa2e82b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1030,7 +1030,7 @@ struct numa_stats { /* Approximate capacity in terms of runnable tasks on a node */ unsigned long task_capacity; - int has_capacity; + int has_free_capacity; }; /* @@ -1056,8 +1056,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid) * the @ns structure is NULL'ed and task_numa_compare() will * not find this node attractive. * - * We'll either bail at !has_capacity, or we'll detect a huge imbalance - * and bail there. + * We'll either bail at !has_free_capacity, or we'll detect a huge + * imbalance and bail there. */ if (!cpus) return; @@ -1065,7 +1065,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ns->load = (ns->load * SCHED_POWER_SCALE) / ns->compute_capacity; ns->task_capacity = DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_POWER_SCALE); - ns->has_capacity = (ns->nr_running < ns->task_capacity); + ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } struct task_numa_env { @@ -1196,8 +1196,8 @@ static void task_numa_compare(struct task_numa_env *env, if (!cur) { /* Is there capacity at our destination? */ - if (env->src_stats.has_capacity && - !env->dst_stats.has_capacity) + if (env->src_stats.has_free_capacity && + !env->dst_stats.has_free_capacity) goto unlock; goto balance; @@ -1302,8 +1302,8 @@ static int task_numa_migrate(struct task_struct *p) groupimp = group_weight(p, env.dst_nid) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); - /* If the preferred nid has capacity, try to use it. */ - if (env.dst_stats.has_capacity) + /* If the preferred nid has free capacity, try to use it. */ + if (env.dst_stats.has_free_capacity) task_numa_find_cpu(&env, taskimp, groupimp); /* No space available on the preferred nid. Look elsewhere. */ @@ -5538,7 +5538,7 @@ struct sg_lb_stats { unsigned int idle_cpus; unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ - int group_has_capacity; /* Is there extra capacity in the group? */ + int group_has_free_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -5905,7 +5905,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_capacity = sg_capacity(env, group); if (sgs->group_capacity > sgs->sum_nr_running) - sgs->group_has_capacity = 1; + sgs->group_has_free_capacity = 1; } /** @@ -6029,7 +6029,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd * with a large weight task outweighs the tasks on the system). */ if (prefer_sibling && sds->local && - sds->local_stat.group_has_capacity) + sds->local_stat.group_has_free_capacity) sgs->group_capacity = min(sgs->group_capacity, 1U); if (update_sd_pick_busiest(env, sds, sg, sgs)) { @@ -6289,8 +6289,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && - !busiest->group_has_capacity) + if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && + !busiest->group_has_free_capacity) goto force_balance; /* From 0fedc6c8e34f4ce0b37b1f25c3619b4a8faa244c Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:36 -0400 Subject: [PATCH 07/18] sched/fair: Disambiguate existing/remaining "capacity" usage We have "power" (which should actually become "capacity") and "capacity" which is a scaled down "capacity factor" in terms of unitary tasks. Let's use "capacity_factor" to make room for proper usage of "capacity" later. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-gk1co8sqdev3763opqm6ovml@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8993dfa2e82b..e401e446e87c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5534,7 +5534,7 @@ struct sg_lb_stats { unsigned long load_per_task; unsigned long group_power; unsigned int sum_nr_running; /* Nr tasks running in the group */ - unsigned int group_capacity; + unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ @@ -5829,15 +5829,15 @@ static inline int sg_imbalanced(struct sched_group *group) } /* - * Compute the group capacity. + * Compute the group capacity factor. * * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by * first dividing out the smt factor and computing the actual number of cores * and limit power unit capacity with that. */ -static inline int sg_capacity(struct lb_env *env, struct sched_group *group) +static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) { - unsigned int capacity, smt, cpus; + unsigned int capacity_factor, smt, cpus; unsigned int power, power_orig; power = group->sgp->power; @@ -5846,13 +5846,13 @@ static inline int sg_capacity(struct lb_env *env, struct sched_group *group) /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); - capacity = cpus / smt; /* cores */ + capacity_factor = cpus / smt; /* cores */ - capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); - if (!capacity) - capacity = fix_small_capacity(env->sd, group); + capacity_factor = min_t(unsigned, capacity_factor, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); + if (!capacity_factor) + capacity_factor = fix_small_capacity(env->sd, group); - return capacity; + return capacity_factor; } /** @@ -5902,9 +5902,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; sgs->group_imb = sg_imbalanced(group); - sgs->group_capacity = sg_capacity(env, group); + sgs->group_capacity_factor = sg_capacity_factor(env, group); - if (sgs->group_capacity > sgs->sum_nr_running) + if (sgs->group_capacity_factor > sgs->sum_nr_running) sgs->group_has_free_capacity = 1; } @@ -5929,7 +5929,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load <= sds->busiest_stat.avg_load) return false; - if (sgs->sum_nr_running > sgs->group_capacity) + if (sgs->sum_nr_running > sgs->group_capacity_factor) return true; if (sgs->group_imb) @@ -6020,17 +6020,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /* * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity to one so that we'll try + * first, lower the sg capacity factor to one so that we'll try * and move all the excess tasks away. We lower the capacity * of a group only if the local group has the capacity to fit - * these excess tasks, i.e. nr_running < group_capacity. The + * these excess tasks, i.e. nr_running < group_capacity_factor. The * extra check prevents the case where you always pull from the * heaviest group when it is already under-utilized (possible * with a large weight task outweighs the tasks on the system). */ if (prefer_sibling && sds->local && sds->local_stat.group_has_free_capacity) - sgs->group_capacity = min(sgs->group_capacity, 1U); + sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -6204,7 +6204,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * have to drop below capacity to reach cpu-load equilibrium. */ load_above_capacity = - (busiest->sum_nr_running - busiest->group_capacity); + (busiest->sum_nr_running - busiest->group_capacity_factor); load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); load_above_capacity /= busiest->group_power; @@ -6348,7 +6348,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long power, capacity, wl; + unsigned long power, capacity_factor, wl; enum fbq_type rt; rq = cpu_rq(i); @@ -6377,9 +6377,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; power = power_of(i); - capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); - if (!capacity) - capacity = fix_small_capacity(env->sd, group); + capacity_factor = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); + if (!capacity_factor) + capacity_factor = fix_small_capacity(env->sd, group); wl = weighted_cpuload(i); @@ -6387,7 +6387,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, * When comparing with imbalance, use weighted_cpuload() * which is not scaled with the cpu power. */ - if (capacity && rq->nr_running == 1 && wl > env->imbalance) + if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) continue; /* From 63b2ca30bdb3dbf60bc7ac5f46713c0d32308261 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:37 -0400 Subject: [PATCH 08/18] sched: Let 'struct sched_group_power' care about CPU capacity It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. Since struct sched_group_power is really about compute capacity of sched groups, let's rename it to struct sched_group_capacity. Similarly sgp becomes sgc. Related variables and functions dealing with groups are also adjusted accordingly. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-5yeix833vvgf2uyj5o36hpu9@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched/core.c | 81 +++++++++++++------------- kernel/sched/fair.c | 131 +++++++++++++++++++++--------------------- kernel/sched/sched.h | 16 +++--- 4 files changed, 115 insertions(+), 115 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6790c3b42072..a96f03598c61 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1013,7 +1013,7 @@ typedef const int (*sched_domain_flags_f)(void); struct sd_data { struct sched_domain **__percpu sd; struct sched_group **__percpu sg; - struct sched_group_power **__percpu sgp; + struct sched_group_capacity **__percpu sgc; }; struct sched_domain_topology_level { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index afcc84234a3e..2e1fb0902200 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5221,14 +5221,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, } /* - * Even though we initialize ->power to something semi-sane, - * we leave power_orig unset. This allows us to detect if + * Even though we initialize ->capacity to something semi-sane, + * we leave capacity_orig unset. This allows us to detect if * domain iteration is still funny without causing /0 traps. */ - if (!group->sgp->power_orig) { + if (!group->sgc->capacity_orig) { printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not " - "set\n"); + printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); break; } @@ -5250,9 +5249,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->sgp->power != SCHED_POWER_SCALE) { - printk(KERN_CONT " (cpu_power = %d)", - group->sgp->power); + if (group->sgc->capacity != SCHED_POWER_SCALE) { + printk(KERN_CONT " (cpu_capacity = %d)", + group->sgc->capacity); } group = group->next; @@ -5466,7 +5465,7 @@ static struct root_domain *alloc_rootdomain(void) return rd; } -static void free_sched_groups(struct sched_group *sg, int free_sgp) +static void free_sched_groups(struct sched_group *sg, int free_sgc) { struct sched_group *tmp, *first; @@ -5477,8 +5476,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp) do { tmp = sg->next; - if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) - kfree(sg->sgp); + if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) + kfree(sg->sgc); kfree(sg); sg = tmp; @@ -5496,7 +5495,7 @@ static void free_sched_domain(struct rcu_head *rcu) if (sd->flags & SD_OVERLAP) { free_sched_groups(sd->groups, 1); } else if (atomic_dec_and_test(&sd->groups->ref)) { - kfree(sd->groups->sgp); + kfree(sd->groups->sgc); kfree(sd->groups); } kfree(sd); @@ -5707,17 +5706,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); - sg->sgp = *per_cpu_ptr(sdd->sgp, i); - if (atomic_inc_return(&sg->sgp->ref) == 1) + sg->sgc = *per_cpu_ptr(sdd->sgc, i); + if (atomic_inc_return(&sg->sgc->ref) == 1) build_group_mask(sd, sg); /* - * Initialize sgp->power such that even if we mess up the + * Initialize sgc->capacity such that even if we mess up the * domains and no possible iteration will get us here, we won't * die on a /0 trap. */ - sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); - sg->sgp->power_orig = sg->sgp->power; + sg->sgc->capacity = SCHED_POWER_SCALE * cpumask_weight(sg_span); + sg->sgc->capacity_orig = sg->sgc->capacity; /* * Make sure the first group of this domain contains the @@ -5755,8 +5754,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) if (sg) { *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); - atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ + (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); + atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ } return cpu; @@ -5819,16 +5818,16 @@ build_sched_groups(struct sched_domain *sd, int cpu) } /* - * Initialize sched groups cpu_power. + * Initialize sched groups cpu_capacity. * - * cpu_power indicates the capacity of sched group, which is used while + * cpu_capacity indicates the capacity of sched group, which is used while * distributing the load between different sched groups in a sched domain. - * Typically cpu_power for all the groups in a sched domain will be same unless - * there are asymmetries in the topology. If there are asymmetries, group - * having more cpu_power will pickup more load compared to the group having - * less cpu_power. + * Typically cpu_capacity for all the groups in a sched domain will be same + * unless there are asymmetries in the topology. If there are asymmetries, + * group having more cpu_capacity will pickup more load compared to the + * group having less cpu_capacity. */ -static void init_sched_groups_power(int cpu, struct sched_domain *sd) +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; @@ -5842,8 +5841,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) if (cpu != group_balance_cpu(sg)) return; - update_group_power(sd, cpu); - atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); + update_group_capacity(sd, cpu); + atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } /* @@ -5934,8 +5933,8 @@ static void claim_allocations(int cpu, struct sched_domain *sd) if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) - *per_cpu_ptr(sdd->sgp, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) + *per_cpu_ptr(sdd->sgc, cpu) = NULL; } #ifdef CONFIG_NUMA @@ -6337,14 +6336,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sg) return -ENOMEM; - sdd->sgp = alloc_percpu(struct sched_group_power *); - if (!sdd->sgp) + sdd->sgc = alloc_percpu(struct sched_group_capacity *); + if (!sdd->sgc) return -ENOMEM; for_each_cpu(j, cpu_map) { struct sched_domain *sd; struct sched_group *sg; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); @@ -6362,12 +6361,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sg, j) = sg; - sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), + sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); - if (!sgp) + if (!sgc) return -ENOMEM; - *per_cpu_ptr(sdd->sgp, j) = sgp; + *per_cpu_ptr(sdd->sgc, j) = sgc; } } @@ -6394,15 +6393,15 @@ static void __sdt_free(const struct cpumask *cpu_map) if (sdd->sg) kfree(*per_cpu_ptr(sdd->sg, j)); - if (sdd->sgp) - kfree(*per_cpu_ptr(sdd->sgp, j)); + if (sdd->sgc) + kfree(*per_cpu_ptr(sdd->sgc, j)); } free_percpu(sdd->sd); sdd->sd = NULL; free_percpu(sdd->sg); sdd->sg = NULL; - free_percpu(sdd->sgp); - sdd->sgp = NULL; + free_percpu(sdd->sgc); + sdd->sgc = NULL; } } @@ -6479,7 +6478,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { claim_allocations(i, sd); - init_sched_groups_power(i, sd); + init_sched_groups_capacity(i, sd); } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e401e446e87c..36bd4d23fca8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4369,8 +4369,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, avg_load += load; } - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; + /* Adjust by relative CPU capacity of the group */ + avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgc->capacity; if (local_group) { this_load = avg_load; @@ -5532,7 +5532,7 @@ struct sg_lb_stats { unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; - unsigned long group_power; + unsigned long group_capacity; unsigned int sum_nr_running; /* Nr tasks running in the group */ unsigned int group_capacity_factor; unsigned int idle_cpus; @@ -5553,7 +5553,7 @@ struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ @@ -5572,7 +5572,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .busiest = NULL, .local = NULL, .total_load = 0UL, - .total_pwr = 0UL, + .total_capacity = 0UL, .busiest_stat = { .avg_load = 0UL, }, @@ -5681,7 +5681,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power >>= SCHED_POWER_SHIFT; } - sdg->sgp->power_orig = power; + sdg->sgc->capacity_orig = power; if (sched_feat(ARCH_POWER)) power *= arch_scale_freq_power(sd, cpu); @@ -5697,26 +5697,26 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power = 1; cpu_rq(cpu)->cpu_power = power; - sdg->sgp->power = power; + sdg->sgc->capacity = power; } -void update_group_power(struct sched_domain *sd, int cpu) +void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long power, power_orig; + unsigned long capacity, capacity_orig; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); interval = clamp(interval, 1UL, max_load_balance_interval); - sdg->sgp->next_update = jiffies + interval; + sdg->sgc->next_update = jiffies + interval; if (!child) { update_cpu_power(sd, cpu); return; } - power_orig = power = 0; + capacity_orig = capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -5725,31 +5725,31 @@ void update_group_power(struct sched_domain *sd, int cpu) */ for_each_cpu(cpu, sched_group_cpus(sdg)) { - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); /* - * build_sched_domains() -> init_sched_groups_power() + * build_sched_domains() -> init_sched_groups_capacity() * gets here before we've attached the domains to the * runqueues. * * Use power_of(), which is set irrespective of domains * in update_cpu_power(). * - * This avoids power/power_orig from being 0 and + * This avoids capacity/capacity_orig from being 0 and * causing divide-by-zero issues on boot. * - * Runtime updates will correct power_orig. + * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - power_orig += power_of(cpu); - power += power_of(cpu); + capacity_orig += power_of(cpu); + capacity += power_of(cpu); continue; } - sgp = rq->sd->groups->sgp; - power_orig += sgp->power_orig; - power += sgp->power; + sgc = rq->sd->groups->sgc; + capacity_orig += sgc->capacity_orig; + capacity += sgc->capacity; } } else { /* @@ -5759,14 +5759,14 @@ void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { - power_orig += group->sgp->power_orig; - power += group->sgp->power; + capacity_orig += group->sgc->capacity_orig; + capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } - sdg->sgp->power_orig = power_orig; - sdg->sgp->power = power; + sdg->sgc->capacity_orig = capacity_orig; + sdg->sgc->capacity = capacity; } /* @@ -5786,9 +5786,9 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) return 0; /* - * If ~90% of the cpu_power is still there, we're good. + * If ~90% of the cpu_capacity is still there, we're good. */ - if (group->sgp->power * 32 > group->sgp->power_orig * 29) + if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) return 1; return 0; @@ -5825,7 +5825,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) static inline int sg_imbalanced(struct sched_group *group) { - return group->sgp->imbalance; + return group->sgc->imbalance; } /* @@ -5833,22 +5833,23 @@ static inline int sg_imbalanced(struct sched_group *group) * * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by * first dividing out the smt factor and computing the actual number of cores - * and limit power unit capacity with that. + * and limit unit capacity with that. */ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) { unsigned int capacity_factor, smt, cpus; - unsigned int power, power_orig; + unsigned int capacity, capacity_orig; - power = group->sgp->power; - power_orig = group->sgp->power_orig; + capacity = group->sgc->capacity; + capacity_orig = group->sgc->capacity_orig; cpus = group->group_weight; - /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ - smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); + /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ + smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, capacity_orig); capacity_factor = cpus / smt; /* cores */ - capacity_factor = min_t(unsigned, capacity_factor, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); + capacity_factor = min_t(unsigned, + capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_POWER_SCALE)); if (!capacity_factor) capacity_factor = fix_small_capacity(env->sd, group); @@ -5892,9 +5893,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; } - /* Adjust by relative CPU power of the group */ - sgs->group_power = group->sgp->power; - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; + /* Adjust by relative CPU capacity of the group */ + sgs->group_capacity = group->sgc->capacity; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_capacity; if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; @@ -6009,8 +6010,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sgs = &sds->local_stat; if (env->idle != CPU_NEWLY_IDLE || - time_after_eq(jiffies, sg->sgp->next_update)) - update_group_power(env->sd, env->dst_cpu); + time_after_eq(jiffies, sg->sgc->next_update)) + update_group_capacity(env->sd, env->dst_cpu); } update_sg_lb_stats(env, sg, load_idx, local_group, sgs); @@ -6040,7 +6041,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; - sds->total_pwr += sgs->group_power; + sds->total_capacity += sgs->group_capacity; sg = sg->next; } while (sg != env->sd->groups); @@ -6087,7 +6088,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) return 0; env->imbalance = DIV_ROUND_CLOSEST( - sds->busiest_stat.avg_load * sds->busiest_stat.group_power, + sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, SCHED_POWER_SCALE); return 1; @@ -6103,7 +6104,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) static inline void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { - unsigned long tmp, pwr_now = 0, pwr_move = 0; + unsigned long tmp, capa_now = 0, capa_move = 0; unsigned int imbn = 2; unsigned long scaled_busy_load_per_task; struct sg_lb_stats *local, *busiest; @@ -6118,7 +6119,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) scaled_busy_load_per_task = (busiest->load_per_task * SCHED_POWER_SCALE) / - busiest->group_power; + busiest->group_capacity; if (busiest->avg_load + scaled_busy_load_per_task >= local->avg_load + (scaled_busy_load_per_task * imbn)) { @@ -6132,34 +6133,34 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) * moving them. */ - pwr_now += busiest->group_power * + capa_now += busiest->group_capacity * min(busiest->load_per_task, busiest->avg_load); - pwr_now += local->group_power * + capa_now += local->group_capacity * min(local->load_per_task, local->avg_load); - pwr_now /= SCHED_POWER_SCALE; + capa_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ if (busiest->avg_load > scaled_busy_load_per_task) { - pwr_move += busiest->group_power * + capa_move += busiest->group_capacity * min(busiest->load_per_task, busiest->avg_load - scaled_busy_load_per_task); } /* Amount of load we'd add */ - if (busiest->avg_load * busiest->group_power < + if (busiest->avg_load * busiest->group_capacity < busiest->load_per_task * SCHED_POWER_SCALE) { - tmp = (busiest->avg_load * busiest->group_power) / - local->group_power; + tmp = (busiest->avg_load * busiest->group_capacity) / + local->group_capacity; } else { tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - local->group_power; + local->group_capacity; } - pwr_move += local->group_power * + capa_move += local->group_capacity * min(local->load_per_task, local->avg_load + tmp); - pwr_move /= SCHED_POWER_SCALE; + capa_move /= SCHED_POWER_SCALE; /* Move if we gain throughput */ - if (pwr_move > pwr_now) + if (capa_move > capa_now) env->imbalance = busiest->load_per_task; } @@ -6207,7 +6208,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (busiest->sum_nr_running - busiest->group_capacity_factor); load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - load_above_capacity /= busiest->group_power; + load_above_capacity /= busiest->group_capacity; } /* @@ -6222,8 +6223,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* How much load to actually move to equalise the imbalance */ env->imbalance = min( - max_pull * busiest->group_power, - (sds->avg_load - local->avg_load) * local->group_power + max_pull * busiest->group_capacity, + (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_POWER_SCALE; /* @@ -6278,7 +6279,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; - sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; + sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_capacity; /* * If the busiest group is imbalanced the below checks don't @@ -6611,7 +6612,7 @@ more_balance: * We failed to reach balance because of affinity. */ if (sd_parent) { - int *group_imbalance = &sd_parent->groups->sgp->imbalance; + int *group_imbalance = &sd_parent->groups->sgc->imbalance; if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { *group_imbalance = 1; @@ -6998,7 +6999,7 @@ static inline void set_cpu_sd_state_busy(void) goto unlock; sd->nohz_idle = 0; - atomic_inc(&sd->groups->sgp->nr_busy_cpus); + atomic_inc(&sd->groups->sgc->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -7015,7 +7016,7 @@ void set_cpu_sd_state_idle(void) goto unlock; sd->nohz_idle = 1; - atomic_dec(&sd->groups->sgp->nr_busy_cpus); + atomic_dec(&sd->groups->sgc->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -7219,7 +7220,7 @@ end: * of an idle cpu is the system. * - This rq has more than one task. * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's power. + * busy cpu's exceeding the group's capacity. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ @@ -7227,7 +7228,7 @@ static inline int nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; if (unlikely(rq->idle_balance)) @@ -7257,8 +7258,8 @@ static inline int nohz_kick_needed(struct rq *rq) sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (sd) { - sgp = sd->groups->sgp; - nr_busy = atomic_read(&sgp->nr_busy_cpus); + sgc = sd->groups->sgc; + nr_busy = atomic_read(&sgc->nr_busy_cpus); if (nr_busy > 1) goto need_kick_unlock; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 600e2291a75c..a5b957d53c92 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -728,15 +728,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); -struct sched_group_power { +struct sched_group_capacity { atomic_t ref; /* - * CPU power of this group, SCHED_LOAD_SCALE being max power for a - * single CPU. + * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity + * for a single CPU. */ - unsigned int power, power_orig; + unsigned int capacity, capacity_orig; unsigned long next_update; - int imbalance; /* XXX unrelated to power but shared group state */ + int imbalance; /* XXX unrelated to capacity but shared group state */ /* * Number of busy cpus in this group. */ @@ -750,7 +750,7 @@ struct sched_group { atomic_t ref; unsigned int group_weight; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; /* * The CPUs this group covers. @@ -773,7 +773,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) */ static inline struct cpumask *sched_group_mask(struct sched_group *sg) { - return to_cpumask(sg->sgp->cpumask); + return to_cpumask(sg->sgc->cpumask); } /** @@ -1167,7 +1167,7 @@ extern const struct sched_class idle_sched_class; #ifdef CONFIG_SMP -extern void update_group_power(struct sched_domain *sd, int cpu); +extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); From ced549fa5fc1fdaf7fac93894dc673092eb3dc20 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:38 -0400 Subject: [PATCH 09/18] sched: Remove remaining dubious usage of "power" It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. This is the remaining "power" -> "capacity" rename for local symbols. Those symbols visible to the rest of the kernel are not included yet. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-yyyhohzhkwnaotr3lx8zd5aa@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +-- kernel/sched/fair.c | 102 +++++++++++++++++++++---------------------- kernel/sched/sched.h | 2 +- 3 files changed, 55 insertions(+), 55 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2e1fb0902200..07bc78a50329 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5764,7 +5764,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) /* * build_sched_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. + * and ->cpu_capacity to 0. * * Assumes the sched_domain tree is fully constructed */ @@ -6471,7 +6471,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, } } - /* Calculate CPU power for physical packages and nodes */ + /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { if (!cpumask_test_cpu(i, cpu_map)) continue; @@ -6921,7 +6921,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_power = SCHED_POWER_SCALE; + rq->cpu_capacity = SCHED_POWER_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 36bd4d23fca8..58684f684fa8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1017,7 +1017,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long power_of(int cpu); +static unsigned long capacity_of(int cpu); static long effective_load(struct task_group *tg, int cpu, long wl, long wg); /* Cached statistics for all CPUs within a node */ @@ -1046,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(cpu); - ns->compute_capacity += power_of(cpu); + ns->compute_capacity += capacity_of(cpu); cpus++; } @@ -1214,7 +1214,7 @@ balance: orig_dst_load = env->dst_stats.load; orig_src_load = env->src_stats.load; - /* XXX missing power terms */ + /* XXX missing capacity terms */ load = task_h_load(env->p); dst_load = orig_dst_load + load; src_load = orig_src_load - load; @@ -4043,9 +4043,9 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long power_of(int cpu) +static unsigned long capacity_of(int cpu) { - return cpu_rq(cpu)->cpu_power; + return cpu_rq(cpu)->cpu_capacity; } static unsigned long cpu_avg_load_per_task(int cpu) @@ -4288,12 +4288,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) s64 this_eff_load, prev_eff_load; this_eff_load = 100; - this_eff_load *= power_of(prev_cpu); + this_eff_load *= capacity_of(prev_cpu); this_eff_load *= this_load + effective_load(tg, this_cpu, weight, weight); prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= power_of(this_cpu); + prev_eff_load *= capacity_of(this_cpu); prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); balanced = this_eff_load <= prev_eff_load; @@ -4950,14 +4950,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) * - * P_i is the cpu power (or compute capacity) of cpu i, typically it is the + * C_i is the compute capacity of cpu i, typically it is the * fraction of 'recent' time available for SCHED_OTHER task execution. But it * can also include other factors [XXX]. * * To achieve this balance we define a measure of imbalance which follows * directly from (1): * - * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) + * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4) * * We them move tasks around to minimize the imbalance. In the continuous * function space it is obvious this converges, in the discrete case we get @@ -5607,17 +5607,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) { return SCHED_POWER_SCALE; } unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) { - return default_scale_freq_power(sd, cpu); + return default_scale_capacity(sd, cpu); } -static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; unsigned long smt_gain = sd->smt_gain; @@ -5629,10 +5629,10 @@ static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) { - return default_scale_smt_power(sd, cpu); + return default_scale_smt_capacity(sd, cpu); } -static unsigned long scale_rt_power(int cpu) +static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); u64 total, available, age_stamp, avg; @@ -5652,7 +5652,7 @@ static unsigned long scale_rt_power(int cpu) total = sched_avg_period() + delta; if (unlikely(total < avg)) { - /* Ensures that power won't end up being negative */ + /* Ensures that capacity won't end up being negative */ available = 0; } else { available = total - avg; @@ -5666,38 +5666,38 @@ static unsigned long scale_rt_power(int cpu) return div_u64(available, total); } -static void update_cpu_power(struct sched_domain *sd, int cpu) +static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; - unsigned long power = SCHED_POWER_SCALE; + unsigned long capacity = SCHED_POWER_SCALE; struct sched_group *sdg = sd->groups; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { if (sched_feat(ARCH_POWER)) - power *= arch_scale_smt_power(sd, cpu); + capacity *= arch_scale_smt_power(sd, cpu); else - power *= default_scale_smt_power(sd, cpu); + capacity *= default_scale_smt_capacity(sd, cpu); - power >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_POWER_SHIFT; } - sdg->sgc->capacity_orig = power; + sdg->sgc->capacity_orig = capacity; if (sched_feat(ARCH_POWER)) - power *= arch_scale_freq_power(sd, cpu); + capacity *= arch_scale_freq_power(sd, cpu); else - power *= default_scale_freq_power(sd, cpu); + capacity *= default_scale_capacity(sd, cpu); - power >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_POWER_SHIFT; - power *= scale_rt_power(cpu); - power >>= SCHED_POWER_SHIFT; + capacity *= scale_rt_capacity(cpu); + capacity >>= SCHED_POWER_SHIFT; - if (!power) - power = 1; + if (!capacity) + capacity = 1; - cpu_rq(cpu)->cpu_power = power; - sdg->sgc->capacity = power; + cpu_rq(cpu)->cpu_capacity = capacity; + sdg->sgc->capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) @@ -5712,7 +5712,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) sdg->sgc->next_update = jiffies + interval; if (!child) { - update_cpu_power(sd, cpu); + update_cpu_capacity(sd, cpu); return; } @@ -5733,8 +5733,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * gets here before we've attached the domains to the * runqueues. * - * Use power_of(), which is set irrespective of domains - * in update_cpu_power(). + * Use capacity_of(), which is set irrespective of domains + * in update_cpu_capacity(). * * This avoids capacity/capacity_orig from being 0 and * causing divide-by-zero issues on boot. @@ -5742,8 +5742,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - capacity_orig += power_of(cpu); - capacity += power_of(cpu); + capacity_orig += capacity_of(cpu); + capacity += capacity_of(cpu); continue; } @@ -5831,7 +5831,7 @@ static inline int sg_imbalanced(struct sched_group *group) /* * Compute the group capacity factor. * - * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by + * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by * first dividing out the smt factor and computing the actual number of cores * and limit unit capacity with that. */ @@ -6129,7 +6129,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) /* * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by + * however we may be able to increase total CPU capacity used by * moving them. */ @@ -6190,7 +6190,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * In the presence of smp nice balancing, certain scenarios can have * max load less than avg load(as we skip the groups at or below - * its cpu_power, while calculating max_load..) + * its cpu_capacity, while calculating max_load..) */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { @@ -6345,11 +6345,11 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long busiest_load = 0, busiest_power = 1; + unsigned long busiest_load = 0, busiest_capacity = 1; int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long power, capacity_factor, wl; + unsigned long capacity, capacity_factor, wl; enum fbq_type rt; rq = cpu_rq(i); @@ -6377,8 +6377,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - power = power_of(i); - capacity_factor = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); + capacity = capacity_of(i); + capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_POWER_SCALE); if (!capacity_factor) capacity_factor = fix_small_capacity(env->sd, group); @@ -6386,25 +6386,25 @@ static struct rq *find_busiest_queue(struct lb_env *env, /* * When comparing with imbalance, use weighted_cpuload() - * which is not scaled with the cpu power. + * which is not scaled with the cpu capacity. */ if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) continue; /* * For the load comparisons with the other cpu's, consider - * the weighted_cpuload() scaled with the cpu power, so that - * the load can be moved away from the cpu that is potentially - * running at a lower capacity. + * the weighted_cpuload() scaled with the cpu capacity, so + * that the load can be moved away from the cpu that is + * potentially running at a lower capacity. * - * Thus we're looking for max(wl_i / power_i), crosswise + * Thus we're looking for max(wl_i / capacity_i), crosswise * multiplication to rid ourselves of the division works out - * to: wl_i * power_j > wl_j * power_i; where j is our - * previous maximum. + * to: wl_i * capacity_j > wl_j * capacity_i; where j is + * our previous maximum. */ - if (wl * busiest_power > busiest_load * power) { + if (wl * busiest_capacity > busiest_load * capacity) { busiest_load = wl; - busiest_power = power; + busiest_capacity = capacity; busiest = rq; } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a5b957d53c92..956b8ca24893 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -567,7 +567,7 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; - unsigned long cpu_power; + unsigned long cpu_capacity; unsigned char idle_balance; /* For active balancing */ From ca8ce3d0b144c318a5a9ce99649053e9029061ea Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:39 -0400 Subject: [PATCH 10/18] sched: Final power vs. capacity cleanups It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. This contains the architecture visible changes. Incidentally, only ARM takes advantage of the available pow^H^H^Hcapacity scaling hooks and therefore those changes outside kernel/sched/ are confined to one ARM specific file. The default arch_scale_smt_power() hook is not overridden by anyone. Replacements are as follows: arch_scale_freq_power --> arch_scale_freq_capacity arch_scale_smt_power --> arch_scale_smt_capacity SCHED_POWER_SCALE --> SCHED_CAPACITY_SCALE SCHED_POWER_SHIFT --> SCHED_CAPACITY_SHIFT The local usage of "power" in arch/arm/kernel/topology.c is also changed to "capacity" as appropriate. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Arnd Bergmann Cc: Dietmar Eggemann Cc: Grant Likely Cc: Linus Torvalds Cc: Mark Brown Cc: Rob Herring Cc: Russell King Cc: Sudeep KarkadaNagesha Cc: Vincent Guittot Cc: devicetree@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-48zba9qbznvglwelgq2cfygh@git.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/kernel/topology.c | 54 +++++++++++++++++----------------- include/linux/sched.h | 6 ++-- kernel/sched/core.c | 6 ++-- kernel/sched/fair.c | 59 +++++++++++++++++++------------------- 4 files changed, 63 insertions(+), 62 deletions(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 71e1fec6d31a..d42a7db22236 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -26,30 +26,30 @@ #include /* - * cpu power scale management + * cpu capacity scale management */ /* - * cpu power table + * cpu capacity table * This per cpu data structure describes the relative capacity of each core. * On a heteregenous system, cores don't have the same computation capacity - * and we reflect that difference in the cpu_power field so the scheduler can - * take this difference into account during load balance. A per cpu structure - * is preferred because each CPU updates its own cpu_power field during the - * load balance except for idle cores. One idle core is selected to run the - * rebalance_domains for all idle cores and the cpu_power can be updated - * during this sequence. + * and we reflect that difference in the cpu_capacity field so the scheduler + * can take this difference into account during load balance. A per cpu + * structure is preferred because each CPU updates its own cpu_capacity field + * during the load balance except for idle cores. One idle core is selected + * to run the rebalance_domains for all idle cores and the cpu_capacity can be + * updated during this sequence. */ static DEFINE_PER_CPU(unsigned long, cpu_scale); -unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) +unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) { return per_cpu(cpu_scale, cpu); } -static void set_power_scale(unsigned int cpu, unsigned long power) +static void set_capacity_scale(unsigned int cpu, unsigned long capacity) { - per_cpu(cpu_scale, cpu) = power; + per_cpu(cpu_scale, cpu) = capacity; } #ifdef CONFIG_OF @@ -62,11 +62,11 @@ struct cpu_efficiency { * Table of relative efficiency of each processors * The efficiency value must fit in 20bit and the final * cpu_scale value must be in the range - * 0 < cpu_scale < 3*SCHED_POWER_SCALE/2 + * 0 < cpu_scale < 3*SCHED_CAPACITY_SCALE/2 * in order to return at most 1 when DIV_ROUND_CLOSEST * is used to compute the capacity of a CPU. * Processors that are not defined in the table, - * use the default SCHED_POWER_SCALE value for cpu_scale. + * use the default SCHED_CAPACITY_SCALE value for cpu_scale. */ static const struct cpu_efficiency table_efficiency[] = { {"arm,cortex-a15", 3891}, @@ -83,9 +83,9 @@ static unsigned long middle_capacity = 1; * Iterate all CPUs' descriptor in DT and compute the efficiency * (as per table_efficiency). Also calculate a middle efficiency * as close as possible to (max{eff_i} - min{eff_i}) / 2 - * This is later used to scale the cpu_power field such that an - * 'average' CPU is of middle power. Also see the comments near - * table_efficiency[] and update_cpu_power(). + * This is later used to scale the cpu_capacity field such that an + * 'average' CPU is of middle capacity. Also see the comments near + * table_efficiency[] and update_cpu_capacity(). */ static void __init parse_dt_topology(void) { @@ -141,15 +141,15 @@ static void __init parse_dt_topology(void) * cpu_scale because all CPUs have the same capacity. Otherwise, we * compute a middle_capacity factor that will ensure that the capacity * of an 'average' CPU of the system will be as close as possible to - * SCHED_POWER_SCALE, which is the default value, but with the + * SCHED_CAPACITY_SCALE, which is the default value, but with the * constraint explained near table_efficiency[]. */ if (4*max_capacity < (3*(max_capacity + min_capacity))) middle_capacity = (min_capacity + max_capacity) - >> (SCHED_POWER_SHIFT+1); + >> (SCHED_CAPACITY_SHIFT+1); else middle_capacity = ((max_capacity / 3) - >> (SCHED_POWER_SHIFT-1)) + 1; + >> (SCHED_CAPACITY_SHIFT-1)) + 1; } @@ -158,20 +158,20 @@ static void __init parse_dt_topology(void) * boot. The update of all CPUs is in O(n^2) for heteregeneous system but the * function returns directly for SMP system. */ -static void update_cpu_power(unsigned int cpu) +static void update_cpu_capacity(unsigned int cpu) { if (!cpu_capacity(cpu)) return; - set_power_scale(cpu, cpu_capacity(cpu) / middle_capacity); + set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity); - printk(KERN_INFO "CPU%u: update cpu_power %lu\n", - cpu, arch_scale_freq_power(NULL, cpu)); + printk(KERN_INFO "CPU%u: update cpu_capacity %lu\n", + cpu, arch_scale_freq_capacity(NULL, cpu)); } #else static inline void parse_dt_topology(void) {} -static inline void update_cpu_power(unsigned int cpuid) {} +static inline void update_cpu_capacity(unsigned int cpuid) {} #endif /* @@ -267,7 +267,7 @@ void store_cpu_topology(unsigned int cpuid) update_siblings_masks(cpuid); - update_cpu_power(cpuid); + update_cpu_capacity(cpuid); printk(KERN_INFO "CPU%u: thread %d, cpu %d, socket %d, mpidr %x\n", cpuid, cpu_topology[cpuid].thread_id, @@ -297,7 +297,7 @@ void __init init_cpu_topology(void) { unsigned int cpu; - /* init core mask and power*/ + /* init core mask and capacity */ for_each_possible_cpu(cpu) { struct cputopo_arm *cpu_topo = &(cpu_topology[cpu]); @@ -307,7 +307,7 @@ void __init init_cpu_topology(void) cpumask_clear(&cpu_topo->core_sibling); cpumask_clear(&cpu_topo->thread_sibling); - set_power_scale(cpu, SCHED_POWER_SCALE); + set_capacity_scale(cpu, SCHED_CAPACITY_SCALE); } smp_wmb(); diff --git a/include/linux/sched.h b/include/linux/sched.h index a96f03598c61..322110affe63 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -854,10 +854,10 @@ enum cpu_idle_type { }; /* - * Increase resolution of cpu_power calculations + * Increase resolution of cpu_capacity calculations */ -#define SCHED_POWER_SHIFT 10 -#define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT) +#define SCHED_CAPACITY_SHIFT 10 +#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) /* * sched-domains (multiprocessor balancing) declarations: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 07bc78a50329..7ba4f5413a10 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5249,7 +5249,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->sgc->capacity != SCHED_POWER_SCALE) { + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { printk(KERN_CONT " (cpu_capacity = %d)", group->sgc->capacity); } @@ -5715,7 +5715,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * domains and no possible iteration will get us here, we won't * die on a /0 trap. */ - sg->sgc->capacity = SCHED_POWER_SCALE * cpumask_weight(sg_span); + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->capacity_orig = sg->sgc->capacity; /* @@ -6921,7 +6921,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = SCHED_POWER_SCALE; + rq->cpu_capacity = SCHED_CAPACITY_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 58684f684fa8..dc7d6527a282 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1062,9 +1062,9 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->load = (ns->load * SCHED_POWER_SCALE) / ns->compute_capacity; + ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; ns->task_capacity = - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_POWER_SCALE); + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } @@ -4370,7 +4370,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgc->capacity; + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; if (local_group) { this_load = avg_load; @@ -5609,10 +5609,10 @@ static inline int get_sd_load_idx(struct sched_domain *sd, static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) { - return SCHED_POWER_SCALE; + return SCHED_CAPACITY_SCALE; } -unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) { return default_scale_capacity(sd, cpu); } @@ -5627,7 +5627,7 @@ static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu return smt_gain; } -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) { return default_scale_smt_capacity(sd, cpu); } @@ -5658,10 +5658,10 @@ static unsigned long scale_rt_capacity(int cpu) available = total - avg; } - if (unlikely((s64)total < SCHED_POWER_SCALE)) - total = SCHED_POWER_SCALE; + if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) + total = SCHED_CAPACITY_SCALE; - total >>= SCHED_POWER_SHIFT; + total >>= SCHED_CAPACITY_SHIFT; return div_u64(available, total); } @@ -5669,29 +5669,29 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; - unsigned long capacity = SCHED_POWER_SCALE; + unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { if (sched_feat(ARCH_POWER)) - capacity *= arch_scale_smt_power(sd, cpu); + capacity *= arch_scale_smt_capacity(sd, cpu); else capacity *= default_scale_smt_capacity(sd, cpu); - capacity >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; } sdg->sgc->capacity_orig = capacity; if (sched_feat(ARCH_POWER)) - capacity *= arch_scale_freq_power(sd, cpu); + capacity *= arch_scale_freq_capacity(sd, cpu); else capacity *= default_scale_capacity(sd, cpu); - capacity >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; capacity *= scale_rt_capacity(cpu); - capacity >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; if (!capacity) capacity = 1; @@ -5780,7 +5780,7 @@ static inline int fix_small_capacity(struct sched_domain *sd, struct sched_group *group) { /* - * Only siblings can have significantly less than SCHED_POWER_SCALE + * Only siblings can have significantly less than SCHED_CAPACITY_SCALE */ if (!(sd->flags & SD_SHARE_CPUPOWER)) return 0; @@ -5845,11 +5845,11 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro cpus = group->group_weight; /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ - smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, capacity_orig); + smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); capacity_factor = cpus / smt; /* cores */ capacity_factor = min_t(unsigned, - capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_POWER_SCALE)); + capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); if (!capacity_factor) capacity_factor = fix_small_capacity(env->sd, group); @@ -5895,7 +5895,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Adjust by relative CPU capacity of the group */ sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_capacity; + sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; @@ -6089,7 +6089,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) env->imbalance = DIV_ROUND_CLOSEST( sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, - SCHED_POWER_SCALE); + SCHED_CAPACITY_SCALE); return 1; } @@ -6118,7 +6118,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) imbn = 1; scaled_busy_load_per_task = - (busiest->load_per_task * SCHED_POWER_SCALE) / + (busiest->load_per_task * SCHED_CAPACITY_SCALE) / busiest->group_capacity; if (busiest->avg_load + scaled_busy_load_per_task >= @@ -6137,7 +6137,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) min(busiest->load_per_task, busiest->avg_load); capa_now += local->group_capacity * min(local->load_per_task, local->avg_load); - capa_now /= SCHED_POWER_SCALE; + capa_now /= SCHED_CAPACITY_SCALE; /* Amount of load we'd subtract */ if (busiest->avg_load > scaled_busy_load_per_task) { @@ -6148,16 +6148,16 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) /* Amount of load we'd add */ if (busiest->avg_load * busiest->group_capacity < - busiest->load_per_task * SCHED_POWER_SCALE) { + busiest->load_per_task * SCHED_CAPACITY_SCALE) { tmp = (busiest->avg_load * busiest->group_capacity) / local->group_capacity; } else { - tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / + tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / local->group_capacity; } capa_move += local->group_capacity * min(local->load_per_task, local->avg_load + tmp); - capa_move /= SCHED_POWER_SCALE; + capa_move /= SCHED_CAPACITY_SCALE; /* Move if we gain throughput */ if (capa_move > capa_now) @@ -6207,7 +6207,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s load_above_capacity = (busiest->sum_nr_running - busiest->group_capacity_factor); - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); load_above_capacity /= busiest->group_capacity; } @@ -6225,7 +6225,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s env->imbalance = min( max_pull * busiest->group_capacity, (sds->avg_load - local->avg_load) * local->group_capacity - ) / SCHED_POWER_SCALE; + ) / SCHED_CAPACITY_SCALE; /* * if *imbalance is less than the average load per runnable task @@ -6279,7 +6279,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; - sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_capacity; + sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) + / sds.total_capacity; /* * If the busiest group is imbalanced the below checks don't @@ -6378,7 +6379,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; capacity = capacity_of(i); - capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_POWER_SCALE); + capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); if (!capacity_factor) capacity_factor = fix_small_capacity(env->sd, group); From 5d4dfddd4f02b028d6ddaaa04d75d3b0cad1c9ae Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 27 May 2014 13:50:41 -0400 Subject: [PATCH 11/18] sched: Rename capacity related flags It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. Let's rename the following feature flags since they do relate to capacity: SD_SHARE_CPUPOWER -> SD_SHARE_CPUCAPACITY ARCH_POWER -> ARCH_CAPACITY NONTASK_POWER -> NONTASK_CAPACITY Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Andy Fleming Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Grant Likely Cc: Linus Torvalds Cc: Michael Ellerman Cc: Paul Gortmaker Cc: Paul Mackerras Cc: Preeti U Murthy Cc: Rob Herring Cc: Srivatsa S. Bhat Cc: Toshi Kani Cc: Vasant Hegde Cc: Vincent Guittot Cc: devicetree@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/n/tip-e93lpnxb87owfievqatey6b5@git.kernel.org Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/smp.c | 2 +- include/linux/sched.h | 4 ++-- kernel/sched/core.c | 14 +++++++------- kernel/sched/fair.c | 8 ++++---- kernel/sched/features.h | 8 ++++---- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 10ffffef0414..c51d16379cba 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -770,7 +770,7 @@ int setup_profiling_timer(unsigned int multiplier) /* cpumask of CPUs with asymetric SMT dependancy */ static const int powerpc_smt_flags(void) { - int flags = SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; + int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); diff --git a/include/linux/sched.h b/include/linux/sched.h index 322110affe63..ce93768a3312 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -869,7 +869,7 @@ enum cpu_idle_type { #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ +#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ @@ -881,7 +881,7 @@ enum cpu_idle_type { #ifdef CONFIG_SCHED_SMT static inline const int cpu_smt_flags(void) { - return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; + return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; } #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7ba4f5413a10..5976ca579d3e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -872,7 +872,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->clock_task += delta; #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) + if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) sched_rt_avg_update(rq, irq_delta + steal); #endif } @@ -5309,7 +5309,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | + SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) @@ -5340,7 +5340,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | + SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | SD_SHARE_POWERDOMAIN); @@ -5947,7 +5947,7 @@ static int sched_domains_curr_level; /* * SD_flags allowed in topology descriptions. * - * SD_SHARE_CPUPOWER - describes SMT topologies + * SD_SHARE_CPUCAPACITY - describes SMT topologies * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain @@ -5956,7 +5956,7 @@ static int sched_domains_curr_level; * SD_ASYM_PACKING - describes SMT quirks */ #define TOPOLOGY_SD_FLAGS \ - (SD_SHARE_CPUPOWER | \ + (SD_SHARE_CPUCAPACITY | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ @@ -6002,7 +6002,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE | 1*SD_WAKE_AFFINE - | 0*SD_SHARE_CPUPOWER + | 0*SD_SHARE_CPUCAPACITY | 0*SD_SHARE_PKG_RESOURCES | 0*SD_SERIALIZE | 0*SD_PREFER_SIBLING @@ -6024,7 +6024,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) * Convert topological properties into behaviour. */ - if (sd->flags & SD_SHARE_CPUPOWER) { + if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dc7d6527a282..d3c731222199 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5672,8 +5672,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - if (sched_feat(ARCH_POWER)) + if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { + if (sched_feat(ARCH_CAPACITY)) capacity *= arch_scale_smt_capacity(sd, cpu); else capacity *= default_scale_smt_capacity(sd, cpu); @@ -5683,7 +5683,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) sdg->sgc->capacity_orig = capacity; - if (sched_feat(ARCH_POWER)) + if (sched_feat(ARCH_CAPACITY)) capacity *= arch_scale_freq_capacity(sd, cpu); else capacity *= default_scale_capacity(sd, cpu); @@ -5782,7 +5782,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /* * Only siblings can have significantly less than SCHED_CAPACITY_SCALE */ - if (!(sd->flags & SD_SHARE_CPUPOWER)) + if (!(sd->flags & SD_SHARE_CPUCAPACITY)) return 0; /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 5716929a2e3a..90284d117fe6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) SCHED_FEAT(WAKEUP_PREEMPTION, true) /* - * Use arch dependent cpu power functions + * Use arch dependent cpu capacity functions */ -SCHED_FEAT(ARCH_POWER, true) +SCHED_FEAT(ARCH_CAPACITY, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) /* - * Decrement CPU power based on time not spent running tasks + * Decrement CPU capacity based on time not spent running tasks */ -SCHED_FEAT(NONTASK_POWER, true) +SCHED_FEAT(NONTASK_CAPACITY, true) /* * Queue remote wakeups on the target CPU and process them From 0b07939cbfdd05bed0c5ec01b8b25493e6ecd34c Mon Sep 17 00:00:00 2001 From: Giedrius Rekasius Date: Sun, 25 May 2014 15:23:31 +0100 Subject: [PATCH 12/18] sched: Remove redundant assignment to "rt_rq" in update_curr_rt(...) Variable "rt_rq" is used only in block "for_each_sched_rt_entity" so the value assigned to it at the beginning of the update_curr_rt(...) gets overwritten without ever being read. Remove redundant assignment and move variable declaration to the block in which it is being used. Signed-off-by: Giedrius Rekasius Signed-off-by: Peter Zijlstra Cc: kernel-janitors@vger.kernel.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1401027811-30066-1-git-send-email-giedrius.rekasius@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0ebfd7a29472..43406db306af 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -924,7 +924,6 @@ static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_rt_entity *rt_se = &curr->rt; - struct rt_rq *rt_rq = rt_rq_of_se(rt_se); u64 delta_exec; if (curr->sched_class != &rt_sched_class) @@ -949,7 +948,7 @@ static void update_curr_rt(struct rq *rq) return; for_each_sched_rt_entity(rt_se) { - rt_rq = rt_rq_of_se(rt_se); + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { raw_spin_lock(&rt_rq->rt_runtime_lock); From 84c407084137d4e491b07ea5ff8665d19106a5ac Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 4 Jun 2014 10:31:14 -0700 Subject: [PATCH 13/18] cpuidle: Set polling in poll_idle poll_idle is the archetypal polling idle loop; tell the core idle code about it. This avoids pointless IPIs when all of the other cpuidle states are disabled. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra Cc: nicolas.pitre@linaro.org Cc: umgwanakikbuti@gmail.com Cc: Daniel Lezcano Cc: Linus Torvalds Cc: Rafael J. Wysocki Cc: linux-kernel@vger.kernel.org Cc: linux-pm@vger.kernel.org Link: http://lkml.kernel.org/r/c65ce49615d338bae8fb79df5daffab19353c900.1401902905.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- drivers/cpuidle/driver.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 136d6a283e0a..9634f20e3926 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -187,8 +187,11 @@ static int poll_idle(struct cpuidle_device *dev, t1 = ktime_get(); local_irq_enable(); - while (!need_resched()) - cpu_relax(); + if (!current_set_polling_and_test()) { + while (!need_resched()) + cpu_relax(); + } + current_clr_polling(); t2 = ktime_get(); diff = ktime_to_us(ktime_sub(t2, t1)); From dfc68f29ae67f2a6e799b44e6a4eb3417dffbfcd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 4 Jun 2014 10:31:15 -0700 Subject: [PATCH 14/18] sched, trace: Add a tracepoint for IPI-less remote wakeups Remote wakeups of polling CPUs are a valuable performance improvement; add a tracepoint to make it much easier to verify that they're working. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: umgwanakikbuti@gmail.com Cc: David Ahern Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mel Gorman Cc: Oleg Nesterov Cc: Steven Rostedt Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/16205aee116772aa686814f9b13bccb562108047.1401902905.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 20 ++++++++++++++++++++ kernel/sched/core.c | 4 ++++ 2 files changed, 24 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 67e1bbf83695..0a68d5ae584e 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -530,6 +530,26 @@ TRACE_EVENT(sched_swap_numa, __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid, __entry->dst_cpu, __entry->dst_nid) ); + +/* + * Tracepoint for waking a polling cpu without an IPI. + */ +TRACE_EVENT(sched_wake_idle_without_ipi, + + TP_PROTO(int cpu), + + TP_ARGS(cpu), + + TP_STRUCT__entry( + __field( int, cpu ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + ), + + TP_printk("cpu=%d", __entry->cpu) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5976ca579d3e..e4c0ddd3db8e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -564,6 +564,8 @@ void resched_task(struct task_struct *p) if (set_nr_and_not_polling(p)) smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } void resched_cpu(int cpu) @@ -647,6 +649,8 @@ static void wake_up_idle_cpu(int cpu) smp_mb(); if (!tsk_is_polling(rq->idle)) smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } static bool wake_up_full_nohz_cpu(int cpu) From 82c65d60d64401aedc1006d6572469bbfdf148de Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 4 Jun 2014 10:31:16 -0700 Subject: [PATCH 15/18] sched/idle: Clear polling before descheduling the idle thread Currently, the only real guarantee provided by the polling bit is that, if you hold rq->lock and the polling bit is set, then you can set need_resched to force a reschedule. The only reason the lock is needed is that the idle thread might not be running at all when setting its need_resched bit, and rq->lock keeps it pinned. This is easy to fix: just clear the polling bit before scheduling. Now the idle thread's polling bit is only ever set when rq->curr == rq->idle. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: umgwanakikbuti@gmail.com Cc: Rafael J. Wysocki Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/b2059fcb4c613d520cb503b6fad6e47033c7c203.1401902905.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 25b9423abce9..fe4b24bf33ca 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -67,6 +67,10 @@ void __weak arch_cpu_idle(void) * cpuidle_idle_call - the main idle function * * NOTE: no locks or semaphores should be used here + * + * On archs that support TIF_POLLING_NRFLAG, is called with polling + * set, and it returns with polling set. If it ever stops polling, it + * must clear the polling bit. */ static void cpuidle_idle_call(void) { @@ -175,10 +179,22 @@ exit_idle: /* * Generic idle loop implementation + * + * Called with polling cleared. */ static void cpu_idle_loop(void) { while (1) { + /* + * If the arch has a polling bit, we maintain an invariant: + * + * Our polling bit is clear if we're not scheduled (i.e. if + * rq->curr != rq->idle). This means that, if rq->idle has + * the polling bit set, then setting need_resched is + * guaranteed to cause the cpu to reschedule. + */ + + __current_set_polling(); tick_nohz_idle_enter(); while (!need_resched()) { @@ -218,6 +234,15 @@ static void cpu_idle_loop(void) */ preempt_set_need_resched(); tick_nohz_idle_exit(); + __current_clr_polling(); + + /* + * We promise to reschedule if need_resched is set while + * polling is set. That means that clearing polling + * needs to be visible before rescheduling. + */ + smp_mb__after_atomic(); + schedule_preempt_disabled(); } } @@ -239,7 +264,6 @@ void cpu_startup_entry(enum cpuhp_state state) */ boot_init_stack_canary(); #endif - __current_set_polling(); arch_cpu_idle_prepare(); cpu_idle_loop(); } From 67b9ca70c3030e832999e8d1cdba2984c7bb5bfc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 4 Jun 2014 10:31:17 -0700 Subject: [PATCH 16/18] sched/idle: Simplify wake_up_idle_cpu() Now that rq->idle's polling bit is a reliable indication that the cpu is polling, use it. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra Cc: Rafael J. Wysocki Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: umgwanakikbuti@gmail.com Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/922f00761445a830ebb23d058e2ae53956ce2d73.1401902905.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e4c0ddd3db8e..6afbfeefddd6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -628,26 +628,7 @@ static void wake_up_idle_cpu(int cpu) if (cpu == smp_processor_id()) return; - /* - * This is safe, as this function is called with the timer - * wheel base lock of (cpu) held. When the CPU is on the way - * to idle and has not yet set rq->curr to idle then it will - * be serialized on the timer wheel base lock and take the new - * timer into account automatically. - */ - if (rq->curr != rq->idle) - return; - - /* - * We can set TIF_RESCHED on the idle task of the other CPU - * lockless. The worst case is that the other CPU runs the - * idle task through an additional NOOP schedule() - */ - set_tsk_need_resched(rq->idle); - - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(rq->idle)) + if (set_nr_and_not_polling(rq->idle)) smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); From e3baac47f0e82c4be632f4f97215bb93bf16b342 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Jun 2014 10:31:18 -0700 Subject: [PATCH 17/18] sched/idle: Optimize try-to-wake-up IPI [ This series reduces the number of IPIs on Andy's workload by something like 99%. It's down from many hundreds per second to very few. The basic idea behind this series is to make TIF_POLLING_NRFLAG be a reliable indication that the idle task is polling. Once that's done, the rest is reasonably straightforward. ] When enqueueing tasks on remote LLC domains, we send an IPI to do the work 'locally' and avoid bouncing all the cachelines over. However, when the remote CPU is idle (and polling, say x86 mwait), we don't need to send an IPI, we can simply kick the TIF word to wake it up and have the 'idle' loop do the work. So when _TIF_POLLING_NRFLAG is set, but _TIF_NEED_RESCHED is not (yet) set, set _TIF_NEED_RESCHED and avoid sending the IPI. Much-requested-by: Andy Lutomirski Signed-off-by: Peter Zijlstra [Edited by Andy Lutomirski, but this is mostly Peter Zijlstra's code.] Signed-off-by: Andy Lutomirski Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: Mike Galbraith Cc: umgwanakikbuti@gmail.com Cc: Rafael J. Wysocki Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/ce06f8b02e7e337be63e97597fc4b248d3aa6f9b.1401902905.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 54 +++++++++++++++++++++++++++++++++++++++----- kernel/sched/idle.c | 10 +++++--- kernel/sched/sched.h | 6 +++++ 3 files changed, 61 insertions(+), 9 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6afbfeefddd6..60d4e05d64dd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -519,7 +519,7 @@ static inline void init_hrtick(void) __old; \ }) -#ifdef TIF_POLLING_NRFLAG +#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) /* * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, * this avoids any races wrt polling state changes and thereby avoids @@ -530,12 +530,44 @@ static bool set_nr_and_not_polling(struct task_struct *p) struct thread_info *ti = task_thread_info(p); return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); } + +/* + * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. + * + * If this returns true, then the idle task promises to call + * sched_ttwu_pending() and reschedule soon. + */ +static bool set_nr_if_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); + + for (;;) { + if (!(val & _TIF_POLLING_NRFLAG)) + return false; + if (val & _TIF_NEED_RESCHED) + return true; + old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); + if (old == val) + break; + val = old; + } + return true; +} + #else static bool set_nr_and_not_polling(struct task_struct *p) { set_tsk_need_resched(p); return true; } + +#ifdef CONFIG_SMP +static bool set_nr_if_polling(struct task_struct *p) +{ + return false; +} +#endif #endif /* @@ -1490,13 +1522,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -static void sched_ttwu_pending(void) +void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); struct task_struct *p; + unsigned long flags; - raw_spin_lock(&rq->lock); + if (!llist) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); while (llist) { p = llist_entry(llist, struct task_struct, wake_entry); @@ -1504,7 +1540,7 @@ static void sched_ttwu_pending(void) ttwu_do_activate(rq, p, 0); } - raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&rq->lock, flags); } void scheduler_ipi(void) @@ -1550,8 +1586,14 @@ void scheduler_ipi(void) static void ttwu_queue_remote(struct task_struct *p, int cpu) { - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) - smp_send_reschedule(cpu); + struct rq *rq = cpu_rq(cpu); + + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { + if (!set_nr_if_polling(rq->idle)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); + } } bool cpus_share_cache(int this_cpu, int that_cpu) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index fe4b24bf33ca..cf009fb0bc25 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -12,6 +12,8 @@ #include +#include "sched.h" + static int __read_mostly cpu_idle_force_poll; void cpu_idle_poll_ctrl(bool enable) @@ -237,12 +239,14 @@ static void cpu_idle_loop(void) __current_clr_polling(); /* - * We promise to reschedule if need_resched is set while - * polling is set. That means that clearing polling - * needs to be visible before rescheduling. + * We promise to call sched_ttwu_pending and reschedule + * if need_resched is set while polling is set. That + * means that clearing polling needs to be visible + * before doing these things. */ smp_mb__after_atomic(); + sched_ttwu_pending(); schedule_preempt_disabled(); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 956b8ca24893..2f8636199b83 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -670,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *); #ifdef CONFIG_SMP +extern void sched_ttwu_pending(void); + #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ lockdep_is_held(&sched_domains_mutex)) @@ -787,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group) extern int group_balance_cpu(struct sched_group *sg); +#else + +static inline void sched_ttwu_pending(void) { } + #endif /* CONFIG_SMP */ #include "stats.h" From f602d0632755be4f7eddfdd6c0af13216790177b Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Tue, 13 May 2014 16:40:05 -0400 Subject: [PATCH 18/18] sched/deadline: Delete extraneous extern for to_ratio() There was a prototype for it added to kernel/sched/sched.h at the same time the extern was added, so the extern in the C file was never really ever needed. See commit 332ac17ef5bfcff4766dfdfd3b4cdf10b8f8f155 ("sched/deadline: Add bandwidth management for SCHED_DEADLINE tasks") for details. Signed-off-by: Paul Gortmaker Cc: Peter Zijlstra Cc: Dario Faggioli Link: http://lkml.kernel.org/r/1400013605-18717-1-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f9ca7d19781a..0d6b17057188 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) dl_b->dl_runtime = runtime; } -extern unsigned long to_ratio(u64 period, u64 runtime); - void init_dl_bw(struct dl_bw *dl_b) { raw_spin_lock_init(&dl_b->lock);