From 3bd3706251ee8ab67e69d9340ac2abdca217e733 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 23 Apr 2019 16:26:36 +0200 Subject: [PATCH 01/33] sched/core: Provide a pointer to the valid CPU mask In commit: 4b53a3412d66 ("sched/core: Remove the tsk_nr_cpus_allowed() wrapper") the tsk_nr_cpus_allowed() wrapper was removed. There was not much difference in !RT but in RT we used this to implement migrate_disable(). Within a migrate_disable() section the CPU mask is restricted to single CPU while the "normal" CPU mask remains untouched. As an alternative implementation Ingo suggested to use: struct task_struct { const cpumask_t *cpus_ptr; cpumask_t cpus_mask; }; with t->cpus_ptr = &t->cpus_mask; In -RT we then can switch the cpus_ptr to: t->cpus_ptr = &cpumask_of(task_cpu(p)); in a migration disabled region. The rules are simple: - Code that 'uses' ->cpus_allowed would use the pointer. - Code that 'modifies' ->cpus_allowed would use the direct mask. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/20190423142636.14347-1-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- arch/ia64/kernel/mca.c | 2 +- arch/mips/include/asm/switch_to.h | 4 +-- arch/mips/kernel/mips-mt-fpaff.c | 2 +- arch/mips/kernel/traps.c | 6 ++-- arch/powerpc/platforms/cell/spufs/sched.c | 2 +- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +- drivers/infiniband/hw/hfi1/affinity.c | 6 ++-- drivers/infiniband/hw/hfi1/sdma.c | 3 +- drivers/infiniband/hw/qib/qib_file_ops.c | 7 ++-- fs/proc/array.c | 4 +-- include/linux/sched.h | 5 +-- init/init_task.c | 3 +- kernel/cgroup/cpuset.c | 2 +- kernel/fork.c | 2 ++ kernel/sched/core.c | 40 +++++++++++----------- kernel/sched/cpudeadline.c | 4 +-- kernel/sched/cpupri.c | 4 +-- kernel/sched/deadline.c | 6 ++-- kernel/sched/fair.c | 34 +++++++++--------- kernel/sched/rt.c | 4 +-- kernel/trace/trace_hwlat.c | 2 +- lib/smp_processor_id.c | 2 +- samples/trace_events/trace-events-sample.c | 2 +- 23 files changed, 75 insertions(+), 73 deletions(-) diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 6a52d761854b..79190d877fa7 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1831,7 +1831,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset, ti->cpu = cpu; p->stack = ti; p->state = TASK_UNINTERRUPTIBLE; - cpumask_set_cpu(cpu, &p->cpus_allowed); + cpumask_set_cpu(cpu, &p->cpus_mask); INIT_LIST_HEAD(&p->tasks); p->parent = p->real_parent = p->group_leader = p; INIT_LIST_HEAD(&p->children); diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h index 0f813bb753c6..09cbe9042828 100644 --- a/arch/mips/include/asm/switch_to.h +++ b/arch/mips/include/asm/switch_to.h @@ -42,7 +42,7 @@ extern struct task_struct *ll_task; * inline to try to keep the overhead down. If we have been forced to run on * a "CPU" with an FPU because of a previous high level of FP computation, * but did not actually use the FPU during the most recent time-slice (CU1 - * isn't set), we undo the restriction on cpus_allowed. + * isn't set), we undo the restriction on cpus_mask. * * We're not calling set_cpus_allowed() here, because we have no need to * force prompt migration - we're already switching the current CPU to a @@ -57,7 +57,7 @@ do { \ test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \ (!(KSTK_STATUS(prev) & ST0_CU1))) { \ clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \ - prev->cpus_allowed = prev->thread.user_cpus_allowed; \ + prev->cpus_mask = prev->thread.user_cpus_allowed; \ } \ next->thread.emulated_fp = 0; \ } while(0) diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index a7c0f97e4b0d..1a08428eedcf 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -177,7 +177,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, if (retval) goto out_unlock; - cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed); + cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr); cpumask_and(&mask, &allowed, cpu_active_mask); out_unlock: diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index c52766a5b85f..ac7159263da0 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -891,12 +891,12 @@ static void mt_ase_fp_affinity(void) * restricted the allowed set to exclude any CPUs with FPUs, * we'll skip the procedure. */ - if (cpumask_intersects(¤t->cpus_allowed, &mt_fpu_cpumask)) { + if (cpumask_intersects(¤t->cpus_mask, &mt_fpu_cpumask)) { cpumask_t tmask; current->thread.user_cpus_allowed - = current->cpus_allowed; - cpumask_and(&tmask, ¤t->cpus_allowed, + = current->cpus_mask; + cpumask_and(&tmask, ¤t->cpus_mask, &mt_fpu_cpumask); set_cpus_allowed_ptr(current, &tmask); set_thread_flag(TIF_FPUBOUND); diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index e56b553de27b..f18d5067cd0f 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -128,7 +128,7 @@ void __spu_update_sched_info(struct spu_context *ctx) * runqueue. The context will be rescheduled on the proper node * if it is timesliced or preempted. */ - cpumask_copy(&ctx->cpus_allowed, ¤t->cpus_allowed); + cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr); /* Save the current cpu id for spu interrupt routing. */ ctx->last_ran = raw_smp_processor_id(); diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 604c0e3bcc83..f68baccc69f0 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1503,7 +1503,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) * may be scheduled elsewhere and invalidate entries in the * pseudo-locked region. */ - if (!cpumask_subset(¤t->cpus_allowed, &plr->d->cpu_mask)) { + if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) { mutex_unlock(&rdtgroup_mutex); return -EINVAL; } diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index 4fe662c3bbc1..c142b23bb401 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -1038,7 +1038,7 @@ int hfi1_get_proc_affinity(int node) struct hfi1_affinity_node *entry; cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask; const struct cpumask *node_mask, - *proc_mask = ¤t->cpus_allowed; + *proc_mask = current->cpus_ptr; struct hfi1_affinity_node_list *affinity = &node_affinity; struct cpu_mask_set *set = &affinity->proc; @@ -1046,7 +1046,7 @@ int hfi1_get_proc_affinity(int node) * check whether process/context affinity has already * been set */ - if (cpumask_weight(proc_mask) == 1) { + if (current->nr_cpus_allowed == 1) { hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl", current->pid, current->comm, cpumask_pr_args(proc_mask)); @@ -1057,7 +1057,7 @@ int hfi1_get_proc_affinity(int node) cpu = cpumask_first(proc_mask); cpumask_set_cpu(cpu, &set->used); goto done; - } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) { + } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) { hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl", current->pid, current->comm, cpumask_pr_args(proc_mask)); diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index b0110728f541..7e8139ee0cc1 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -855,14 +855,13 @@ struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd, { struct sdma_rht_node *rht_node; struct sdma_engine *sde = NULL; - const struct cpumask *current_mask = ¤t->cpus_allowed; unsigned long cpu_id; /* * To ensure that always the same sdma engine(s) will be * selected make sure the process is pinned to this CPU only. */ - if (cpumask_weight(current_mask) != 1) + if (current->nr_cpus_allowed != 1) goto out; cpu_id = smp_processor_id(); diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 78fa634de98a..27b6e664e59d 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -1142,7 +1142,7 @@ static __poll_t qib_poll(struct file *fp, struct poll_table_struct *pt) static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd) { struct qib_filedata *fd = fp->private_data; - const unsigned int weight = cpumask_weight(¤t->cpus_allowed); + const unsigned int weight = current->nr_cpus_allowed; const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus); int local_cpu; @@ -1623,9 +1623,8 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo) ret = find_free_ctxt(i_minor - 1, fp, uinfo); else { int unit; - const unsigned int cpu = cpumask_first(¤t->cpus_allowed); - const unsigned int weight = - cpumask_weight(¤t->cpus_allowed); + const unsigned int cpu = cpumask_first(current->cpus_ptr); + const unsigned int weight = current->nr_cpus_allowed; if (weight == 1 && !test_bit(cpu, qib_cpulist)) if (!find_hca(cpu, &unit) && unit >= 0) diff --git a/fs/proc/array.c b/fs/proc/array.c index 2edbb657f859..84908556ea58 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -381,9 +381,9 @@ static inline void task_context_switch_counts(struct seq_file *m, static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Cpus_allowed:\t%*pb\n", - cpumask_pr_args(&task->cpus_allowed)); + cpumask_pr_args(task->cpus_ptr)); seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", - cpumask_pr_args(&task->cpus_allowed)); + cpumask_pr_args(task->cpus_ptr)); } static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) diff --git a/include/linux/sched.h b/include/linux/sched.h index 11837410690f..1b2590a8d038 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -651,7 +651,8 @@ struct task_struct { unsigned int policy; int nr_cpus_allowed; - cpumask_t cpus_allowed; + const cpumask_t *cpus_ptr; + cpumask_t cpus_mask; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; @@ -1399,7 +1400,7 @@ extern struct pid *cad_pid; #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ -#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ +#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ diff --git a/init/init_task.c b/init/init_task.c index c70ef656d0f4..3c27c0efa316 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -72,7 +72,8 @@ struct task_struct init_task .static_prio = MAX_PRIO - 20, .normal_prio = MAX_PRIO - 20, .policy = SCHED_NORMAL, - .cpus_allowed = CPU_MASK_ALL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, .nr_cpus_allowed= NR_CPUS, .mm = NULL, .active_mm = &init_mm, diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 6a1942ed781c..fe90fa1899e6 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2829,7 +2829,7 @@ static void cpuset_fork(struct task_struct *task) if (task_css_is_root(task, cpuset_cgrp_id)) return; - set_cpus_allowed_ptr(task, ¤t->cpus_allowed); + set_cpus_allowed_ptr(task, current->cpus_ptr); task->mems_allowed = current->mems_allowed; } diff --git a/kernel/fork.c b/kernel/fork.c index 75675b9bf6df..6be686283e55 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -894,6 +894,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_STACKPROTECTOR tsk->stack_canary = get_random_canary(); #endif + if (orig->cpus_ptr == &orig->cpus_mask) + tsk->cpus_ptr = &tsk->cpus_mask; /* * One for us, one for whoever does the "release_task()" (usually diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 874c427742a9..93ab85f0d076 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -930,7 +930,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) */ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) { - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) return false; if (is_per_cpu_kthread(p)) @@ -1025,7 +1025,7 @@ static int migration_cpu_stop(void *data) local_irq_disable(); /* * We need to explicitly wake pending tasks before running - * __migrate_task() such that we will not miss enforcing cpus_allowed + * __migrate_task() such that we will not miss enforcing cpus_ptr * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. */ sched_ttwu_pending(); @@ -1056,7 +1056,7 @@ static int migration_cpu_stop(void *data) */ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) { - cpumask_copy(&p->cpus_allowed, new_mask); + cpumask_copy(&p->cpus_mask, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); } @@ -1126,7 +1126,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - if (cpumask_equal(&p->cpus_allowed, new_mask)) + if (cpumask_equal(p->cpus_ptr, new_mask)) goto out; if (!cpumask_intersects(new_mask, cpu_valid_mask)) { @@ -1286,10 +1286,10 @@ static int migrate_swap_stop(void *data) if (task_cpu(arg->src_task) != arg->src_cpu) goto unlock; - if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed)) + if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) goto unlock; - if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed)) + if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) goto unlock; __migrate_swap_task(arg->src_task, arg->dst_cpu); @@ -1331,10 +1331,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) goto out; - if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed)) + if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr)) goto out; - if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed)) + if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr)) goto out; trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); @@ -1479,7 +1479,7 @@ void kick_process(struct task_struct *p) EXPORT_SYMBOL_GPL(kick_process); /* - * ->cpus_allowed is protected by both rq->lock and p->pi_lock + * ->cpus_ptr is protected by both rq->lock and p->pi_lock * * A few notes on cpu_active vs cpu_online: * @@ -1519,14 +1519,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for_each_cpu(dest_cpu, nodemask) { if (!cpu_active(dest_cpu)) continue; - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) return dest_cpu; } } for (;;) { /* Any allowed, online CPU? */ - for_each_cpu(dest_cpu, &p->cpus_allowed) { + for_each_cpu(dest_cpu, p->cpus_ptr) { if (!is_cpu_allowed(p, dest_cpu)) continue; @@ -1570,7 +1570,7 @@ out: } /* - * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. */ static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) @@ -1580,11 +1580,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) if (p->nr_cpus_allowed > 1) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); else - cpu = cpumask_any(&p->cpus_allowed); + cpu = cpumask_any(p->cpus_ptr); /* * In order not to call set_task_cpu() on a blocking task we need - * to rely on ttwu() to place the task on a valid ->cpus_allowed + * to rely on ttwu() to place the task on a valid ->cpus_ptr * CPU. * * Since this is common to all placement strategies, this lives here. @@ -2395,7 +2395,7 @@ void wake_up_new_task(struct task_struct *p) #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: - * - cpus_allowed can change in the fork path + * - cpus_ptr can change in the fork path * - any previously selected CPU might disappear through hotplug * * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, @@ -4267,7 +4267,7 @@ change: * the entire root_domain to become SCHED_DEADLINE. We * will also fail if there's no bandwidth available. */ - if (!cpumask_subset(span, &p->cpus_allowed) || + if (!cpumask_subset(span, p->cpus_ptr) || rq->rd->dl_bw.bw == 0) { task_rq_unlock(rq, p, &rf); return -EPERM; @@ -4866,7 +4866,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) goto out_unlock; raw_spin_lock_irqsave(&p->pi_lock, flags); - cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); + cpumask_and(mask, &p->cpus_mask, cpu_active_mask); raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: @@ -5443,7 +5443,7 @@ int task_can_attach(struct task_struct *p, * allowed nodes is unnecessary. Thus, cpusets are not * applicable for such threads. This prevents checking for * success of set_cpus_allowed_ptr() on all attached tasks - * before cpus_allowed may be changed. + * before cpus_mask may be changed. */ if (p->flags & PF_NO_SETAFFINITY) { ret = -EINVAL; @@ -5470,7 +5470,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) if (curr_cpu == target_cpu) return 0; - if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) return -EINVAL; /* TODO: This is not properly updating schedstats */ @@ -5608,7 +5608,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) put_prev_task(rq, next); /* - * Rules for changing task_struct::cpus_allowed are holding + * Rules for changing task_struct::cpus_mask are holding * both pi_lock and rq->lock, such that holding either * stabilizes the mask. * diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 50316455ea66..d57fb2f8ae67 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -124,14 +124,14 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { + cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { return 1; } else { int best_cpu = cpudl_maximum(cp); WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); - if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && + if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && dl_time_before(dl_se->deadline, cp->elements[0].dl)) { if (later_mask) cpumask_set_cpu(best_cpu, later_mask); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index daaadf939ccb..f7d2c10b4c92 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -98,11 +98,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (skip) continue; - if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) + if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) continue; if (lowest_mask) { - cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); /* * We have to ensure that we have at least one bit diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 43901fa3f269..c1ef30861068 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -538,7 +538,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p * If we cannot preempt any rq, fall back to pick any * online CPU: */ - cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); + cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr); if (cpu >= nr_cpu_ids) { /* * Failed to find any suitable CPU. @@ -1824,7 +1824,7 @@ static void set_curr_task_dl(struct rq *rq) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_allowed)) + cpumask_test_cpu(cpu, p->cpus_ptr)) return 1; return 0; } @@ -1974,7 +1974,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || + !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || task_running(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f35930f5e528..8691a8fffe40 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1621,7 +1621,7 @@ static void task_numa_compare(struct task_numa_env *env, * be incurred if the tasks were swapped. */ /* Skip this swap candidate if cannot move to the source cpu */ - if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) + if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) goto unlock; /* @@ -1718,7 +1718,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { /* Skip this CPU if the source task cannot migrate */ - if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, env->p->cpus_ptr)) continue; env->dst_cpu = cpu; @@ -5831,7 +5831,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, /* Skip over this group if it has no CPUs allowed */ if (!cpumask_intersects(sched_group_span(group), - &p->cpus_allowed)) + p->cpus_ptr)) continue; local_group = cpumask_test_cpu(this_cpu, @@ -5963,7 +5963,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { + for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { if (available_idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -6003,7 +6003,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p { int new_cpu = cpu; - if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) + if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) return prev_cpu; /* @@ -6120,7 +6120,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int if (!test_idle_cores(target, false)) return -1; - cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); for_each_cpu_wrap(core, cpus, target) { bool idle = true; @@ -6154,7 +6154,7 @@ static int select_idle_smt(struct task_struct *p, int target) return -1; for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; if (available_idle_cpu(cpu)) return cpu; @@ -6217,7 +6217,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { if (!--nr) return -1; - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; if (available_idle_cpu(cpu)) break; @@ -6254,7 +6254,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && available_idle_cpu(recent_used_cpu) && - cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { + cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { /* * Replace recent_used_cpu with prev as it is a potential * candidate for the next wake: @@ -6600,7 +6600,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) int max_spare_cap_cpu = -1; for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; /* Skip CPUs that will be overutilized. */ @@ -6689,7 +6689,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && - cpumask_test_cpu(cpu, &p->cpus_allowed); + cpumask_test_cpu(cpu, p->cpus_ptr); } rcu_read_lock(); @@ -7445,14 +7445,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or - * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 2) cannot be migrated to this CPU due to cpus_ptr, or * 3) running (obviously), or * 4) are cache-hot on their current CPU. */ if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; - if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; schedstat_inc(p->se.statistics.nr_failed_migrations_affine); @@ -7472,7 +7472,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Prevent to re-select dst_cpu via env's CPUs: */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { - if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { + if (cpumask_test_cpu(cpu, p->cpus_ptr)) { env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; @@ -8099,7 +8099,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) /* * Group imbalance indicates (and tries to solve) the problem where balancing - * groups is inadequate due to ->cpus_allowed constraints. + * groups is inadequate due to ->cpus_ptr constraints. * * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. @@ -8768,7 +8768,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) /* * If the busiest group is imbalanced the below checks don't * work because they assume all things are equal, which typically - * isn't true due to cpus_allowed constraints and the like. + * isn't true due to cpus_ptr constraints and the like. */ if (busiest->group_type == group_imbalanced) goto force_balance; @@ -9210,7 +9210,7 @@ more_balance: * if the curr task on busiest CPU can't be * moved to this_cpu: */ - if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { raw_spin_unlock_irqrestore(&busiest->lock, flags); env.flags |= LBF_ALL_PINNED; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 1e6b909dca36..63ad7c90822c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1614,7 +1614,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_allowed)) + cpumask_test_cpu(cpu, p->cpus_ptr)) return 1; return 0; @@ -1751,7 +1751,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * Also make sure that it wasn't scheduled on its rq. */ if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || + !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || task_running(rq, task) || !rt_task(task) || !task_on_rq_queued(task))) { diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 1e6db9cbe4dc..fa95139445b2 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -277,7 +277,7 @@ static void move_to_next_cpu(void) * of this thread, than stop migrating for the duration * of the current test. */ - if (!cpumask_equal(current_mask, ¤t->cpus_allowed)) + if (!cpumask_equal(current_mask, current->cpus_ptr)) goto disable; get_online_cpus(); diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 157d9e31f6c2..60ba93fc42ce 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -23,7 +23,7 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) * Kernel threads bound to a single CPU can safely use * smp_processor_id(): */ - if (cpumask_equal(¤t->cpus_allowed, cpumask_of(this_cpu))) + if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu))) goto out; /* diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index 1da597aa6141..1a72b7d95cdc 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c @@ -34,7 +34,7 @@ static void simple_thread_func(int cnt) /* Silly tracepoints */ trace_foo_bar("hello", cnt, array, random_strings[len], - ¤t->cpus_allowed); + current->cpus_ptr); trace_foo_with_template_simple("HELLO", cnt); From f2bedc4705659216bd60948029ad8dfedf923ad9 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 24 Apr 2019 09:45:56 +0100 Subject: [PATCH 02/33] sched/fair: Remove rq->load The CFS class is the only one maintaining and using the CPU wide load (rq->load(.weight)). The last use case of the CPU wide load in CFS's set_next_entity() can be replaced by using the load of the CFS class (rq->cfs.load(.weight)) instead. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190424084556.604-1-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 -- kernel/sched/fair.c | 7 ++----- kernel/sched/sched.h | 2 -- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 678bfb9bd87f..150043e1d716 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -656,8 +656,6 @@ do { \ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) P(nr_running); - SEQ_printf(m, " .%-30s: %lu\n", "load", - rq->load.weight); P(nr_switches); P(nr_load_updates); P(nr_uninterruptible); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8691a8fffe40..08b1cb06f968 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2686,8 +2686,6 @@ static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP if (entity_is_task(se)) { struct rq *rq = rq_of(cfs_rq); @@ -2703,8 +2701,6 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); @@ -4100,7 +4096,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * least twice that of our own weight (i.e. dont track it * when there are only lesser-weight tasks around): */ - if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + if (schedstat_enabled() && + rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { schedstat_set(se->statistics.slice_max, max((u64)schedstat_val(se->statistics.slice_max), se->sum_exec_runtime - se->prev_sum_exec_runtime)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b52ed1ada0be..c308410675ed 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -830,8 +830,6 @@ struct rq { atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ - /* capture load from *all* tasks on this CPU: */ - struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; From 5e83eafbfd3b351537c0d74467fc43e8a88f4ae4 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 27 May 2019 07:21:10 +0100 Subject: [PATCH 03/33] sched/fair: Remove the rq->cpu_load[] update code With LB_BIAS disabled, there is no need to update the rq->cpu_load[idx] any more. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lkml.kernel.org/r/20190527062116.11512-2-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched/nohz.h | 8 -- kernel/sched/core.c | 1 - kernel/sched/fair.c | 255 ------------------------------------- kernel/sched/sched.h | 6 - kernel/time/tick-sched.c | 2 - 5 files changed, 272 deletions(-) diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h index b36f4cf38111..1abe91ff6e4a 100644 --- a/include/linux/sched/nohz.h +++ b/include/linux/sched/nohz.h @@ -6,14 +6,6 @@ * This is the interface between the scheduler and nohz/dynticks: */ -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -extern void cpu_load_update_nohz_start(void); -extern void cpu_load_update_nohz_stop(void); -#else -static inline void cpu_load_update_nohz_start(void) { } -static inline void cpu_load_update_nohz_stop(void) { } -#endif - #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern int get_nohz_timer_target(void); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 93ab85f0d076..00b8966802a8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3033,7 +3033,6 @@ void scheduler_tick(void) update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); - cpu_load_update_active(rq); calc_global_load_tick(rq); psi_task_tick(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 08b1cb06f968..1aab323f1b4b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5322,71 +5322,6 @@ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); #ifdef CONFIG_NO_HZ_COMMON -/* - * per rq 'load' arrray crap; XXX kill this. - */ - -/* - * The exact cpuload calculated at every tick would be: - * - * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load - * - * If a CPU misses updates for n ticks (as it was idle) and update gets - * called on the n+1-th tick when CPU may be busy, then we have: - * - * load_n = (1 - 1/2^i)^n * load_0 - * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load - * - * decay_load_missed() below does efficient calculation of - * - * load' = (1 - 1/2^i)^n * load - * - * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. - * This allows us to precompute the above in said factors, thereby allowing the - * reduction of an arbitrary n in O(log_2 n) steps. (See also - * fixed_power_int()) - * - * The calculation is approximated on a 128 point scale. - */ -#define DEGRADE_SHIFT 7 - -static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - { 0, 0, 0, 0, 0, 0, 0, 0 }, - { 64, 32, 8, 0, 0, 0, 0, 0 }, - { 96, 72, 40, 12, 1, 0, 0, 0 }, - { 112, 98, 75, 43, 15, 1, 0, 0 }, - { 120, 112, 98, 76, 45, 16, 2, 0 } -}; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} static struct { cpumask_var_t idle_cpus_mask; @@ -5398,201 +5333,12 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ -/** - * __cpu_load_update - update the rq->cpu_load[] statistics - * @this_rq: The rq to update statistics for - * @this_load: The current load - * @pending_updates: The number of missed updates - * - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). - * - * This function computes a decaying average: - * - * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load - * - * Because of NOHZ it might not get called on every tick which gives need for - * the @pending_updates argument. - * - * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1 - * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load - * = A * (A * load[i]_n-2 + B) + B - * = A * (A * (A * load[i]_n-3 + B) + B) + B - * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B - * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B - * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B - * = (1 - 1/2^i)^n * (load[i]_0 - load) + load - * - * In the above we've assumed load_n := load, which is true for NOHZ_FULL as - * any change in load would have resulted in the tick being turned back on. - * - * For regular NOHZ, this reduces to: - * - * load[i]_n = (1 - 1/2^i)^n * load[i]_0 - * - * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra - * term. - */ -static void cpu_load_update(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) -{ - unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0]; - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; -#ifdef CONFIG_NO_HZ_COMMON - old_load = decay_load_missed(old_load, pending_updates - 1, i); - if (tickless_load) { - old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); - /* - * old_load can never be a negative value because a - * decayed tickless_load cannot be greater than the - * original tickless_load. - */ - old_load += tickless_load; - } -#endif - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } -} - /* Used instead of source_load when we know the type == 0 */ static unsigned long weighted_cpuload(struct rq *rq) { return cfs_rq_runnable_load_avg(&rq->cfs); } -#ifdef CONFIG_NO_HZ_COMMON -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we need to avoid the delta approach from the regular tick when - * possible since that would seriously skew the load calculation. This is why we - * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on - * jiffies deltas for updates happening while in nohz mode (idle ticks, idle - * loop exit, nohz_idle_balance, nohz full exit...) - * - * This means we might still be one tick off for nohz periods. - */ - -static void cpu_load_update_nohz(struct rq *this_rq, - unsigned long curr_jiffies, - unsigned long load) -{ - unsigned long pending_updates; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * In the regular NOHZ case, we were idle, this means load 0. - * In the NOHZ_FULL case, we were non-idle, we should consider - * its weighted load. - */ - cpu_load_update(this_rq, load, pending_updates); - } -} - -/* - * Called from nohz_idle_balance() to update the load ratings before doing the - * idle balance. - */ -static void cpu_load_update_idle(struct rq *this_rq) -{ - /* - * bail if there's load or we're actually up-to-date. - */ - if (weighted_cpuload(this_rq)) - return; - - cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); -} - -/* - * Record CPU load on nohz entry so we know the tickless load to account - * on nohz exit. cpu_load[0] happens then to be updated more frequently - * than other cpu_load[idx] but it should be fine as cpu_load readers - * shouldn't rely into synchronized cpu_load[*] updates. - */ -void cpu_load_update_nohz_start(void) -{ - struct rq *this_rq = this_rq(); - - /* - * This is all lockless but should be fine. If weighted_cpuload changes - * concurrently we'll exit nohz. And cpu_load write can race with - * cpu_load_update_idle() but both updater would be writing the same. - */ - this_rq->cpu_load[0] = weighted_cpuload(this_rq); -} - -/* - * Account the tickless load in the end of a nohz frame. - */ -void cpu_load_update_nohz_stop(void) -{ - unsigned long curr_jiffies = READ_ONCE(jiffies); - struct rq *this_rq = this_rq(); - unsigned long load; - struct rq_flags rf; - - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - load = weighted_cpuload(this_rq); - rq_lock(this_rq, &rf); - update_rq_clock(this_rq); - cpu_load_update_nohz(this_rq, curr_jiffies, load); - rq_unlock(this_rq, &rf); -} -#else /* !CONFIG_NO_HZ_COMMON */ -static inline void cpu_load_update_nohz(struct rq *this_rq, - unsigned long curr_jiffies, - unsigned long load) { } -#endif /* CONFIG_NO_HZ_COMMON */ - -static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) -{ -#ifdef CONFIG_NO_HZ_COMMON - /* See the mess around cpu_load_update_nohz(). */ - this_rq->last_load_update_tick = READ_ONCE(jiffies); -#endif - cpu_load_update(this_rq, load, 1); -} - -/* - * Called from scheduler_tick() - */ -void cpu_load_update_active(struct rq *this_rq) -{ - unsigned long load = weighted_cpuload(this_rq); - - if (tick_nohz_tick_stopped()) - cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); - else - cpu_load_update_periodic(this_rq, load); -} - /* * Return a low guess at the load of a migration-source CPU weighted * according to the scheduling class and "nice" value. @@ -9876,7 +9622,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - cpu_load_update_idle(rq); rq_unlock_irqrestore(rq, &rf); if (flags & NOHZ_BALANCE_KICK) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c308410675ed..3750b5e53792 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -96,12 +96,6 @@ extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); -#ifdef CONFIG_SMP -extern void cpu_load_update_active(struct rq *this_rq); -#else -static inline void cpu_load_update_active(struct rq *this_rq) { } -#endif - /* * Helpers for converting nanosecond timing to jiffy resolution */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f4ee1a3428ae..be9707f68024 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -782,7 +782,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) */ if (!ts->tick_stopped) { calc_load_nohz_start(); - cpu_load_update_nohz_start(); quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); @@ -829,7 +828,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); - cpu_load_update_nohz_stop(); /* * Clear the timer idle flag, so we avoid IPIs on remote queueing and From 1c1b8a7b03ef50f80f5d0c871ee261c04a6c967e Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 27 May 2019 07:21:11 +0100 Subject: [PATCH 04/33] sched/fair: Replace source_load() & target_load() with weighted_cpuload() With LB_BIAS disabled, source_load() & target_load() return weighted_cpuload(). Replace both with calls to weighted_cpuload(). The function to obtain the load index (sd->*_idx) for an sd, get_sd_load_idx(), can be removed as well. Finally, get rid of the sched feature LB_BIAS. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lkml.kernel.org/r/20190527062116.11512-3-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 90 ++--------------------------------------- kernel/sched/features.h | 1 - 2 files changed, 4 insertions(+), 87 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1aab323f1b4b..5b9691e5ea59 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1467,8 +1467,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, } static unsigned long weighted_cpuload(struct rq *rq); -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -5333,45 +5331,11 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ -/* Used instead of source_load when we know the type == 0 */ static unsigned long weighted_cpuload(struct rq *rq) { return cfs_rq_runnable_load_avg(&rq->cfs); } -/* - * Return a low guess at the load of a migration-source CPU weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. - */ -static unsigned long source_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(rq); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target CPU weighted - * according to the scheduling class and "nice" value. - */ -static unsigned long target_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(rq); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return max(rq->cpu_load[type-1], total); -} - static unsigned long capacity_of(int cpu) { return cpu_rq(cpu)->cpu_capacity; @@ -5479,7 +5443,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, s64 this_eff_load, prev_eff_load; unsigned long task_load; - this_eff_load = target_load(this_cpu, sd->wake_idx); + this_eff_load = weighted_cpuload(cpu_rq(this_cpu)); if (sync) { unsigned long current_load = task_h_load(current); @@ -5497,7 +5461,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, this_eff_load *= 100; this_eff_load *= capacity_of(prev_cpu); - prev_eff_load = source_load(prev_cpu, sd->wake_idx); + prev_eff_load = weighted_cpuload(cpu_rq(prev_cpu)); prev_eff_load -= task_load; if (sched_feat(WA_BIAS)) prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; @@ -5558,14 +5522,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, unsigned long this_runnable_load = ULONG_MAX; unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; unsigned long most_spare = 0, this_spare = 0; - int load_idx = sd->forkexec_idx; int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; unsigned long imbalance = scale_load_down(NICE_0_LOAD) * (sd->imbalance_pct-100) / 100; - if (sd_flag & SD_BALANCE_WAKE) - load_idx = sd->wake_idx; - do { unsigned long load, avg_load, runnable_load; unsigned long spare_cap, max_spare_cap; @@ -5589,12 +5549,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, max_spare_cap = 0; for_each_cpu(i, sched_group_span(group)) { - /* Bias balancing toward CPUs of our domain */ - if (local_group) - load = source_load(i, load_idx); - else - load = target_load(i, load_idx); - + load = weighted_cpuload(cpu_rq(i)); runnable_load += load; avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); @@ -7676,34 +7631,6 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) }; } -/** - * get_sd_load_idx - Obtain the load index for a given sched domain. - * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The idle status of the CPU for whose sd load_idx is obtained. - * - * Return: The load index. - */ -static inline int get_sd_load_idx(struct sched_domain *sd, - enum cpu_idle_type idle) -{ - int load_idx; - - switch (idle) { - case CPU_NOT_IDLE: - load_idx = sd->busy_idx; - break; - - case CPU_NEWLY_IDLE: - load_idx = sd->newidle_idx; - break; - default: - load_idx = sd->idle_idx; - break; - } - - return load_idx; -} - static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) { struct rq *rq = cpu_rq(cpu); @@ -7992,9 +7919,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sg_lb_stats *sgs, int *sg_status) { - int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); - int load_idx = get_sd_load_idx(env->sd, env->idle); - unsigned long load; int i, nr_running; memset(sgs, 0, sizeof(*sgs)); @@ -8005,13 +7929,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) env->flags |= LBF_NOHZ_AGAIN; - /* Bias balancing toward CPUs of our domain: */ - if (local_group) - load = target_load(i, load_idx); - else - load = source_load(i, load_idx); - - sgs->group_load += load; + sgs->group_load += weighted_cpuload(rq); sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 858589b83377..2410db5e9a35 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -39,7 +39,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) -SCHED_FEAT(LB_BIAS, false) /* * Decrement CPU capacity based on time not spent running tasks From 3d8d53554405952993bb0279ef3ebebc51740074 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 27 May 2019 07:21:12 +0100 Subject: [PATCH 05/33] sched/debug: Remove sd->*_idx range on sysctl This reverts: commit 201c373e8e48 ("sched/debug: Limit sd->*_idx range on sysctl") Load indexes (sd->*_idx) are no longer needed without rq->cpu_load[]. The range check for load indexes can be removed as well. Get rid of it before the rq->cpu_load[] since it uses CPU_LOAD_IDX_MAX. At the same time, fix the following coding style issues detected by scripts/checkpatch.pl: ERROR: space prohibited before that ',' ERROR: space prohibited before that close parenthesis ')' Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lkml.kernel.org/r/20190527062116.11512-4-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 150043e1d716..5c7b066d7de6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -236,25 +236,16 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) *tablep = NULL; } -static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX-1; - static void set_table_entry(struct ctl_table *entry, const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler, - bool load_idx) + umode_t mode, proc_handler *proc_handler) { entry->procname = procname; entry->data = data; entry->maxlen = maxlen; entry->mode = mode; entry->proc_handler = proc_handler; - - if (load_idx) { - entry->extra1 = &min_load_idx; - entry->extra2 = &max_load_idx; - } } static struct ctl_table * @@ -265,19 +256,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) if (table == NULL) return NULL; - set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); - set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); - set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); - set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); - set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); - set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false); - set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); + set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[9], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); /* &table[13] is terminator */ return table; From 55627e3cd22c315c4a02fe3bbbb7234ec439cb1d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 27 May 2019 07:21:13 +0100 Subject: [PATCH 06/33] sched/core: Remove rq->cpu_load[] The per rq load array values also disappear from the cpu#X sections in /proc/sched_debug. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lkml.kernel.org/r/20190527062116.11512-5-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +----- kernel/sched/debug.c | 5 ----- kernel/sched/sched.h | 2 -- 3 files changed, 1 insertion(+), 12 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 00b8966802a8..29984d8c41f0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5901,8 +5901,8 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); void __init sched_init(void) { - int i, j; unsigned long alloc_size = 0, ptr; + int i; wait_bit_init(); @@ -6004,10 +6004,6 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif - - for (j = 0; j < CPU_LOAD_IDX_MAX; j++) - rq->cpu_load[j] = 0; - #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 5c7b066d7de6..a0b0d6e21e5b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -654,11 +654,6 @@ do { \ SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); PN(clock_task); - P(cpu_load[0]); - P(cpu_load[1]); - P(cpu_load[2]); - P(cpu_load[3]); - P(cpu_load[4]); #undef P #undef PN diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3750b5e53792..607859a18b2a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -812,8 +812,6 @@ struct rq { unsigned int nr_preferred_running; unsigned int numa_migrate_on; #endif - #define CPU_LOAD_IDX_MAX 5 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; #ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_SMP unsigned long last_load_update_tick; From 0e1fef63d92d61ed561e504c3a078a827a0f9bfe Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 27 May 2019 07:21:14 +0100 Subject: [PATCH 07/33] sched/core: Remove sd->*_idx The sched domain per rq load index files also disappear from the /proc/sys/kernel/sched_domain/cpuX/domainY directories. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lkml.kernel.org/r/20190527062116.11512-6-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched/topology.h | 5 ----- kernel/sched/debug.c | 25 ++++++++++--------------- kernel/sched/topology.c | 10 ---------- 3 files changed, 10 insertions(+), 30 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index cfc0a89a7159..53afbe07354a 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -84,11 +84,6 @@ struct sched_domain { unsigned int busy_factor; /* less balancing by factor if busy */ unsigned int imbalance_pct; /* No balance until over watermark */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ - unsigned int busy_idx; - unsigned int idle_idx; - unsigned int newidle_idx; - unsigned int wake_idx; - unsigned int forkexec_idx; int nohz_idle; /* NOHZ IDLE status */ int flags; /* See SD_* */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a0b0d6e21e5b..7ffde8ce82fd 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -251,25 +251,20 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table = sd_alloc_ctl_entry(9); if (table == NULL) return NULL; - set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[9], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); - /* &table[13] is terminator */ + set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[8] is terminator */ return table; } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f53f89df837d..63184cf0d0d7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1344,11 +1344,6 @@ sd_init(struct sched_domain_topology_level *tl, .imbalance_pct = 125, .cache_nice_tries = 0, - .busy_idx = 0, - .idle_idx = 0, - .newidle_idx = 0, - .wake_idx = 0, - .forkexec_idx = 0, .flags = 1*SD_LOAD_BALANCE | 1*SD_BALANCE_NEWIDLE @@ -1400,13 +1395,10 @@ sd_init(struct sched_domain_topology_level *tl, } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { sd->imbalance_pct = 117; sd->cache_nice_tries = 1; - sd->busy_idx = 2; #ifdef CONFIG_NUMA } else if (sd->flags & SD_NUMA) { sd->cache_nice_tries = 2; - sd->busy_idx = 3; - sd->idle_idx = 2; sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; @@ -1419,8 +1411,6 @@ sd_init(struct sched_domain_topology_level *tl, #endif } else { sd->cache_nice_tries = 1; - sd->busy_idx = 2; - sd->idle_idx = 1; } /* From af75d1a9a9f75bf030c2f35705f1ff6d226f96fe Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 27 May 2019 07:21:15 +0100 Subject: [PATCH 08/33] sched/fair: Remove sgs->sum_weighted_load Since sg_lb_stats::sum_weighted_load is now identical with sg_lb_stats::group_load remove it and replace its use case (calculating load per task) with the latter. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Acked-by: Vincent Guittot Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Valentin Schneider Link: https://lkml.kernel.org/r/20190527062116.11512-7-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b9691e5ea59..7f8d477f90fe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7577,7 +7577,6 @@ static unsigned long task_h_load(struct task_struct *p) struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ @@ -7944,7 +7943,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; #endif - sgs->sum_weighted_load += weighted_cpuload(rq); /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -7963,7 +7961,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; if (sgs->sum_nr_running) - sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + sgs->load_per_task = sgs->group_load / sgs->sum_nr_running; sgs->group_weight = group->group_weight; From e3b929b0a184edb35531153c5afcaebb09014f9d Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 3 Jun 2019 17:13:38 +0800 Subject: [PATCH 09/33] sched/core: Add __sched tag for io_schedule() Non-inline io_schedule() was introduced in: commit 10ab56434f2f ("sched/core: Separate out io_schedule_prepare() and io_schedule_finish()") Keep in line with io_schedule_timeout(), otherwise "/proc//wchan" will report io_schedule() rather than its callers when waiting for IO. Reported-by: Jilong Kou Signed-off-by: Gao Xiang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Andrew Morton Cc: Linus Torvalds Cc: Miao Xie Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 10ab56434f2f ("sched/core: Separate out io_schedule_prepare() and io_schedule_finish()") Link: https://lkml.kernel.org/r/20190603091338.2695-1-gaoxiang25@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 29984d8c41f0..cd047927f707 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5122,7 +5122,7 @@ long __sched io_schedule_timeout(long timeout) } EXPORT_SYMBOL(io_schedule_timeout); -void io_schedule(void) +void __sched io_schedule(void) { int token; From b0c792244138d3ef099e7fce978675dc4acae570 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 3 Jun 2019 12:54:24 +0100 Subject: [PATCH 10/33] sched/fair: Clean up definition of NOHZ blocked load functions cfs_rq_has_blocked() and others_have_blocked() are only used within update_blocked_averages(). The !CONFIG_FAIR_GROUP_SCHED version of the latter calls them within a #define CONFIG_NO_HZ_COMMON block, whereas the CONFIG_FAIR_GROUP_SCHED one calls them unconditionnally. As reported by Qian, the above leads to this warning in !CONFIG_NO_HZ_COMMON configs: kernel/sched/fair.c: In function 'update_blocked_averages': kernel/sched/fair.c:7750:7: warning: variable 'done' set but not used [-Wunused-but-set-variable] It wouldn't be wrong to keep cfs_rq_has_blocked() and others_have_blocked() as they are, but since their only current use is to figure out when we can stop calling update_blocked_averages() on fully decayed NOHZ idle CPUs, we can give them a new definition for !CONFIG_NO_HZ_COMMON. Change the definition of cfs_rq_has_blocked() and others_have_blocked() for !CONFIG_NO_HZ_COMMON so that the NOHZ-specific blocks of update_blocked_averages() become no-ops and the 'done' variable gets optimised out. While at it, remove the CONFIG_NO_HZ_COMMON block from the !CONFIG_FAIR_GROUP_SCHED definition of update_blocked_averages() by using the newly-introduced update_blocked_load_status() helper. No change in functionality intended. [ Additions by Peter Zijlstra. ] Reported-by: Qian Cai Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190603115424.7951-1-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7f8d477f90fe..4c8f45ed093c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7393,6 +7393,7 @@ static void attach_tasks(struct lb_env *env) rq_unlock(env->dst_rq, &rf); } +#ifdef CONFIG_NO_HZ_COMMON static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { if (cfs_rq->avg.load_avg) @@ -7420,6 +7421,19 @@ static inline bool others_have_blocked(struct rq *rq) return false; } +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +{ + rq->last_blocked_load_update_tick = jiffies; + + if (!has_blocked) + rq->has_blocked_load = 0; +} +#else +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } +static inline bool others_have_blocked(struct rq *rq) { return false; } +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) @@ -7485,11 +7499,7 @@ static void update_blocked_averages(int cpu) if (others_have_blocked(rq)) done = false; -#ifdef CONFIG_NO_HZ_COMMON - rq->last_blocked_load_update_tick = jiffies; - if (done) - rq->has_blocked_load = 0; -#endif + update_blocked_load_status(rq, !done); rq_unlock_irqrestore(rq, &rf); } @@ -7555,11 +7565,7 @@ static inline void update_blocked_averages(int cpu) update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); -#ifdef CONFIG_NO_HZ_COMMON - rq->last_blocked_load_update_tick = jiffies; - if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq)) - rq->has_blocked_load = 0; -#endif + update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq)); rq_unlock_irqrestore(rq, &rf); } From 509466b7d480bc5d22e90b9fbe6122ae0e2fbe39 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 3 Jun 2019 17:11:44 -0400 Subject: [PATCH 11/33] sched/fair: Fix "runnable_avg_yN_inv" not used warnings runnable_avg_yN_inv[] is only used in kernel/sched/pelt.c but was included in several other places because they need other macros all came from kernel/sched/sched-pelt.h which was generated by Documentation/scheduler/sched-pelt. As the result, it causes compilation a lot of warnings, kernel/sched/sched-pelt.h:4:18: warning: 'runnable_avg_yN_inv' defined but not used [-Wunused-const-variable=] kernel/sched/sched-pelt.h:4:18: warning: 'runnable_avg_yN_inv' defined but not used [-Wunused-const-variable=] kernel/sched/sched-pelt.h:4:18: warning: 'runnable_avg_yN_inv' defined but not used [-Wunused-const-variable=] ... Silence it by appending the __maybe_unused attribute for it, so all generated variables and macros can still be kept in the same file. Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/1559596304-31581-1-git-send-email-cai@lca.pw Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-pelt.c | 3 ++- kernel/sched/sched-pelt.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/scheduler/sched-pelt.c b/Documentation/scheduler/sched-pelt.c index e4219139386a..7238b355919c 100644 --- a/Documentation/scheduler/sched-pelt.c +++ b/Documentation/scheduler/sched-pelt.c @@ -20,7 +20,8 @@ void calc_runnable_avg_yN_inv(void) int i; unsigned int x; - printf("static const u32 runnable_avg_yN_inv[] = {"); + /* To silence -Wunused-but-set-variable warnings. */ + printf("static const u32 runnable_avg_yN_inv[] __maybe_unused = {"); for (i = 0; i < HALFLIFE; i++) { x = ((1UL<<32)-1)*pow(y, i); diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h index a26473674fb7..c529706bed11 100644 --- a/kernel/sched/sched-pelt.h +++ b/kernel/sched/sched-pelt.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by Documentation/scheduler/sched-pelt; do not modify. */ -static const u32 runnable_avg_yN_inv[] = { +static const u32 runnable_avg_yN_inv[] __maybe_unused = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, From aacedf26fb7601222f2452cf0a54cab4fee160c5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Jun 2019 15:39:49 +0200 Subject: [PATCH 12/33] sched/core: Optimize try_to_wake_up() for local wakeups Jens reported that significant performance can be had on some block workloads by special casing local wakeups. That is, wakeups on the current task before it schedules out. Given something like the normal wait pattern: for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (cond) break; schedule(); } __set_current_state(TASK_RUNNING); Any wakeup (on this CPU) after set_current_state() and before schedule() would benefit from this. Normal wakeups take p->pi_lock, which serializes wakeups to the same task. By eliding that we gain concurrency on: - ttwu_stat(); we already had concurrency on rq stats, this now also brings it to task stats. -ENOCARE - tracepoints; it is now possible to get multiple instances of trace_sched_waking() (and possibly trace_sched_wakeup()) for the same task. Tracers will have to learn to cope. Furthermore, p->pi_lock is used by set_special_state(), to order against TASK_RUNNING stores from other CPUs. But since this is strictly CPU local, we don't need the lock, and set_special_state()'s disabling of IRQs is sufficient. After the normal wakeup takes p->pi_lock it issues smp_mb__after_spinlock(), in order to ensure the woken task must observe prior stores before we observe the p->state. If this is CPU local, this will be satisfied with a compiler barrier, and we rely on try_to_wake_up() being a funcation call, which implies such. Since, when 'p == current', 'p->on_rq' must be true, the normal wakeup would continue into the ttwu_remote() branch, which normally is concerned with exactly this wakeup scenario, except from a remote CPU. IOW we're waking a task that is still running. In this case, we can trivially avoid taking rq->lock, all that's left from this is to set p->state. This then yields an extremely simple and fast path for 'p == current'. Reported-by: Jens Axboe Tested-by: Jens Axboe Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Qian Cai Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: gkohli@codeaurora.org Cc: hch@lst.de Cc: oleg@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cd047927f707..83bd6bb32a34 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1991,6 +1991,29 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) unsigned long flags; int cpu, success = 0; + if (p == current) { + /* + * We're waking current, this means 'p->on_rq' and 'task_cpu(p) + * == smp_processor_id()'. Together this means we can special + * case the whole 'p->on_rq && ttwu_remote()' case below + * without taking any locks. + * + * In particular: + * - we rely on Program-Order guarantees for all the ordering, + * - we're serialized against set_special_state() by virtue of + * it disabling IRQs (this allows not taking ->pi_lock). + */ + if (!(p->state & state)) + return false; + + success = 1; + cpu = task_cpu(p); + trace_sched_waking(p); + p->state = TASK_RUNNING; + trace_sched_wakeup(p); + goto out; + } + /* * If we are going to wake up a thread waiting for CONDITION we * need to ensure that CONDITION=1 done by the caller can not be @@ -2000,7 +2023,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); if (!(p->state & state)) - goto out; + goto unlock; trace_sched_waking(p); @@ -2030,7 +2053,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) - goto stat; + goto unlock; #ifdef CONFIG_SMP /* @@ -2090,10 +2113,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); -stat: - ttwu_stat(p, cpu, wake_flags); -out: +unlock: raw_spin_unlock_irqrestore(&p->pi_lock, flags); +out: + if (success) + ttwu_stat(p, cpu, wake_flags); return success; } From 66567fcbaecac455caa1b13643155d686b51ce63 Mon Sep 17 00:00:00 2001 From: "bsegall@google.com" Date: Thu, 6 Jun 2019 10:21:01 -0700 Subject: [PATCH 13/33] sched/fair: Don't push cfs_bandwith slack timers forward When a cfs_rq sleeps and returns its quota, we delay for 5ms before waking any throttled cfs_rqs to coalesce with other cfs_rqs going to sleep, as this has to be done outside of the rq lock we hold. The current code waits for 5ms without any sleeps, instead of waiting for 5ms from the first sleep, which can delay the unthrottle more than we want. Switch this around so that we can't push this forward forever. This requires an extra flag rather than using hrtimer_active, since we need to start a new timer if the current one is in the process of finishing. Signed-off-by: Ben Segall Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Xunlei Pang Acked-by: Phil Auld Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/xm26a7euy6iq.fsf_-_@bsegall-linux.svl.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +++++++ kernel/sched/sched.h | 8 ++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4c8f45ed093c..3c11dcdedcbc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4729,6 +4729,11 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) if (runtime_refresh_within(cfs_b, min_left)) return; + /* don't push forwards an existing deferred unthrottle */ + if (cfs_b->slack_started) + return; + cfs_b->slack_started = true; + hrtimer_start(&cfs_b->slack_timer, ns_to_ktime(cfs_bandwidth_slack_period), HRTIMER_MODE_REL); @@ -4782,6 +4787,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* confirm we're still not at a refresh boundary */ raw_spin_lock_irqsave(&cfs_b->lock, flags); + cfs_b->slack_started = false; if (cfs_b->distribute_running) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return; @@ -4945,6 +4951,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; cfs_b->distribute_running = 0; + cfs_b->slack_started = false; } static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 607859a18b2a..b08dee29ef5e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -338,8 +338,10 @@ struct cfs_bandwidth { u64 runtime_expires; int expires_seq; - short idle; - short period_active; + u8 idle; + u8 period_active; + u8 distribute_running; + u8 slack_started; struct hrtimer period_timer; struct hrtimer slack_timer; struct list_head throttled_cfs_rq; @@ -348,8 +350,6 @@ struct cfs_bandwidth { int nr_periods; int nr_throttled; u64 throttled_time; - - bool distribute_running; #endif }; From 8ec59c0f5f4966f89f4e3e3cab81710c7fa959d0 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 17 Jun 2019 17:00:17 +0200 Subject: [PATCH 14/33] sched/topology: Remove unused 'sd' parameter from arch_scale_cpu_capacity() The 'struct sched_domain *sd' parameter to arch_scale_cpu_capacity() is unused since commit: 765d0af19f5f ("sched/topology: Remove the ::smt_gain field from 'struct sched_domain'") Remove it. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Viresh Kumar Reviewed-by: Valentin Schneider Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: gregkh@linuxfoundation.org Cc: linux@armlinux.org.uk Cc: quentin.perret@arm.com Cc: rafael@kernel.org Link: https://lkml.kernel.org/r/1560783617-5827-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- arch/arm/kernel/topology.c | 2 +- drivers/base/arch_topology.c | 6 +++--- include/linux/arch_topology.h | 2 +- include/linux/energy_model.h | 2 +- include/linux/sched/topology.h | 14 +++----------- kernel/power/energy_model.c | 2 +- kernel/sched/cpufreq_schedutil.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 6 +++--- kernel/sched/pelt.c | 2 +- kernel/sched/pelt.h | 2 +- kernel/sched/sched.h | 2 +- kernel/sched/topology.c | 8 ++++---- 13 files changed, 22 insertions(+), 30 deletions(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 60e375ce1ab2..d17cb1e6d679 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -169,7 +169,7 @@ static void update_cpu_capacity(unsigned int cpu) topology_set_cpu_scale(cpu, cpu_capacity(cpu) / middle_capacity); pr_info("CPU%u: update cpu_capacity %lu\n", - cpu, topology_get_cpu_scale(NULL, cpu)); + cpu, topology_get_cpu_scale(cpu)); } #else diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 1739d7e1952a..9b09e31ae82f 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -43,7 +43,7 @@ static ssize_t cpu_capacity_show(struct device *dev, { struct cpu *cpu = container_of(dev, struct cpu, dev); - return sprintf(buf, "%lu\n", topology_get_cpu_scale(NULL, cpu->dev.id)); + return sprintf(buf, "%lu\n", topology_get_cpu_scale(cpu->dev.id)); } static void update_topology_flags_workfn(struct work_struct *work); @@ -116,7 +116,7 @@ void topology_normalize_cpu_scale(void) / capacity_scale; topology_set_cpu_scale(cpu, capacity); pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n", - cpu, topology_get_cpu_scale(NULL, cpu)); + cpu, topology_get_cpu_scale(cpu)); } } @@ -185,7 +185,7 @@ init_cpu_capacity_callback(struct notifier_block *nb, cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus); for_each_cpu(cpu, policy->related_cpus) { - raw_capacity[cpu] = topology_get_cpu_scale(NULL, cpu) * + raw_capacity[cpu] = topology_get_cpu_scale(cpu) * policy->cpuinfo.max_freq / 1000UL; capacity_scale = max(raw_capacity[cpu], capacity_scale); } diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index d9bdc1a7f4e7..1cfe05ea1d89 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -18,7 +18,7 @@ DECLARE_PER_CPU(unsigned long, cpu_scale); struct sched_domain; static inline -unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu) +unsigned long topology_get_cpu_scale(int cpu) { return per_cpu(cpu_scale, cpu); } diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index aa027f7bcb3e..73f8c3cb9588 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -89,7 +89,7 @@ static inline unsigned long em_pd_energy(struct em_perf_domain *pd, * like schedutil. */ cpu = cpumask_first(to_cpumask(pd->cpus)); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + scale_cpu = arch_scale_cpu_capacity(cpu); cs = &pd->table[pd->nr_cap_states - 1]; freq = map_util_freq(max_util, cs->frequency, scale_cpu); diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 53afbe07354a..e445d3767cdd 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -196,14 +196,6 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl); # define SD_INIT_NAME(type) #endif -#ifndef arch_scale_cpu_capacity -static __always_inline -unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - return SCHED_CAPACITY_SCALE; -} -#endif - #else /* CONFIG_SMP */ struct sched_domain_attr; @@ -219,16 +211,16 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) return true; } +#endif /* !CONFIG_SMP */ + #ifndef arch_scale_cpu_capacity static __always_inline -unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) +unsigned long arch_scale_cpu_capacity(int cpu) { return SCHED_CAPACITY_SCALE; } #endif -#endif /* !CONFIG_SMP */ - static inline int task_node(const struct task_struct *p) { return cpu_to_node(task_cpu(p)); diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 7d66ee68aaaf..0a9326f5f421 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -223,7 +223,7 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, * All CPUs of a domain must have the same micro-architecture * since they all share the same table. */ - cap = arch_scale_cpu_capacity(NULL, cpu); + cap = arch_scale_cpu_capacity(cpu); if (prev_cap && prev_cap != cap) { pr_err("CPUs of %*pbl must have the same capacity\n", cpumask_pr_args(span)); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 962cf343f798..7c4ce69067c4 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -276,7 +276,7 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); unsigned long util = cpu_util_cfs(rq); - unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu); sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index c1ef30861068..8b5bb2ac16e2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1195,7 +1195,7 @@ static void update_curr_dl(struct rq *rq) &curr->dl); } else { unsigned long scale_freq = arch_scale_freq_capacity(cpu); - unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + unsigned long scale_cpu = arch_scale_cpu_capacity(cpu); scaled_delta_exec = cap_scale(delta_exec, scale_freq); scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c11dcdedcbc..4f8754157763 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -764,7 +764,7 @@ void post_init_entity_util_avg(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; - long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; if (cap > 0) { @@ -7646,7 +7646,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(sd, cpu); + unsigned long max = arch_scale_cpu_capacity(cpu); unsigned long used, free; unsigned long irq; @@ -7671,7 +7671,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) unsigned long capacity = scale_rt_capacity(sd, cpu); struct sched_group *sdg = sd->groups; - cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); + cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); if (!capacity) capacity = 1; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index befce29bd882..42ea66b07b1d 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -366,7 +366,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) * reflect the real amount of computation */ running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); - running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq))); + running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq))); /* * We know the time that has been used by interrupt since last update diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 7489d5f56960..afff644da065 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -79,7 +79,7 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) * Scale the elapsed time to reflect the real amount of * computation */ - delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq))); + delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq))); delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); rq->clock_pelt += delta; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b08dee29ef5e..e58ab597ec88 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2248,7 +2248,7 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) { - unsigned long max = arch_scale_cpu_capacity(NULL, cpu); + unsigned long max = arch_scale_cpu_capacity(cpu); return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 63184cf0d0d7..f751ce0b783e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1874,10 +1874,10 @@ static struct sched_domain_topology_level unsigned long cap; /* Is there any asymmetry? */ - cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); + cap = arch_scale_cpu_capacity(cpumask_first(cpu_map)); for_each_cpu(i, cpu_map) { - if (arch_scale_cpu_capacity(NULL, i) != cap) { + if (arch_scale_cpu_capacity(i) != cap) { asym = true; break; } @@ -1892,7 +1892,7 @@ static struct sched_domain_topology_level * to everyone. */ for_each_cpu(i, cpu_map) { - unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); + unsigned long max_capacity = arch_scale_cpu_capacity(i); int tl_id = 0; for_each_sd_topology(tl) { @@ -1902,7 +1902,7 @@ static struct sched_domain_topology_level for_each_cpu_and(j, tl->mask(i), cpu_map) { unsigned long capacity; - capacity = arch_scale_cpu_capacity(NULL, j); + capacity = arch_scale_cpu_capacity(j); if (capacity <= max_capacity) continue; From 016190a4b5824df2d5bb97951a04dd3629973671 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Jun 2019 15:29:07 +0300 Subject: [PATCH 15/33] sched/wait: Deduplicate code with do-while Statements in the loop's body and before it are identical. Use do-while to not repeat it. Signed-off-by: Pavel Begunkov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/43ffea6ee2152b90dedf962eac851609e4197218.1560256112.git.asml.silence@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/wait.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index fa0f9adfb752..c1e566a114ca 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -118,16 +118,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int bookmark.func = NULL; INIT_LIST_HEAD(&bookmark.entry); - spin_lock_irqsave(&wq_head->lock, flags); - nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark); - spin_unlock_irqrestore(&wq_head->lock, flags); - - while (bookmark.flags & WQ_FLAG_BOOKMARK) { + do { spin_lock_irqsave(&wq_head->lock, flags); nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark); spin_unlock_irqrestore(&wq_head->lock, flags); - } + } while (bookmark.flags & WQ_FLAG_BOOKMARK); } /** From 9ba5090aecac08ff3ae54ac3bd94b61db7708ffc Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 4 Jun 2019 12:14:54 +0100 Subject: [PATCH 16/33] sched/autogroup: Make autogroup_path() always available Remove the #ifdef CONFIG_SCHED_DEBUG. Some of the tracepoints to be introduced in later patches need to access this function. Hence make it always available since the tracepoints are not protected by CONFIG_SCHED_DEBUG. Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Pavankumar Kondeti Cc: Peter Zijlstra Cc: Quentin Perret Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Uwe Kleine-Konig Link: https://lkml.kernel.org/r/20190604111459.2862-2-qais.yousef@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/autogroup.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 2d4ff5353ded..2067080bb235 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -259,7 +259,6 @@ out: } #endif /* CONFIG_PROC_FS */ -#ifdef CONFIG_SCHED_DEBUG int autogroup_path(struct task_group *tg, char *buf, int buflen) { if (!task_group_is_autogroup(tg)) @@ -267,4 +266,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); } -#endif From 3c93a0c04dfdcba199982b53b97488b1b1d90eff Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 4 Jun 2019 12:14:55 +0100 Subject: [PATCH 17/33] sched/debug: Add a new sched_trace_*() helper functions The new functions allow modules to access internal data structures of unexported struct cfs_rq and struct rq to extract important information from the tracepoints to be introduced in later patches. While at it fix alphabetical order of struct declarations in sched.h Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Pavankumar Kondeti Cc: Peter Zijlstra Cc: Quentin Perret Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Uwe Kleine-Konig Link: https://lkml.kernel.org/r/20190604111459.2862-3-qais.yousef@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 16 ++++++- kernel/sched/fair.c | 99 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 1b2590a8d038..044c023875e8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -35,6 +35,7 @@ struct audit_context; struct backing_dev_info; struct bio_list; struct blk_plug; +struct capture_control; struct cfs_rq; struct fs_struct; struct futex_pi_state; @@ -47,8 +48,9 @@ struct pid_namespace; struct pipe_inode_info; struct rcu_node; struct reclaim_state; -struct capture_control; struct robust_list_head; +struct root_domain; +struct rq; struct sched_attr; struct sched_param; struct seq_file; @@ -1920,4 +1922,16 @@ static inline void rseq_syscall(struct pt_regs *regs) #endif +const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq); +char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len); +int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq); + +const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq); +const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq); +const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq); + +int sched_trace_rq_cpu(struct rq *rq); + +const struct cpumask *sched_trace_rd_span(struct root_domain *rd); + #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4f8754157763..461c3e9a67b2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -275,6 +275,19 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } +static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) +{ + if (!path) + return; + + if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) + autogroup_path(cfs_rq->tg, path, len); + else if (cfs_rq && cfs_rq->tg->css.cgroup) + cgroup_path(cfs_rq->tg->css.cgroup, path, len); + else + strlcpy(path, "(null)", len); +} + static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -449,6 +462,12 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return NULL; } +static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) +{ + if (path) + strlcpy(path, "(null)", len); +} + static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { return true; @@ -10408,3 +10427,83 @@ __init void init_sched_fair_class(void) #endif /* SMP */ } + +/* + * Helper functions to facilitate extracting info from tracepoints. + */ + +const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) +{ +#ifdef CONFIG_SMP + return cfs_rq ? &cfs_rq->avg : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); + +char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) +{ + if (!cfs_rq) { + if (str) + strlcpy(str, "(null)", len); + else + return NULL; + } + + cfs_rq_tg_path(cfs_rq, str, len); + return str; +} +EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); + +int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) +{ + return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); + +const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq ? &rq->avg_rt : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); + +const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq ? &rq->avg_dl : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); + +const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) + return rq ? &rq->avg_irq : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); + +int sched_trace_rq_cpu(struct rq *rq) +{ + return rq ? cpu_of(rq) : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); + +const struct cpumask *sched_trace_rd_span(struct root_domain *rd) +{ +#ifdef CONFIG_SMP + return rd ? rd->span : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_rd_span); From ba19f51fcb549c7ee6261da243eea55a47e98d78 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 4 Jun 2019 12:14:56 +0100 Subject: [PATCH 18/33] sched/debug: Add new tracepoints to track PELT at rq level The new tracepoints allow tracking PELT signals at rq level for all scheduling classes + irq. Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Pavankumar Kondeti Cc: Peter Zijlstra Cc: Quentin Perret Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Uwe Kleine-Konig Link: https://lkml.kernel.org/r/20190604111459.2862-4-qais.yousef@arm.com Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 23 +++++++++++++++++++++++ kernel/sched/fair.c | 6 ++++++ kernel/sched/pelt.c | 9 ++++++++- 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c8c7c7efb487..520b89d384ec 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -594,6 +594,29 @@ TRACE_EVENT(sched_wake_idle_without_ipi, TP_printk("cpu=%d", __entry->cpu) ); + +/* + * Following tracepoints are not exported in tracefs and provide hooking + * mechanisms only for testing and debugging purposes. + * + * Postfixed with _tp to make them easily identifiable in the code. + */ +DECLARE_TRACE(pelt_cfs_tp, + TP_PROTO(struct cfs_rq *cfs_rq), + TP_ARGS(cfs_rq)); + +DECLARE_TRACE(pelt_rt_tp, + TP_PROTO(struct rq *rq), + TP_ARGS(rq)); + +DECLARE_TRACE(pelt_dl_tp, + TP_PROTO(struct rq *rq), + TP_ARGS(rq)); + +DECLARE_TRACE(pelt_irq_tp, + TP_PROTO(struct rq *rq), + TP_ARGS(rq)); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 461c3e9a67b2..e883d7e17e36 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3347,6 +3347,8 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) update_tg_cfs_util(cfs_rq, se, gcfs_rq); update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); + trace_pelt_cfs_tp(cfs_rq); + return 1; } @@ -3499,6 +3501,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); cfs_rq_util_change(cfs_rq, flags); + + trace_pelt_cfs_tp(cfs_rq); } /** @@ -3518,6 +3522,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); cfs_rq_util_change(cfs_rq, 0); + + trace_pelt_cfs_tp(cfs_rq); } /* diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 42ea66b07b1d..4e961b55b5ea 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -28,6 +28,8 @@ #include "sched.h" #include "pelt.h" +#include + /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) @@ -292,6 +294,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) cfs_rq->curr != NULL)) { ___update_load_avg(&cfs_rq->avg, 1, 1); + trace_pelt_cfs_tp(cfs_rq); return 1; } @@ -317,6 +320,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) running)) { ___update_load_avg(&rq->avg_rt, 1, 1); + trace_pelt_rt_tp(rq); return 1; } @@ -340,6 +344,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) running)) { ___update_load_avg(&rq->avg_dl, 1, 1); + trace_pelt_dl_tp(rq); return 1; } @@ -388,8 +393,10 @@ int update_irq_load_avg(struct rq *rq, u64 running) 1, 1); - if (ret) + if (ret) { ___update_load_avg(&rq->avg_irq, 1, 1); + trace_pelt_irq_tp(rq); + } return ret; } From 8de6242cca17d9299e654e29c966d8612d397272 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 4 Jun 2019 12:14:57 +0100 Subject: [PATCH 19/33] sched/debug: Add new tracepoint to track PELT at se level The new tracepoint allows tracking PELT signals at sched_entity level. Which is supported in CFS tasks and taskgroups only. Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Pavankumar Kondeti Cc: Peter Zijlstra Cc: Quentin Perret Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Uwe Kleine-Konig Link: https://lkml.kernel.org/r/20190604111459.2862-5-qais.yousef@arm.com Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 4 ++++ kernel/sched/fair.c | 1 + kernel/sched/pelt.c | 2 ++ 3 files changed, 7 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 520b89d384ec..c7dd9bc7f001 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -617,6 +617,10 @@ DECLARE_TRACE(pelt_irq_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); +DECLARE_TRACE(pelt_se_tp, + TP_PROTO(struct sched_entity *se), + TP_ARGS(se)); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e883d7e17e36..75218ab1fa07 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3348,6 +3348,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); trace_pelt_cfs_tp(cfs_rq); + trace_pelt_se_tp(se); return 1; } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 4e961b55b5ea..a96db50d40e0 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -267,6 +267,7 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) { if (___update_load_sum(now, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + trace_pelt_se_tp(se); return 1; } @@ -280,6 +281,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); cfs_se_util_change(&se->avg); + trace_pelt_se_tp(se); return 1; } From f9f240f96efc5bcec62379eac701523e11fbb45b Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 4 Jun 2019 12:14:58 +0100 Subject: [PATCH 20/33] sched/debug: Add sched_overutilized tracepoint The new tracepoint allows us to track the changes in overutilized status. Overutilized status is associated with EAS. It indicates that the system is in high performance state. EAS is disabled when the system is in this state since there's not much energy savings while high performance tasks are pushing the system to the limit and it's better to default to the spreading behavior of the scheduler. This tracepoint helps understanding and debugging the conditions under which this happens. Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Pavankumar Kondeti Cc: Peter Zijlstra Cc: Quentin Perret Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Uwe Kleine-Konig Link: https://lkml.kernel.org/r/20190604111459.2862-6-qais.yousef@arm.com Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 4 ++++ kernel/sched/fair.c | 10 ++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c7dd9bc7f001..420e80e56e55 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -621,6 +621,10 @@ DECLARE_TRACE(pelt_se_tp, TP_PROTO(struct sched_entity *se), TP_ARGS(se)); +DECLARE_TRACE(sched_overutilized_tp, + TP_PROTO(struct root_domain *rd, bool overutilized), + TP_ARGS(rd, overutilized)); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 75218ab1fa07..11ec52709323 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5181,8 +5181,10 @@ static inline bool cpu_overutilized(int cpu) static inline void update_overutilized_status(struct rq *rq) { - if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) + if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); + trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); + } } #else static inline void update_overutilized_status(struct rq *rq) { } @@ -8214,8 +8216,12 @@ next_group: /* Update over-utilization (tipping point, U >= 0) indicator */ WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); + trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED); } else if (sg_status & SG_OVERUTILIZED) { - WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED); + struct root_domain *rd = env->dst_rq->rd; + + WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); + trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); } } From a056a5bed7fa67706574b00cf1122c38596b2be1 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 4 Jun 2019 12:14:59 +0100 Subject: [PATCH 21/33] sched/debug: Export the newly added tracepoints So that external modules can hook into them and extract the info they need. Since these new tracepoints have no events associated with them exporting these tracepoints make them useful for external modules to perform testing and debugging. There's no other way otherwise to access them. BPF doesn't have infrastructure to access these bare tracepoints either. Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Pavankumar Kondeti Cc: Peter Zijlstra Cc: Quentin Perret Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Uwe Kleine-Konig Link: https://lkml.kernel.org/r/20190604111459.2862-7-qais.yousef@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 83bd6bb32a34..e5e02d23e693 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -23,6 +23,17 @@ #define CREATE_TRACE_POINTS #include +/* + * Export tracepoints that act as a bare tracehook (ie: have no trace event + * associated with them) to allow external modules to probe them. + */ +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); + DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) From a3df067974c52df936f548ed218120f623c4c560 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 18 Jun 2019 14:23:10 +0200 Subject: [PATCH 22/33] sched/fair: Rename weighted_cpuload() to cpu_runnable_load() The term 'weighted' is not needed since there is no 'unweighted' load. Instead use the term 'runnable' to distinguish 'runnable' load (avg.runnable_load_avg) used in load balance from load (avg.load_avg) which is the sum of 'runnable' and 'blocked' load. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rik van Riel Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lkml.kernel.org/r/57f27a7f-2775-d832-e965-0f4d51bb1954@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 11ec52709323..3bdcd3c718bc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1485,7 +1485,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long weighted_cpuload(struct rq *rq); +static unsigned long cpu_runnable_load(struct rq *rq); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -1506,7 +1506,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) for_each_cpu(cpu, cpumask_of_node(nid)) { struct rq *rq = cpu_rq(cpu); - ns->load += weighted_cpuload(rq); + ns->load += cpu_runnable_load(rq); ns->compute_capacity += capacity_of(cpu); } @@ -5366,7 +5366,7 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ -static unsigned long weighted_cpuload(struct rq *rq) +static unsigned long cpu_runnable_load(struct rq *rq) { return cfs_rq_runnable_load_avg(&rq->cfs); } @@ -5380,7 +5380,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = weighted_cpuload(rq); + unsigned long load_avg = cpu_runnable_load(rq); if (nr_running) return load_avg / nr_running; @@ -5478,7 +5478,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, s64 this_eff_load, prev_eff_load; unsigned long task_load; - this_eff_load = weighted_cpuload(cpu_rq(this_cpu)); + this_eff_load = cpu_runnable_load(cpu_rq(this_cpu)); if (sync) { unsigned long current_load = task_h_load(current); @@ -5496,7 +5496,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, this_eff_load *= 100; this_eff_load *= capacity_of(prev_cpu); - prev_eff_load = weighted_cpuload(cpu_rq(prev_cpu)); + prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu)); prev_eff_load -= task_load; if (sched_feat(WA_BIAS)) prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; @@ -5584,7 +5584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, max_spare_cap = 0; for_each_cpu(i, sched_group_span(group)) { - load = weighted_cpuload(cpu_rq(i)); + load = cpu_runnable_load(cpu_rq(i)); runnable_load += load; avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); @@ -5720,7 +5720,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { - load = weighted_cpuload(cpu_rq(i)); + load = cpu_runnable_load(cpu_rq(i)); if (load < min_load) { min_load = load; least_loaded_cpu = i; @@ -7291,7 +7291,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) static const unsigned int sched_nr_migrate_break = 32; /* - * detach_tasks() -- tries to detach up to imbalance weighted load from + * detach_tasks() -- tries to detach up to imbalance runnable load from * busiest_rq, as part of a balancing operation within domain "sd". * * Returns number of detached tasks if successful and 0 otherwise. @@ -7359,7 +7359,7 @@ static int detach_tasks(struct lb_env *env) /* * We only want to steal up to the prescribed amount of - * weighted load. + * runnable load. */ if (env->imbalance <= 0) break; @@ -7969,7 +7969,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) env->flags |= LBF_NOHZ_AGAIN; - sgs->group_load += weighted_cpuload(rq); + sgs->group_load += cpu_runnable_load(rq); sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; @@ -8427,7 +8427,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. * - * Also calculates the amount of weighted load which should be moved + * Also calculates the amount of runnable load which should be moved * to restore balance. * * @env: The load balancing environment. @@ -8546,7 +8546,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_span(group), env->cpus) { - unsigned long capacity, wl; + unsigned long capacity, load; enum fbq_type rt; rq = cpu_rq(i); @@ -8600,30 +8600,30 @@ static struct rq *find_busiest_queue(struct lb_env *env, rq->nr_running == 1) continue; - wl = weighted_cpuload(rq); + load = cpu_runnable_load(rq); /* - * When comparing with imbalance, use weighted_cpuload() + * When comparing with imbalance, use cpu_runnable_load() * which is not scaled with the CPU capacity. */ - if (rq->nr_running == 1 && wl > env->imbalance && + if (rq->nr_running == 1 && load > env->imbalance && !check_cpu_capacity(rq, env->sd)) continue; /* * For the load comparisons with the other CPU's, consider - * the weighted_cpuload() scaled with the CPU capacity, so + * the cpu_runnable_load() scaled with the CPU capacity, so * that the load can be moved away from the CPU that is * potentially running at a lower capacity. * - * Thus we're looking for max(wl_i / capacity_i), crosswise + * Thus we're looking for max(load_i / capacity_i), crosswise * multiplication to rid ourselves of the division works out - * to: wl_i * capacity_j > wl_j * capacity_i; where j is + * to: load_i * capacity_j > load_j * capacity_i; where j is * our previous maximum. */ - if (wl * busiest_capacity > busiest_load * capacity) { - busiest_load = wl; + if (load * busiest_capacity > busiest_load * capacity) { + busiest_load = load; busiest_capacity = capacity; busiest = rq; } From 69842cba9ace84849bb9b8edcdf2cefccd97901c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:02 +0100 Subject: [PATCH 23/33] sched/uclamp: Add CPU's clamp buckets refcounting Utilization clamping allows to clamp the CPU's utilization within a [util_min, util_max] range, depending on the set of RUNNABLE tasks on that CPU. Each task references two "clamp buckets" defining its minimum and maximum (util_{min,max}) utilization "clamp values". A CPU's clamp bucket is active if there is at least one RUNNABLE tasks enqueued on that CPU and refcounting that bucket. When a task is {en,de}queued {on,from} a rq, the set of active clamp buckets on that CPU can change. If the set of active clamp buckets changes for a CPU a new "aggregated" clamp value is computed for that CPU. This is because each clamp bucket enforces a different utilization clamp value. Clamp values are always MAX aggregated for both util_min and util_max. This ensures that no task can affect the performance of other co-scheduled tasks which are more boosted (i.e. with higher util_min clamp) or less capped (i.e. with higher util_max clamp). A task has: task_struct::uclamp[clamp_id]::bucket_id to track the "bucket index" of the CPU's clamp bucket it refcounts while enqueued, for each clamp index (clamp_id). A runqueue has: rq::uclamp[clamp_id]::bucket[bucket_id].tasks to track how many RUNNABLE tasks on that CPU refcount each clamp bucket (bucket_id) of a clamp index (clamp_id). It also has a: rq::uclamp[clamp_id]::bucket[bucket_id].value to track the clamp value of each clamp bucket (bucket_id) of a clamp index (clamp_id). The rq::uclamp::bucket[clamp_id][] array is scanned every time it's needed to find a new MAX aggregated clamp value for a clamp_id. This operation is required only when it's dequeued the last task of a clamp bucket tracking the current MAX aggregated clamp value. In this case, the CPU is either entering IDLE or going to schedule a less boosted or more clamped task. The expected number of different clamp values configured at build time is small enough to fit the full unordered array into a single cache line, for configurations of up to 7 buckets. Add to struct rq the basic data structures required to refcount the number of RUNNABLE tasks for each clamp bucket. Add also the max aggregation required to update the rq's clamp value at each enqueue/dequeue event. Use a simple linear mapping of clamp values into clamp buckets. Pre-compute and cache bucket_id to avoid integer divisions at enqueue/dequeue time. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-2-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- include/linux/log2.h | 34 +++++++ include/linux/sched.h | 39 ++++++++ include/linux/sched/topology.h | 6 -- init/Kconfig | 53 +++++++++++ kernel/sched/core.c | 166 +++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 51 ++++++++++ 6 files changed, 343 insertions(+), 6 deletions(-) diff --git a/include/linux/log2.h b/include/linux/log2.h index 1aec01365ed4..83a4a3ca3e8a 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -220,4 +220,38 @@ int __order_base_2(unsigned long n) ilog2((n) - 1) + 1) : \ __order_base_2(n) \ ) + +static inline __attribute__((const)) +int __bits_per(unsigned long n) +{ + if (n < 2) + return 1; + if (is_power_of_2(n)) + return order_base_2(n) + 1; + return order_base_2(n); +} + +/** + * bits_per - calculate the number of bits required for the argument + * @n: parameter + * + * This is constant-capable and can be used for compile time + * initializations, e.g bitfields. + * + * The first few values calculated by this routine: + * bf(0) = 1 + * bf(1) = 1 + * bf(2) = 2 + * bf(3) = 2 + * bf(4) = 3 + * ... and so on. + */ +#define bits_per(n) \ +( \ + __builtin_constant_p(n) ? ( \ + ((n) == 0 || (n) == 1) \ + ? 1 : ilog2(n) + 1 \ + ) : \ + __bits_per(n) \ +) #endif /* _LINUX_LOG2_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 044c023875e8..80235bcd05f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -283,6 +283,18 @@ struct vtime { u64 gtime; }; +/* + * Utilization clamp constraints. + * @UCLAMP_MIN: Minimum utilization + * @UCLAMP_MAX: Maximum utilization + * @UCLAMP_CNT: Utilization clamp constraints count + */ +enum uclamp_id { + UCLAMP_MIN = 0, + UCLAMP_MAX, + UCLAMP_CNT +}; + struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ @@ -314,6 +326,10 @@ struct sched_info { # define SCHED_FIXEDPOINT_SHIFT 10 # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) +/* Increase resolution of cpu_capacity calculations */ +# define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT +# define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) + struct load_weight { unsigned long weight; u32 inv_weight; @@ -562,6 +578,25 @@ struct sched_dl_entity { struct hrtimer inactive_timer; }; +#ifdef CONFIG_UCLAMP_TASK +/* Number of utilization clamp buckets (shorter alias) */ +#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT + +/* + * Utilization clamp for a scheduling entity + * @value: clamp value "assigned" to a se + * @bucket_id: bucket index corresponding to the "assigned" value + * + * The bucket_id is the index of the clamp bucket matching the clamp value + * which is pre-computed and stored to avoid expensive integer divisions from + * the fast path. + */ +struct uclamp_se { + unsigned int value : bits_per(SCHED_CAPACITY_SCALE); + unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); +}; +#endif /* CONFIG_UCLAMP_TASK */ + union rcu_special { struct { u8 blocked; @@ -642,6 +677,10 @@ struct task_struct { #endif struct sched_dl_entity dl; +#ifdef CONFIG_UCLAMP_TASK + struct uclamp_se uclamp[UCLAMP_CNT]; +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index e445d3767cdd..7863bb62d2ab 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -6,12 +6,6 @@ #include -/* - * Increase resolution of cpu_capacity calculations - */ -#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT -#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) - /* * sched-domains (multiprocessor balancing) declarations: */ diff --git a/init/Kconfig b/init/Kconfig index 0e2344389501..c88289c18d59 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -677,6 +677,59 @@ config HAVE_UNSTABLE_SCHED_CLOCK config GENERIC_SCHED_CLOCK bool +menu "Scheduler features" + +config UCLAMP_TASK + bool "Enable utilization clamping for RT/FAIR tasks" + depends on CPU_FREQ_GOV_SCHEDUTIL + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks scheduled on that CPU. + + With this option, the user can specify the min and max CPU + utilization allowed for RUNNABLE tasks. The max utilization defines + the maximum frequency a task should use while the min utilization + defines the minimum frequency it should use. + + Both min and max utilization clamp values are hints to the scheduler, + aiming at improving its frequency selection policy, but they do not + enforce or grant any specific bandwidth for tasks. + + If in doubt, say N. + +config UCLAMP_BUCKETS_COUNT + int "Number of supported utilization clamp buckets" + range 5 20 + default 5 + depends on UCLAMP_TASK + help + Defines the number of clamp buckets to use. The range of each bucket + will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the + number of clamp buckets the finer their granularity and the higher + the precision of clamping aggregation and tracking at run-time. + + For example, with the minimum configuration value we will have 5 + clamp buckets tracking 20% utilization each. A 25% boosted tasks will + be refcounted in the [20..39]% bucket and will set the bucket clamp + effective value to 25%. + If a second 30% boosted task should be co-scheduled on the same CPU, + that task will be refcounted in the same bucket of the first task and + it will boost the bucket clamp effective value to 30%. + The clamp effective value of a bucket is reset to its nominal value + (20% in the example above) when there are no more tasks refcounted in + that bucket. + + An additional boost/capping margin can be added to some tasks. In the + example above the 25% task will be boosted to 30% until it exits the + CPU. If that should be considered not acceptable on certain systems, + it's always possible to reduce the margin by increasing the number of + clamp buckets to trade off used memory for run-time tracking + precision. + + If in doubt, use the default value. + +endmenu + # # For architectures that want to enable the support for NUMA-affine scheduler # balancing logic: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e5e02d23e693..d8c1e67afd82 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -772,6 +772,168 @@ static void set_load_weight(struct task_struct *p, bool update_load) } } +#ifdef CONFIG_UCLAMP_TASK + +/* Integer rounded range for each bucket */ +#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) + +#define for_each_clamp_id(clamp_id) \ + for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) + +static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) +{ + return clamp_value / UCLAMP_BUCKET_DELTA; +} + +static inline unsigned int uclamp_none(int clamp_id) +{ + if (clamp_id == UCLAMP_MIN) + return 0; + return SCHED_CAPACITY_SCALE; +} + +static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value) +{ + uc_se->value = value; + uc_se->bucket_id = uclamp_bucket_id(value); +} + +static inline +unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id) +{ + struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; + int bucket_id = UCLAMP_BUCKETS - 1; + + /* + * Since both min and max clamps are max aggregated, find the + * top most bucket with tasks in. + */ + for ( ; bucket_id >= 0; bucket_id--) { + if (!bucket[bucket_id].tasks) + continue; + return bucket[bucket_id].value; + } + + /* No tasks -- default clamp values */ + return uclamp_none(clamp_id); +} + +/* + * When a task is enqueued on a rq, the clamp bucket currently defined by the + * task's uclamp::bucket_id is refcounted on that rq. This also immediately + * updates the rq's clamp value if required. + */ +static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, + unsigned int clamp_id) +{ + struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; + struct uclamp_se *uc_se = &p->uclamp[clamp_id]; + struct uclamp_bucket *bucket; + + lockdep_assert_held(&rq->lock); + + bucket = &uc_rq->bucket[uc_se->bucket_id]; + bucket->tasks++; + + if (uc_se->value > READ_ONCE(uc_rq->value)) + WRITE_ONCE(uc_rq->value, bucket->value); +} + +/* + * When a task is dequeued from a rq, the clamp bucket refcounted by the task + * is released. If this is the last task reference counting the rq's max + * active clamp value, then the rq's clamp value is updated. + * + * Both refcounted tasks and rq's cached clamp values are expected to be + * always valid. If it's detected they are not, as defensive programming, + * enforce the expected state and warn. + */ +static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, + unsigned int clamp_id) +{ + struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; + struct uclamp_se *uc_se = &p->uclamp[clamp_id]; + struct uclamp_bucket *bucket; + unsigned int rq_clamp; + + lockdep_assert_held(&rq->lock); + + bucket = &uc_rq->bucket[uc_se->bucket_id]; + SCHED_WARN_ON(!bucket->tasks); + if (likely(bucket->tasks)) + bucket->tasks--; + + if (likely(bucket->tasks)) + return; + + rq_clamp = READ_ONCE(uc_rq->value); + /* + * Defensive programming: this should never happen. If it happens, + * e.g. due to future modification, warn and fixup the expected value. + */ + SCHED_WARN_ON(bucket->value > rq_clamp); + if (bucket->value >= rq_clamp) + WRITE_ONCE(uc_rq->value, uclamp_rq_max_value(rq, clamp_id)); +} + +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) +{ + unsigned int clamp_id; + + if (unlikely(!p->sched_class->uclamp_enabled)) + return; + + for_each_clamp_id(clamp_id) + uclamp_rq_inc_id(rq, p, clamp_id); +} + +static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) +{ + unsigned int clamp_id; + + if (unlikely(!p->sched_class->uclamp_enabled)) + return; + + for_each_clamp_id(clamp_id) + uclamp_rq_dec_id(rq, p, clamp_id); +} + +static void __init init_uclamp(void) +{ + unsigned int clamp_id; + int cpu; + + for_each_possible_cpu(cpu) { + struct uclamp_bucket *bucket; + struct uclamp_rq *uc_rq; + unsigned int bucket_id; + + memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); + + for_each_clamp_id(clamp_id) { + uc_rq = &cpu_rq(cpu)->uclamp[clamp_id]; + + bucket_id = 1; + while (bucket_id < UCLAMP_BUCKETS) { + bucket = &uc_rq->bucket[bucket_id]; + bucket->value = bucket_id * UCLAMP_BUCKET_DELTA; + ++bucket_id; + } + } + } + + for_each_clamp_id(clamp_id) { + uclamp_se_set(&init_task.uclamp[clamp_id], + uclamp_none(clamp_id)); + } +} + +#else /* CONFIG_UCLAMP_TASK */ +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } +static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } +static inline void init_uclamp(void) { } +#endif /* CONFIG_UCLAMP_TASK */ + static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { if (!(flags & ENQUEUE_NOCLOCK)) @@ -782,6 +944,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) psi_enqueue(p, flags & ENQUEUE_WAKEUP); } + uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); } @@ -795,6 +958,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) psi_dequeue(p, flags & DEQUEUE_SLEEP); } + uclamp_rq_dec(rq, p); p->sched_class->dequeue_task(rq, p, flags); } @@ -6093,6 +6257,8 @@ void __init sched_init(void) psi_init(); + init_uclamp(); + scheduler_running = 1; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e58ab597ec88..cecc6baaba93 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -791,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work); #endif #endif /* CONFIG_SMP */ +#ifdef CONFIG_UCLAMP_TASK +/* + * struct uclamp_bucket - Utilization clamp bucket + * @value: utilization clamp value for tasks on this clamp bucket + * @tasks: number of RUNNABLE tasks on this clamp bucket + * + * Keep track of how many tasks are RUNNABLE for a given utilization + * clamp value. + */ +struct uclamp_bucket { + unsigned long value : bits_per(SCHED_CAPACITY_SCALE); + unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE); +}; + +/* + * struct uclamp_rq - rq's utilization clamp + * @value: currently active clamp values for a rq + * @bucket: utilization clamp buckets affecting a rq + * + * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values. + * A clamp value is affecting a rq when there is at least one task RUNNABLE + * (or actually running) with that value. + * + * There are up to UCLAMP_CNT possible different clamp values, currently there + * are only two: minimum utilization and maximum utilization. + * + * All utilization clamping values are MAX aggregated, since: + * - for util_min: we want to run the CPU at least at the max of the minimum + * utilization required by its currently RUNNABLE tasks. + * - for util_max: we want to allow the CPU to run up to the max of the + * maximum utilization allowed by its currently RUNNABLE tasks. + * + * Since on each system we expect only a limited number of different + * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track + * the metrics required to compute all the per-rq utilization clamp values. + */ +struct uclamp_rq { + unsigned int value; + struct uclamp_bucket bucket[UCLAMP_BUCKETS]; +}; +#endif /* CONFIG_UCLAMP_TASK */ + /* * This is the main, per-CPU runqueue data structure. * @@ -825,6 +867,11 @@ struct rq { unsigned long nr_load_updates; u64 nr_switches; +#ifdef CONFIG_UCLAMP_TASK + /* Utilization clamp values based on CPU's RUNNABLE tasks */ + struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; +#endif + struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; @@ -1639,6 +1686,10 @@ extern const u32 sched_prio_to_wmult[40]; struct sched_class { const struct sched_class *next; +#ifdef CONFIG_UCLAMP_TASK + int uclamp_enabled; +#endif + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); From 60daf9c19410604f08c99e146bc378c8a64f4ccd Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:03 +0100 Subject: [PATCH 24/33] sched/uclamp: Add bucket local max tracking Because of bucketization, different task-specific clamp values are tracked in the same bucket. For example, with 20% bucket size and assuming to have: Task1: util_min=25% Task2: util_min=35% both tasks will be refcounted in the [20..39]% bucket and always boosted only up to 20% thus implementing a simple floor aggregation normally used in histograms. In systems with only few and well-defined clamp values, it would be useful to track the exact clamp value required by a task whenever possible. For example, if a system requires only 23% and 47% boost values then it's possible to track the exact boost required by each task using only 3 buckets of ~33% size each. Introduce a mechanism to max aggregate the requested clamp values of RUNNABLE tasks in the same bucket. Keep it simple by resetting the bucket value to its base value only when a bucket becomes inactive. Allow a limited and controlled overboosting margin for tasks recounted in the same bucket. In systems where the boost values are not known in advance, it is still possible to control the maximum acceptable overboosting margin by tuning the number of clamp groups. For example, 20 groups ensure a 5% maximum overboost. Remove the rq bucket initialization code since a correct bucket value is now computed when a task is refcounted into a CPU's rq. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-3-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8c1e67afd82..0a6eff8a278b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -785,6 +785,11 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) return clamp_value / UCLAMP_BUCKET_DELTA; } +static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) +{ + return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); +} + static inline unsigned int uclamp_none(int clamp_id) { if (clamp_id == UCLAMP_MIN) @@ -822,6 +827,11 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id) * When a task is enqueued on a rq, the clamp bucket currently defined by the * task's uclamp::bucket_id is refcounted on that rq. This also immediately * updates the rq's clamp value if required. + * + * Tasks can have a task-specific value requested from user-space, track + * within each bucket the maximum value for tasks refcounted in it. + * This "local max aggregation" allows to track the exact "requested" value + * for each bucket when all its RUNNABLE tasks require the same clamp. */ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, unsigned int clamp_id) @@ -835,8 +845,15 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, bucket = &uc_rq->bucket[uc_se->bucket_id]; bucket->tasks++; + /* + * Local max aggregation: rq buckets always track the max + * "requested" clamp value of its RUNNABLE tasks. + */ + if (bucket->tasks == 1 || uc_se->value > bucket->value) + bucket->value = uc_se->value; + if (uc_se->value > READ_ONCE(uc_rq->value)) - WRITE_ONCE(uc_rq->value, bucket->value); + WRITE_ONCE(uc_rq->value, uc_se->value); } /* @@ -863,6 +880,12 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, if (likely(bucket->tasks)) bucket->tasks--; + /* + * Keep "local max aggregation" simple and accept to (possibly) + * overboost some RUNNABLE tasks in the same bucket. + * The rq clamp bucket value is reset to its base value whenever + * there are no more RUNNABLE tasks refcounting it. + */ if (likely(bucket->tasks)) return; @@ -903,25 +926,9 @@ static void __init init_uclamp(void) unsigned int clamp_id; int cpu; - for_each_possible_cpu(cpu) { - struct uclamp_bucket *bucket; - struct uclamp_rq *uc_rq; - unsigned int bucket_id; - + for_each_possible_cpu(cpu) memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); - for_each_clamp_id(clamp_id) { - uc_rq = &cpu_rq(cpu)->uclamp[clamp_id]; - - bucket_id = 1; - while (bucket_id < UCLAMP_BUCKETS) { - bucket = &uc_rq->bucket[bucket_id]; - bucket->value = bucket_id * UCLAMP_BUCKET_DELTA; - ++bucket_id; - } - } - } - for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp[clamp_id], uclamp_none(clamp_id)); From e496187da71070687b55ff455e7d8d7d7f0ae0b9 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:04 +0100 Subject: [PATCH 25/33] sched/uclamp: Enforce last task's UCLAMP_MAX When a task sleeps it removes its max utilization clamp from its CPU. However, the blocked utilization on that CPU can be higher than the max clamp value enforced while the task was running. This allows undesired CPU frequency increases while a CPU is idle, for example, when another CPU on the same frequency domain triggers a frequency update, since schedutil can now see the full not clamped blocked utilization of the idle CPU. Fix this by using: uclamp_rq_dec_id(p, rq, UCLAMP_MAX) uclamp_rq_max_value(rq, UCLAMP_MAX, clamp_value) to detect when a CPU has no more RUNNABLE clamped tasks and to flag this condition. Don't track any minimum utilization clamps since an idle CPU never requires a minimum frequency. The decay of the blocked utilization is good enough to reduce the CPU frequency. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-4-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 49 +++++++++++++++++++++++++++++++++++++++----- kernel/sched/sched.h | 2 ++ 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0a6eff8a278b..2dde735635ec 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -803,8 +803,36 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value) uc_se->bucket_id = uclamp_bucket_id(value); } +static inline unsigned int +uclamp_idle_value(struct rq *rq, unsigned int clamp_id, + unsigned int clamp_value) +{ + /* + * Avoid blocked utilization pushing up the frequency when we go + * idle (which drops the max-clamp) by retaining the last known + * max-clamp. + */ + if (clamp_id == UCLAMP_MAX) { + rq->uclamp_flags |= UCLAMP_FLAG_IDLE; + return clamp_value; + } + + return uclamp_none(UCLAMP_MIN); +} + +static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, + unsigned int clamp_value) +{ + /* Reset max-clamp retention only on idle exit */ + if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) + return; + + WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value); +} + static inline -unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id) +unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, + unsigned int clamp_value) { struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; int bucket_id = UCLAMP_BUCKETS - 1; @@ -820,7 +848,7 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id) } /* No tasks -- default clamp values */ - return uclamp_none(clamp_id); + return uclamp_idle_value(rq, clamp_id, clamp_value); } /* @@ -845,6 +873,8 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, bucket = &uc_rq->bucket[uc_se->bucket_id]; bucket->tasks++; + uclamp_idle_reset(rq, clamp_id, uc_se->value); + /* * Local max aggregation: rq buckets always track the max * "requested" clamp value of its RUNNABLE tasks. @@ -871,6 +901,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; struct uclamp_se *uc_se = &p->uclamp[clamp_id]; struct uclamp_bucket *bucket; + unsigned int bkt_clamp; unsigned int rq_clamp; lockdep_assert_held(&rq->lock); @@ -895,8 +926,10 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, * e.g. due to future modification, warn and fixup the expected value. */ SCHED_WARN_ON(bucket->value > rq_clamp); - if (bucket->value >= rq_clamp) - WRITE_ONCE(uc_rq->value, uclamp_rq_max_value(rq, clamp_id)); + if (bucket->value >= rq_clamp) { + bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); + WRITE_ONCE(uc_rq->value, bkt_clamp); + } } static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) @@ -908,6 +941,10 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) for_each_clamp_id(clamp_id) uclamp_rq_inc_id(rq, p, clamp_id); + + /* Reset clamp idle holding when there is one RUNNABLE task */ + if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) + rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) @@ -926,8 +963,10 @@ static void __init init_uclamp(void) unsigned int clamp_id; int cpu; - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); + cpu_rq(cpu)->uclamp_flags = 0; + } for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp[clamp_id], diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cecc6baaba93..0d2ba8bb2cb3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -870,6 +870,8 @@ struct rq { #ifdef CONFIG_UCLAMP_TASK /* Utilization clamp values based on CPU's RUNNABLE tasks */ struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; + unsigned int uclamp_flags; +#define UCLAMP_FLAG_IDLE 0x01 #endif struct cfs_rq cfs; From e8f14172c6b11e9a86c65532497087f8eb0f91b1 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:05 +0100 Subject: [PATCH 26/33] sched/uclamp: Add system default clamps Tasks without a user-defined clamp value are considered not clamped and by default their utilization can have any value in the [0..SCHED_CAPACITY_SCALE] range. Tasks with a user-defined clamp value are allowed to request any value in that range, and the required clamp is unconditionally enforced. However, a "System Management Software" could be interested in limiting the range of clamp values allowed for all tasks. Add a privileged interface to define a system default configuration via: /proc/sys/kernel/sched_uclamp_util_{min,max} which works as an unconditional clamp range restriction for all tasks. With the default configuration, the full SCHED_CAPACITY_SCALE range of values is allowed for each clamp index. Otherwise, the task-specific clamp is capped by the corresponding system default value. Do that by tracking, for each task, the "effective" clamp value and bucket the task has been refcounted in at enqueue time. This allows to lazy aggregate "requested" and "system default" values at enqueue time and simplifies refcounting updates at dequeue time. The cached bucket ids are used to avoid (relatively) more expensive integer divisions every time a task is enqueued. An active flag is used to report when the "effective" value is valid and thus the task is actually refcounted in the corresponding rq's bucket. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-5-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 ++++ include/linux/sched/sysctl.h | 11 ++++ kernel/sched/core.c | 99 +++++++++++++++++++++++++++++++++++- kernel/sysctl.c | 16 ++++++ 4 files changed, 135 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 80235bcd05f2..5485f411e8e1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -586,14 +586,21 @@ struct sched_dl_entity { * Utilization clamp for a scheduling entity * @value: clamp value "assigned" to a se * @bucket_id: bucket index corresponding to the "assigned" value + * @active: the se is currently refcounted in a rq's bucket * * The bucket_id is the index of the clamp bucket matching the clamp value * which is pre-computed and stored to avoid expensive integer divisions from * the fast path. + * + * The active bit is set whenever a task has got an "effective" value assigned, + * which can be different from the clamp value "requested" from user-space. + * This allows to know a task is refcounted in the rq's bucket corresponding + * to the "effective" bucket_id. */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); + unsigned int active : 1; }; #endif /* CONFIG_UCLAMP_TASK */ @@ -678,6 +685,9 @@ struct task_struct { struct sched_dl_entity dl; #ifdef CONFIG_UCLAMP_TASK + /* Clamp values requested for a scheduling entity */ + struct uclamp_se uclamp_req[UCLAMP_CNT]; + /* Effective clamp values used for a scheduling entity */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 99ce6d728df7..d4f6215ee03f 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -56,6 +56,11 @@ int sched_proc_update_handler(struct ctl_table *table, int write, extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; +#ifdef CONFIG_UCLAMP_TASK +extern unsigned int sysctl_sched_uclamp_util_min; +extern unsigned int sysctl_sched_uclamp_util_max; +#endif + #ifdef CONFIG_CFS_BANDWIDTH extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif @@ -75,6 +80,12 @@ extern int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_UCLAMP_TASK +extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + extern int sysctl_numa_balancing(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2dde735635ec..b74de86b68c7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -773,6 +773,14 @@ static void set_load_weight(struct task_struct *p, bool update_load) } #ifdef CONFIG_UCLAMP_TASK +/* Max allowed minimum utilization */ +unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; + +/* Max allowed maximum utilization */ +unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; + +/* All clamps are required to be less or equal than these values */ +static struct uclamp_se uclamp_default[UCLAMP_CNT]; /* Integer rounded range for each bucket */ #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) @@ -851,6 +859,25 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, return uclamp_idle_value(rq, clamp_id, clamp_value); } +/* + * The effective clamp bucket index of a task depends on, by increasing + * priority: + * - the task specific clamp value, when explicitly requested from userspace + * - the system default clamp value, defined by the sysadmin + */ +static inline struct uclamp_se +uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) +{ + struct uclamp_se uc_req = p->uclamp_req[clamp_id]; + struct uclamp_se uc_max = uclamp_default[clamp_id]; + + /* System default restrictions always apply */ + if (unlikely(uc_req.value > uc_max.value)) + return uc_max; + + return uc_req; +} + /* * When a task is enqueued on a rq, the clamp bucket currently defined by the * task's uclamp::bucket_id is refcounted on that rq. This also immediately @@ -870,8 +897,12 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, lockdep_assert_held(&rq->lock); + /* Update task effective clamp */ + p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id); + bucket = &uc_rq->bucket[uc_se->bucket_id]; bucket->tasks++; + uc_se->active = true; uclamp_idle_reset(rq, clamp_id, uc_se->value); @@ -910,6 +941,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, SCHED_WARN_ON(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; + uc_se->active = false; /* * Keep "local max aggregation" simple and accept to (possibly) @@ -958,8 +990,65 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_min, old_max; + static DEFINE_MUTEX(mutex); + int result; + + mutex_lock(&mutex); + old_min = sysctl_sched_uclamp_util_min; + old_max = sysctl_sched_uclamp_util_max; + + result = proc_dointvec(table, write, buffer, lenp, ppos); + if (result) + goto undo; + if (!write) + goto done; + + if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || + sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { + result = -EINVAL; + goto undo; + } + + if (old_min != sysctl_sched_uclamp_util_min) { + uclamp_se_set(&uclamp_default[UCLAMP_MIN], + sysctl_sched_uclamp_util_min); + } + if (old_max != sysctl_sched_uclamp_util_max) { + uclamp_se_set(&uclamp_default[UCLAMP_MAX], + sysctl_sched_uclamp_util_max); + } + + /* + * Updating all the RUNNABLE task is expensive, keep it simple and do + * just a lazy update at each next enqueue time. + */ + goto done; + +undo: + sysctl_sched_uclamp_util_min = old_min; + sysctl_sched_uclamp_util_max = old_max; +done: + mutex_unlock(&mutex); + + return result; +} + +static void uclamp_fork(struct task_struct *p) +{ + unsigned int clamp_id; + + for_each_clamp_id(clamp_id) + p->uclamp[clamp_id].active = false; +} + static void __init init_uclamp(void) { + struct uclamp_se uc_max = {}; unsigned int clamp_id; int cpu; @@ -969,14 +1058,20 @@ static void __init init_uclamp(void) } for_each_clamp_id(clamp_id) { - uclamp_se_set(&init_task.uclamp[clamp_id], + uclamp_se_set(&init_task.uclamp_req[clamp_id], uclamp_none(clamp_id)); } + + /* System defaults allow max clamp values for both indexes */ + uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX)); + for_each_clamp_id(clamp_id) + uclamp_default[clamp_id] = uc_max; } #else /* CONFIG_UCLAMP_TASK */ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } +static inline void uclamp_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ @@ -2545,6 +2640,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) */ p->prio = current->normal_prio; + uclamp_fork(p); + /* * Revert to default priority/policy on fork if requested. */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1beca96fb625..1c1ad1e14f21 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -452,6 +452,22 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rr_handler, }, +#ifdef CONFIG_UCLAMP_TASK + { + .procname = "sched_util_clamp_min", + .data = &sysctl_sched_uclamp_util_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_max", + .data = &sysctl_sched_uclamp_util_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, +#endif #ifdef CONFIG_SCHED_AUTOGROUP { .procname = "sched_autogroup_enabled", From 1d6362fa0cfc8c7b243fa92924429d826599e691 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:06 +0100 Subject: [PATCH 27/33] sched/core: Allow sched_setattr() to use the current policy The sched_setattr() syscall mandates that a policy is always specified. This requires to always know which policy a task will have when attributes are configured and this makes it impossible to add more generic task attributes valid across different scheduling policies. Reading the policy before setting generic tasks attributes is racy since we cannot be sure it is not changed concurrently. Introduce the required support to change generic task attributes without affecting the current task policy. This is done by adding an attribute flag (SCHED_FLAG_KEEP_POLICY) to enforce the usage of the current policy. Add support for the SETPARAM_POLICY policy, which is already used by the sched_setparam() POSIX syscall, to the sched_setattr() non-POSIX syscall. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-6-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- include/uapi/linux/sched.h | 4 +++- kernel/sched/core.c | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index ed4ee170bee2..58b2368d3634 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -51,9 +51,11 @@ #define SCHED_FLAG_RESET_ON_FORK 0x01 #define SCHED_FLAG_RECLAIM 0x02 #define SCHED_FLAG_DL_OVERRUN 0x04 +#define SCHED_FLAG_KEEP_POLICY 0x08 #define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \ SCHED_FLAG_RECLAIM | \ - SCHED_FLAG_DL_OVERRUN) + SCHED_FLAG_DL_OVERRUN | \ + SCHED_FLAG_KEEP_POLICY) #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b74de86b68c7..6d519f3f9789 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4897,6 +4897,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if ((int)attr.sched_policy < 0) return -EINVAL; + if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) + attr.sched_policy = SETPARAM_POLICY; rcu_read_lock(); retval = -ESRCH; From a509a7cd79747074a2c018a45bbbc52d1f4aed44 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:07 +0100 Subject: [PATCH 28/33] sched/uclamp: Extend sched_setattr() to support utilization clamping The SCHED_DEADLINE scheduling class provides an advanced and formal model to define tasks requirements that can translate into proper decisions for both task placements and frequencies selections. Other classes have a more simplified model based on the POSIX concept of priorities. Such a simple priority based model however does not allow to exploit most advanced features of the Linux scheduler like, for example, driving frequencies selection via the schedutil cpufreq governor. However, also for non SCHED_DEADLINE tasks, it's still interesting to define tasks properties to support scheduler decisions. Utilization clamping exposes to user-space a new set of per-task attributes the scheduler can use as hints about the expected/required utilization for a task. This allows to implement a "proactive" per-task frequency control policy, a more advanced policy than the current one based just on "passive" measured task utilization. For example, it's possible to boost interactive tasks (e.g. to get better performance) or cap background tasks (e.g. to be more energy/thermal efficient). Introduce a new API to set utilization clamping values for a specified task by extending sched_setattr(), a syscall which already allows to define task specific properties for different scheduling classes. A new pair of attributes allows to specify a minimum and maximum utilization the scheduler can consider for a task. Do that by validating the required clamp values before and then applying the required changes using _the_ same pattern already in use for __setscheduler(). This ensures that the task is re-enqueued with the new clamp values. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-7-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 ++++ include/uapi/linux/sched.h | 12 ++++- include/uapi/linux/sched/types.h | 66 +++++++++++++++++++---- kernel/sched/core.c | 91 +++++++++++++++++++++++++++++--- 4 files changed, 161 insertions(+), 17 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5485f411e8e1..1113dd4706ae 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -587,6 +587,7 @@ struct sched_dl_entity { * @value: clamp value "assigned" to a se * @bucket_id: bucket index corresponding to the "assigned" value * @active: the se is currently refcounted in a rq's bucket + * @user_defined: the requested clamp value comes from user-space * * The bucket_id is the index of the clamp bucket matching the clamp value * which is pre-computed and stored to avoid expensive integer divisions from @@ -596,11 +597,19 @@ struct sched_dl_entity { * which can be different from the clamp value "requested" from user-space. * This allows to know a task is refcounted in the rq's bucket corresponding * to the "effective" bucket_id. + * + * The user_defined bit is set whenever a task has got a task-specific clamp + * value requested from userspace, i.e. the system defaults apply to this task + * just as a restriction. This allows to relax default clamps when a less + * restrictive task-specific value has been requested, thus allowing to + * implement a "nice" semantic. For example, a task running with a 20% + * default boost can still drop its own boosting to 0%. */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); unsigned int active : 1; + unsigned int user_defined : 1; }; #endif /* CONFIG_UCLAMP_TASK */ diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 58b2368d3634..617bb59aa8ba 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -52,10 +52,20 @@ #define SCHED_FLAG_RECLAIM 0x02 #define SCHED_FLAG_DL_OVERRUN 0x04 #define SCHED_FLAG_KEEP_POLICY 0x08 +#define SCHED_FLAG_KEEP_PARAMS 0x10 +#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 +#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 + +#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ + SCHED_FLAG_KEEP_PARAMS) + +#define SCHED_FLAG_UTIL_CLAMP (SCHED_FLAG_UTIL_CLAMP_MIN | \ + SCHED_FLAG_UTIL_CLAMP_MAX) #define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \ SCHED_FLAG_RECLAIM | \ SCHED_FLAG_DL_OVERRUN | \ - SCHED_FLAG_KEEP_POLICY) + SCHED_FLAG_KEEP_ALL | \ + SCHED_FLAG_UTIL_CLAMP) #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h index 10fbb8031930..c852153ddb0d 100644 --- a/include/uapi/linux/sched/types.h +++ b/include/uapi/linux/sched/types.h @@ -9,6 +9,7 @@ struct sched_param { }; #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ +#define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */ /* * Extended scheduling parameters data structure. @@ -21,8 +22,33 @@ struct sched_param { * the tasks may be useful for a wide variety of application fields, e.g., * multimedia, streaming, automation and control, and many others. * - * This variant (sched_attr) is meant at describing a so-called - * sporadic time-constrained task. In such model a task is specified by: + * This variant (sched_attr) allows to define additional attributes to + * improve the scheduler knowledge about task requirements. + * + * Scheduling Class Attributes + * =========================== + * + * A subset of sched_attr attributes specifies the + * scheduling policy and relative POSIX attributes: + * + * @size size of the structure, for fwd/bwd compat. + * + * @sched_policy task's scheduling policy + * @sched_nice task's nice value (SCHED_NORMAL/BATCH) + * @sched_priority task's static priority (SCHED_FIFO/RR) + * + * Certain more advanced scheduling features can be controlled by a + * predefined set of flags via the attribute: + * + * @sched_flags for customizing the scheduler behaviour + * + * Sporadic Time-Constrained Task Attributes + * ========================================= + * + * A subset of sched_attr attributes allows to describe a so-called + * sporadic time-constrained task. + * + * In such a model a task is specified by: * - the activation period or minimum instance inter-arrival time; * - the maximum (or average, depending on the actual scheduling * discipline) computation time of all instances, a.k.a. runtime; @@ -34,14 +60,8 @@ struct sched_param { * than the runtime and must be completed by time instant t equal to * the instance activation time + the deadline. * - * This is reflected by the actual fields of the sched_attr structure: + * This is reflected by the following fields of the sched_attr structure: * - * @size size of the structure, for fwd/bwd compat. - * - * @sched_policy task's scheduling policy - * @sched_flags for customizing the scheduler behaviour - * @sched_nice task's nice value (SCHED_NORMAL/BATCH) - * @sched_priority task's static priority (SCHED_FIFO/RR) * @sched_deadline representative of the task's deadline * @sched_runtime representative of the task's runtime * @sched_period representative of the task's period @@ -53,6 +73,29 @@ struct sched_param { * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the * only user of this new interface. More information about the algorithm * available in the scheduling class file or in Documentation/. + * + * Task Utilization Attributes + * =========================== + * + * A subset of sched_attr attributes allows to specify the utilization + * expected for a task. These attributes allow to inform the scheduler about + * the utilization boundaries within which it should schedule the task. These + * boundaries are valuable hints to support scheduler decisions on both task + * placement and frequency selection. + * + * @sched_util_min represents the minimum utilization + * @sched_util_max represents the maximum utilization + * + * Utilization is a value in the range [0..SCHED_CAPACITY_SCALE]. It + * represents the percentage of CPU time used by a task when running at the + * maximum frequency on the highest capacity CPU of the system. For example, a + * 20% utilization task is a task running for 2ms every 10ms at maximum + * frequency. + * + * A task with a min utilization value bigger than 0 is more likely scheduled + * on a CPU with a capacity big enough to fit the specified value. + * A task with a max utilization value smaller than 1024 is more likely + * scheduled on a CPU with no more capacity than the specified value. */ struct sched_attr { __u32 size; @@ -70,6 +113,11 @@ struct sched_attr { __u64 sched_runtime; __u64 sched_deadline; __u64 sched_period; + + /* Utilization hints */ + __u32 sched_util_min; + __u32 sched_util_max; + }; #endif /* _UAPI_LINUX_SCHED_TYPES_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6d519f3f9789..e9a669266fa9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -805,10 +805,12 @@ static inline unsigned int uclamp_none(int clamp_id) return SCHED_CAPACITY_SCALE; } -static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value) +static inline void uclamp_se_set(struct uclamp_se *uc_se, + unsigned int value, bool user_defined) { uc_se->value = value; uc_se->bucket_id = uclamp_bucket_id(value); + uc_se->user_defined = user_defined; } static inline unsigned int @@ -1016,11 +1018,11 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, if (old_min != sysctl_sched_uclamp_util_min) { uclamp_se_set(&uclamp_default[UCLAMP_MIN], - sysctl_sched_uclamp_util_min); + sysctl_sched_uclamp_util_min, false); } if (old_max != sysctl_sched_uclamp_util_max) { uclamp_se_set(&uclamp_default[UCLAMP_MAX], - sysctl_sched_uclamp_util_max); + sysctl_sched_uclamp_util_max, false); } /* @@ -1038,6 +1040,42 @@ done: return result; } +static int uclamp_validate(struct task_struct *p, + const struct sched_attr *attr) +{ + unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value; + unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value; + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) + lower_bound = attr->sched_util_min; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) + upper_bound = attr->sched_util_max; + + if (lower_bound > upper_bound) + return -EINVAL; + if (upper_bound > SCHED_CAPACITY_SCALE) + return -EINVAL; + + return 0; +} + +static void __setscheduler_uclamp(struct task_struct *p, + const struct sched_attr *attr) +{ + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) + return; + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], + attr->sched_util_min, true); + } + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], + attr->sched_util_max, true); + } +} + static void uclamp_fork(struct task_struct *p) { unsigned int clamp_id; @@ -1059,11 +1097,11 @@ static void __init init_uclamp(void) for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id], - uclamp_none(clamp_id)); + uclamp_none(clamp_id), false); } /* System defaults allow max clamp values for both indexes */ - uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX)); + uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false); for_each_clamp_id(clamp_id) uclamp_default[clamp_id] = uc_max; } @@ -1071,6 +1109,13 @@ static void __init init_uclamp(void) #else /* CONFIG_UCLAMP_TASK */ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } +static inline int uclamp_validate(struct task_struct *p, + const struct sched_attr *attr) +{ + return -EOPNOTSUPP; +} +static void __setscheduler_uclamp(struct task_struct *p, + const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ @@ -4412,6 +4457,13 @@ static void __setscheduler_params(struct task_struct *p, static void __setscheduler(struct rq *rq, struct task_struct *p, const struct sched_attr *attr, bool keep_boost) { + /* + * If params can't change scheduling class changes aren't allowed + * either. + */ + if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) + return; + __setscheduler_params(p, attr); /* @@ -4549,6 +4601,13 @@ recheck: return retval; } + /* Update task specific "requested" clamps */ + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { + retval = uclamp_validate(p, attr); + if (retval) + return retval; + } + /* * Make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: @@ -4578,6 +4637,8 @@ recheck: goto change; if (dl_policy(policy) && dl_param_changed(p, attr)) goto change; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) + goto change; p->sched_reset_on_fork = reset_on_fork; task_rq_unlock(rq, p, &rf); @@ -4658,7 +4719,9 @@ change: put_prev_task(rq, p); prev_class = p->sched_class; + __setscheduler(rq, p, attr, pi); + __setscheduler_uclamp(p, attr); if (queued) { /* @@ -4834,6 +4897,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a if (ret) return -EFAULT; + if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && + size < SCHED_ATTR_SIZE_VER1) + return -EINVAL; + /* * XXX: Do we want to be lenient like existing syscalls; or do we want * to be strict and return an error on out-of-bounds values? @@ -4903,10 +4970,15 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setattr(p, &attr); + if (likely(p)) + get_task_struct(p); rcu_read_unlock(); + if (likely(p)) { + retval = sched_setattr(p, &attr); + put_task_struct(p); + } + return retval; } @@ -5057,6 +5129,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, else attr.sched_nice = task_nice(p); +#ifdef CONFIG_UCLAMP_TASK + attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; + attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; +#endif + rcu_read_unlock(); retval = sched_read_attr(uattr, &attr, size); From a87498ace58e23b62a572dc7267579ede4c8495c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:08 +0100 Subject: [PATCH 29/33] sched/uclamp: Reset uclamp values on RESET_ON_FORK A forked tasks gets the same clamp values of its parent however, when the RESET_ON_FORK flag is set on parent, e.g. via: sys_sched_setattr() sched_setattr() __sched_setscheduler(attr::SCHED_FLAG_RESET_ON_FORK) the new forked task is expected to start with all attributes reset to default values. Do that for utilization clamp values too by checking the reset request from the existing uclamp_fork() call which already provides the required initialization for other uclamp related bits. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-8-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e9a669266fa9..ecc304ab906f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1082,6 +1082,14 @@ static void uclamp_fork(struct task_struct *p) for_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false; + + if (likely(!p->sched_reset_on_fork)) + return; + + for_each_clamp_id(clamp_id) { + uclamp_se_set(&p->uclamp_req[clamp_id], + uclamp_none(clamp_id), false); + } } static void __init init_uclamp(void) From 1a00d999971c78ab024a17b0efc37d78404dd120 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:09 +0100 Subject: [PATCH 30/33] sched/uclamp: Set default clamps for RT tasks By default FAIR tasks start without clamps, i.e. neither boosted nor capped, and they run at the best frequency matching their utilization demand. This default behavior does not fit RT tasks which instead are expected to run at the maximum available frequency, if not otherwise required by explicitly capping them. Enforce the correct behavior for RT tasks by setting util_min to max whenever: 1. the task is switched to the RT class and it does not already have a user-defined clamp value assigned. 2. an RT task is forked from a parent with RESET_ON_FORK set. NOTE: utilization clamp values are cross scheduling class attributes and thus they are never changed/reset once a value has been explicitly defined from user-space. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-9-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ecc304ab906f..6cd5133f0c2a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1062,6 +1062,27 @@ static int uclamp_validate(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { + unsigned int clamp_id; + + /* + * On scheduling class change, reset to default clamps for tasks + * without a task-specific value. + */ + for_each_clamp_id(clamp_id) { + struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; + unsigned int clamp_value = uclamp_none(clamp_id); + + /* Keep using defined clamps across class changes */ + if (uc_se->user_defined) + continue; + + /* By default, RT tasks always get 100% boost */ + if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) + clamp_value = uclamp_none(UCLAMP_MAX); + + uclamp_se_set(uc_se, clamp_value, false); + } + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) return; @@ -1087,8 +1108,13 @@ static void uclamp_fork(struct task_struct *p) return; for_each_clamp_id(clamp_id) { - uclamp_se_set(&p->uclamp_req[clamp_id], - uclamp_none(clamp_id), false); + unsigned int clamp_value = uclamp_none(clamp_id); + + /* By default, RT tasks always get 100% boost */ + if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) + clamp_value = uclamp_none(UCLAMP_MAX); + + uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false); } } From 982d9cdc22c9f6df5ad790caa229ff74fb1d95e7 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:10 +0100 Subject: [PATCH 31/33] sched/cpufreq, sched/uclamp: Add clamps for FAIR and RT tasks Each time a frequency update is required via schedutil, a frequency is selected to (possibly) satisfy the utilization reported by each scheduling class and irqs. However, when utilization clamping is in use, the frequency selection should consider userspace utilization clamping hints. This will allow, for example, to: - boost tasks which are directly affecting the user experience by running them at least at a minimum "requested" frequency - cap low priority tasks not directly affecting the user experience by running them only up to a maximum "allowed" frequency These constraints are meant to support a per-task based tuning of the frequency selection thus supporting a fine grained definition of performance boosting vs energy saving strategies in kernel space. Add support to clamp the utilization of RUNNABLE FAIR and RT tasks within the boundaries defined by their aggregated utilization clamp constraints. Do that by considering the max(min_util, max_util) to give boosted tasks the performance they need even when they happen to be co-scheduled with other capped tasks. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-10-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 15 ++++++++++++--- kernel/sched/fair.c | 4 ++++ kernel/sched/rt.c | 4 ++++ kernel/sched/sched.h | 23 +++++++++++++++++++++++ 4 files changed, 43 insertions(+), 3 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7c4ce69067c4..d84e036a7536 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -202,8 +202,10 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, unsigned long dl_util, util, irq; struct rq *rq = cpu_rq(cpu); - if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) + if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) && + type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { return max; + } /* * Early check to see if IRQ/steal time saturates the CPU, can be @@ -219,9 +221,16 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, * CFS tasks and we use the same metric to track the effective * utilization (PELT windows are synchronized) we can directly add them * to obtain the CPU's actual utilization. + * + * CFS and RT utilization can be boosted or capped, depending on + * utilization clamp constraints requested by currently RUNNABLE + * tasks. + * When there are no CFS RUNNABLE tasks, clamps are released and + * frequency will be gracefully reduced with the utilization decay. */ - util = util_cfs; - util += cpu_util_rt(rq); + util = util_cfs + cpu_util_rt(rq); + if (type == FREQUENCY_UTIL) + util = uclamp_util(rq, util); dl_util = cpu_util_dl(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3bdcd3c718bc..28db7ce5c3a6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10393,6 +10393,10 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_FAIR_GROUP_SCHED .task_change_group = task_change_group_fair, #endif + +#ifdef CONFIG_UCLAMP_TASK + .uclamp_enabled = 1, +#endif }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 63ad7c90822c..a532558a5176 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2400,6 +2400,10 @@ const struct sched_class rt_sched_class = { .switched_to = switched_to_rt, .update_curr = update_curr_rt, + +#ifdef CONFIG_UCLAMP_TASK + .uclamp_enabled = 1, +#endif }; #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0d2ba8bb2cb3..9b0c77a99346 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2265,6 +2265,29 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ +#ifdef CONFIG_UCLAMP_TASK +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +{ + unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); + unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + + /* + * Since CPU's {min,max}_util clamps are MAX aggregated considering + * RUNNABLE tasks with _different_ clamps, we can end up with an + * inversion. Fix it now when the clamps are applied. + */ + if (unlikely(min_util >= max_util)) + return min_util; + + return clamp(util, min_util, max_util); +} +#else /* CONFIG_UCLAMP_TASK */ +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +{ + return util; +} +#endif /* CONFIG_UCLAMP_TASK */ + #ifdef arch_scale_freq_capacity # ifndef arch_scale_freq_invariant # define arch_scale_freq_invariant() true From 9d20ad7dfc9a5cc64e33d725902d3863d350a66a Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:11 +0100 Subject: [PATCH 32/33] sched/uclamp: Add uclamp_util_with() So far uclamp_util() allows to clamp a specified utilization considering the clamp values requested by RUNNABLE tasks in a CPU. For the Energy Aware Scheduler (EAS) it is interesting to test how clamp values will change when a task is becoming RUNNABLE on a given CPU. For example, EAS is interested in comparing the energy impact of different scheduling decisions and the clamp values can play a role on that. Add uclamp_util_with() which allows to clamp a given utilization by considering the possible impact on CPU clamp values of a specified task. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-11-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 13 +++++++++++++ kernel/sched/sched.h | 21 ++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6cd5133f0c2a..fa43ce3962e7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -880,6 +880,19 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) return uc_req; } +unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) +{ + struct uclamp_se uc_eff; + + /* Task currently refcounted: use back-annotated (effective) value */ + if (p->uclamp[clamp_id].active) + return p->uclamp[clamp_id].value; + + uc_eff = uclamp_eff_get(p, clamp_id); + + return uc_eff.value; +} + /* * When a task is enqueued on a rq, the clamp bucket currently defined by the * task's uclamp::bucket_id is refcounted on that rq. This also immediately diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9b0c77a99346..1783f6b4c2e0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2266,11 +2266,20 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef CONFIG_UCLAMP_TASK -static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id); + +static __always_inline +unsigned int uclamp_util_with(struct rq *rq, unsigned int util, + struct task_struct *p) { unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + if (p) { + min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); + max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX)); + } + /* * Since CPU's {min,max}_util clamps are MAX aggregated considering * RUNNABLE tasks with _different_ clamps, we can end up with an @@ -2281,7 +2290,17 @@ static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) return clamp(util, min_util, max_util); } + +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +{ + return uclamp_util_with(rq, util, NULL); +} #else /* CONFIG_UCLAMP_TASK */ +static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, + struct task_struct *p) +{ + return util; +} static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) { return util; From af24bde8df2029f067dc46aff0393c8f18ff6e2f Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:12 +0100 Subject: [PATCH 33/33] sched/uclamp: Add uclamp support to energy_compute() The Energy Aware Scheduler (EAS) estimates the energy impact of waking up a task on a given CPU. This estimation is based on: a) an (active) power consumption defined for each CPU frequency b) an estimation of which frequency will be used on each CPU c) an estimation of the busy time (utilization) of each CPU Utilization clamping can affect both b) and c). A CPU is expected to run: - on an higher than required frequency, but for a shorter time, in case its estimated utilization will be smaller than the minimum utilization enforced by uclamp - on a smaller than required frequency, but for a longer time, in case its estimated utilization is bigger than the maximum utilization enforced by uclamp While compute_energy() already accounts clamping effects on busy time, the clamping effects on frequency selection are currently ignored. Fix it by considering how CPU clamp values will be affected by a task waking up and being RUNNABLE on that CPU. Do that by refactoring schedutil_freq_util() to take an additional task_struct* which allows EAS to evaluate the impact on clamp values of a task being eventually queued in a CPU. Clamp values are applied to the RT+CFS utilization only when a FREQUENCY_UTIL is required by compute_energy(). Do note that switching from ENERGY_UTIL to FREQUENCY_UTIL in the computation of the cpu_util signal implies that we are more likely to estimate the highest OPP when a RT task is running in another CPU of the same performance domain. This can have an impact on energy estimation but: - it's not easy to say which approach is better, since it depends on the use case - the original approach could still be obtained by setting a smaller task-specific util_min whenever required Since we are at that: - rename schedutil_freq_util() into schedutil_cpu_util(), since it's not only used for frequency selection. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-12-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 9 +++---- kernel/sched/fair.c | 40 +++++++++++++++++++++++++++----- kernel/sched/sched.h | 21 +++++++---------- 3 files changed, 48 insertions(+), 22 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index d84e036a7536..636ca6f88c8e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -196,8 +196,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, * based on the task model parameters and gives the minimal utilization * required to meet deadlines. */ -unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type) +unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p) { unsigned long dl_util, util, irq; struct rq *rq = cpu_rq(cpu); @@ -230,7 +231,7 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, */ util = util_cfs + cpu_util_rt(rq); if (type == FREQUENCY_UTIL) - util = uclamp_util(rq, util); + util = uclamp_util_with(rq, util, p); dl_util = cpu_util_dl(rq); @@ -290,7 +291,7 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); - return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL); + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); } /** diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 28db7ce5c3a6..b798fe7ff7cd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6231,11 +6231,21 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) static long compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) { - long util, max_util, sum_util, energy = 0; + unsigned int max_util, util_cfs, cpu_util, cpu_cap; + unsigned long sum_util, energy = 0; + struct task_struct *tsk; int cpu; for (; pd; pd = pd->next) { + struct cpumask *pd_mask = perf_domain_span(pd); + + /* + * The energy model mandates all the CPUs of a performance + * domain have the same capacity. + */ + cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); max_util = sum_util = 0; + /* * The capacity state of CPUs of the current rd can be driven by * CPUs of another rd if they belong to the same performance @@ -6246,11 +6256,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * it will not appear in its pd list and will not be accounted * by compute_energy(). */ - for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) { - util = cpu_util_next(cpu, p, dst_cpu); - util = schedutil_energy_util(cpu, util); - max_util = max(util, max_util); - sum_util += util; + for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { + util_cfs = cpu_util_next(cpu, p, dst_cpu); + + /* + * Busy time computation: utilization clamping is not + * required since the ratio (sum_util / cpu_capacity) + * is already enough to scale the EM reported power + * consumption at the (eventually clamped) cpu_capacity. + */ + sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, + ENERGY_UTIL, NULL); + + /* + * Performance domain frequency: utilization clamping + * must be considered since it affects the selection + * of the performance domain frequency. + * NOTE: in case RT tasks are running, by default the + * FREQUENCY_UTIL's utilization can be max OPP. + */ + tsk = cpu == dst_cpu ? p : NULL; + cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, + FREQUENCY_UTIL, tsk); + max_util = max(max_util, cpu_util); } energy += em_pd_energy(pd->em_pd, max_util, sum_util); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1783f6b4c2e0..802b1f3405f2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2322,7 +2322,6 @@ static inline unsigned long capacity_orig_of(int cpu) } #endif -#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL /** * enum schedutil_type - CPU utilization type * @FREQUENCY_UTIL: Utilization used to select frequency @@ -2338,15 +2337,11 @@ enum schedutil_type { ENERGY_UTIL, }; -unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type); +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL -static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) -{ - unsigned long max = arch_scale_cpu_capacity(cpu); - - return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); -} +unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p); static inline unsigned long cpu_bw_dl(struct rq *rq) { @@ -2375,11 +2370,13 @@ static inline unsigned long cpu_util_rt(struct rq *rq) return READ_ONCE(rq->avg_rt.util_avg); } #else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) +static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p) { - return cfs; + return 0; } -#endif +#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq)