From 05484e0984487d42e97c417cbb0697fa9d16e7e9 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 20 Jul 2018 14:32:31 +0100 Subject: [PATCH 01/28] sched/topology: Add SD_ASYM_CPUCAPACITY flag detection The SD_ASYM_CPUCAPACITY sched_domain flag is supposed to mark the sched_domain in the hierarchy where all CPU capacities are visible from any CPU's point of view on asymmetric CPU capacity systems. The scheduler can then take capacity asymmetry into account when balancing at this level. It also serves as an indicator for how wide task placement heuristics have to search to consider all available CPU capacities, as asymmetric systems might often appear symmetric at the smallest level(s) of the sched_domain hierarchy. The flag has been around for a while but has so far only been set by out-of-tree code in Android kernels. One solution is to let each architecture provide the flag through a custom sched_domain topology array and associated mask and flag functions. However, SD_ASYM_CPUCAPACITY is special in the sense that it depends on the capacity and presence of all CPUs in the system, i.e. when hotplugging all CPUs out except those with one particular CPU capacity the flag should disappear even if the sched_domains don't collapse. Similarly, the flag is affected by cpusets where load-balancing is turned off. Detecting when the flag should be set therefore depends not only on topology information but also on the cpuset configuration and hotplug state. The arch code doesn't have easy access to the cpuset configuration. Instead, this patch implements the flag detection in generic code where cpusets and hotplug state are already taken care of. All the arch is responsible for is implementing arch_scale_cpu_capacity() and forcing a full rebuild of the sched_domain hierarchy if capacities are updated, e.g. later in the boot process when cpufreq has initialized. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1532093554-30504-2-git-send-email-morten.rasmussen@arm.com [ Fixed 'CPU' capitalization.
] Signed-off-by: Ingo Molnar --- include/linux/sched/topology.h | 6 +-- kernel/sched/topology.c | 81 +++++++++++++++++++++++++++++++--- 2 files changed, 78 insertions(+), 9 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 26347741ba50..6b9976180c1e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -23,10 +23,10 @@ #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */ -#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */ +#define SD_ASYM_CPUCAPACITY 0x0040 /* Domain members have different CPU capacities */ +#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share CPU capacity */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ -#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ +#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share CPU pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 505a41c42b96..5c4d583d53ee 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1061,7 +1061,6 @@ static struct cpumask ***sched_domains_numa_masks; * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain - * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: @@ -1073,13 +1072,12 @@ static struct cpumask ***sched_domains_numa_masks; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ - SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, - struct sched_domain *child, int cpu) + struct sched_domain *child, int dflags, int cpu) { struct sd_data *sdd = &tl->data; struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); @@ -1100,6 +1098,9 @@ sd_init(struct sched_domain_topology_level *tl, "wrong sd_flags in topology description\n")) sd_flags &= ~TOPOLOGY_SD_FLAGS; + /* Apply detected topology flags */ + sd_flags |= dflags; + *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, @@ -1604,9 +1605,9 @@ static void __sdt_free(const struct cpumask *cpu_map) static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *child, int cpu) + struct sched_domain *child, int dflags, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); + struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu); if (child) { sd->level = child->level + 1; @@ -1632,6 +1633,65 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve return sd; } +/* + * Find the sched_domain_topology_level where all CPU capacities are visible + * for all CPUs. 
+ */ +static struct sched_domain_topology_level +*asym_cpu_capacity_level(const struct cpumask *cpu_map) +{ + int i, j, asym_level = 0; + bool asym = false; + struct sched_domain_topology_level *tl, *asym_tl = NULL; + unsigned long cap; + + /* Is there any asymmetry? */ + cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); + + for_each_cpu(i, cpu_map) { + if (arch_scale_cpu_capacity(NULL, i) != cap) { + asym = true; + break; + } + } + + if (!asym) + return NULL; + + /* + * Examine topology from all CPU's point of views to detect the lowest + * sched_domain_topology_level where a highest capacity CPU is visible + * to everyone. + */ + for_each_cpu(i, cpu_map) { + unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); + int tl_id = 0; + + for_each_sd_topology(tl) { + if (tl_id < asym_level) + goto next_level; + + for_each_cpu_and(j, tl->mask(i), cpu_map) { + unsigned long capacity; + + capacity = arch_scale_cpu_capacity(NULL, j); + + if (capacity <= max_capacity) + continue; + + max_capacity = capacity; + asym_level = tl_id; + asym_tl = tl; + } +next_level: + tl_id++; + } + } + + return asym_tl; +} + + /* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs @@ -1644,18 +1704,27 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct s_data d; struct rq *rq = NULL; int i, ret = -ENOMEM; + struct sched_domain_topology_level *tl_asym; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; + tl_asym = asym_cpu_capacity_level(cpu_map); + /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; sd = NULL; for_each_sd_topology(tl) { - sd = build_sched_domain(tl, cpu_map, attr, sd, i); + int dflags = 0; + + if (tl == tl_asym) + dflags |= SD_ASYM_CPUCAPACITY; + + sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); + if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP) From bb1fbdd3c3fd12b612c7d8cdf13bd6bfeebdefa3 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 20 Jul 2018 14:32:32 +0100 Subject: [PATCH 02/28] sched/topology, drivers/base/arch_topology: Rebuild the sched_domain hierarchy when capacities change The setting of SD_ASYM_CPUCAPACITY depends on the per-CPU capacities. These might not have their final values when the hierarchy is initially built as the values depend on cpufreq to be initialized or the values being set through sysfs. To ensure that the flags are set correctly we need to rebuild the sched_domain hierarchy whenever the reported per-CPU capacity (arch_scale_cpu_capacity()) changes. This patch ensure that a full sched_domain rebuild happens when CPU capacity changes occur. 
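For readers skimming the series, the mechanism this patch adds to drivers/base/arch_topology.c condenses to the sketch below. Identifiers match the diff that follows; cpu_capacity_changed() is a made-up stand-in for the sysfs/cpufreq paths that notice a capacity update, and error handling is omitted.

#include <linux/workqueue.h>
#include <linux/cpuset.h>

static int update_topology;

/* Exposed to the arch via its arch_update_cpu_topology() hook, so a
 * rebuild triggered from here is treated as a real topology change. */
int topology_update_cpu_topology(void)
{
	return update_topology;
}

/* Locking prevents calling rebuild_sched_domains() straight from the
 * cpufreq/sysfs paths that observe the capacity change, so the rebuild
 * is deferred to a work item. */
static void update_topology_flags_workfn(struct work_struct *work)
{
	update_topology = 1;
	rebuild_sched_domains();
	update_topology = 0;
}
static DECLARE_WORK(update_topology_flags_work, update_topology_flags_workfn);

/* Hypothetical caller: anything that notices a capacity change only
 * needs to queue the work. */
static void cpu_capacity_changed(void)
{
	schedule_work(&update_topology_flags_work);
}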
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1532093554-30504-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- drivers/base/arch_topology.c | 26 ++++++++++++++++++++++++++ include/linux/arch_topology.h | 1 + 2 files changed, 27 insertions(+) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index e7cb0c6ade81..edfcf8d982e4 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -15,6 +15,7 @@ #include #include #include +#include DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; @@ -47,6 +48,9 @@ static ssize_t cpu_capacity_show(struct device *dev, return sprintf(buf, "%lu\n", topology_get_cpu_scale(NULL, cpu->dev.id)); } +static void update_topology_flags_workfn(struct work_struct *work); +static DECLARE_WORK(update_topology_flags_work, update_topology_flags_workfn); + static ssize_t cpu_capacity_store(struct device *dev, struct device_attribute *attr, const char *buf, @@ -72,6 +76,8 @@ static ssize_t cpu_capacity_store(struct device *dev, topology_set_cpu_scale(i, new_capacity); mutex_unlock(&cpu_scale_mutex); + schedule_work(&update_topology_flags_work); + return count; } @@ -96,6 +102,25 @@ static int register_cpu_capacity_sysctl(void) } subsys_initcall(register_cpu_capacity_sysctl); +static int update_topology; + +int topology_update_cpu_topology(void) +{ + return update_topology; +} + +/* + * Updating the sched_domains can't be done directly from cpufreq callbacks + * due to locking, so queue the work for later. + */ +static void update_topology_flags_workfn(struct work_struct *work) +{ + update_topology = 1; + rebuild_sched_domains(); + pr_debug("sched_domain hierarchy rebuilt, flags updated\n"); + update_topology = 0; +} + static u32 capacity_scale; static u32 *raw_capacity; @@ -201,6 +226,7 @@ init_cpu_capacity_callback(struct notifier_block *nb, if (cpumask_empty(cpus_to_visit)) { topology_normalize_cpu_scale(); + schedule_work(&update_topology_flags_work); free_raw_capacity(); pr_debug("cpu_capacity: parsing done\n"); schedule_work(&parsing_done_work); diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 2b709416de05..d9bdc1a7f4e7 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -9,6 +9,7 @@ #include void topology_normalize_cpu_scale(void); +int topology_update_cpu_topology(void); struct device_node; bool topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu); From 3ba09df4b8b6e3f01ed6381e8fb890840fd0bca3 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 20 Jul 2018 14:32:33 +0100 Subject: [PATCH 03/28] sched/topology, arch/arm64: Rebuild the sched_domain hierarchy when the CPU capacity changes Asymmetric CPU capacity can not necessarily be determined accurately at the time the initial sched_domain hierarchy is built during boot. It is therefore necessary to be able to force a full rebuild of the hierarchy later triggered by the arch_topology driver. A full rebuild requires the arch-code to implement arch_update_cpu_topology() which isn't yet implemented for arm64. This patch points the arm64 implementation to arch_topology driver to ensure that full hierarchy rebuild happens when needed. 
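The arm64 change below is a single #define, so as context here is a condensed sketch of what the two hooks an architecture wires up mean to the generic scheduler code. The comments reflect my reading of kernel/sched/topology.c of this era and should be taken as a hedged summary, not as authoritative documentation.

#include <linux/arch_topology.h>

/* Per-CPU capacity as seen by the scheduler and by
 * asym_cpu_capacity_level() from patch 01. */
#define arch_scale_cpu_capacity	topology_get_cpu_scale

/* Returns non-zero while the arch_topology flag-update work runs;
 * partition_sched_domains() treats that as "the topology changed" and
 * rebuilds every sched_domain instead of reusing matching ones, which
 * is what lets SD_ASYM_CPUCAPACITY appear or disappear late. */
#define arch_update_cpu_topology	topology_update_cpu_topology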
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: dietmar.eggemann@arm.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1532093554-30504-4-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- arch/arm64/include/asm/topology.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 49a0fee4f89b..0524f2438649 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -45,6 +45,9 @@ int pcibus_to_node(struct pci_bus *bus); /* Replace task scheduler's default cpu-invariant accounting */ #define arch_scale_cpu_capacity topology_get_cpu_scale +/* Enable topology flag updates */ +#define arch_update_cpu_topology topology_update_cpu_topology + #include #endif /* _ASM_ARM_TOPOLOGY_H */ From e1799a80a4f5a463f252b7325da8bb66dfd55471 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 20 Jul 2018 14:32:34 +0100 Subject: [PATCH 04/28] sched/topology, arch/arm: Rebuild sched_domain hierarchy when CPU capacity changes Asymmetric CPU capacity can not necessarily be determined accurately at the time the initial sched_domain hierarchy is built during boot. It is therefore necessary to be able to force a full rebuild of the hierarchy later triggered by the arch_topology driver. A full rebuild requires the arch-code to implement arch_update_cpu_topology() which isn't yet implemented for arm. This patch points the arm implementation to arch_topology driver to ensure that full hierarchy rebuild happens when needed. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1532093554-30504-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- arch/arm/include/asm/topology.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 5d88d2f22b2c..2a786f54d8b8 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -33,6 +33,9 @@ const struct cpumask *cpu_coregroup_mask(int cpu); /* Replace task scheduler's default cpu-invariant accounting */ #define arch_scale_cpu_capacity topology_get_cpu_scale +/* Enable topology flag updates */ +#define arch_update_cpu_topology topology_update_cpu_topology + #else static inline void init_cpu_topology(void) { } From df054e8445a4011e3d693c2268129c0456108663 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:39 +0100 Subject: [PATCH 05/28] sched/topology: Add static_key for asymmetric CPU capacity optimizations The existing asymmetric CPU capacity code should cause minimal overhead for others. Putting it behind a static_key, it has been done for SMT optimizations, would make it easier to extend and improve without causing harm to others moving forward. 
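Since the commit message only gestures at the SMT precedent, a minimal sketch of the static_key pattern being introduced follows. Functions suffixed _sketch are made up for illustration; the real guard and enable sites are in the diff below.

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);

/* Fast path: with jump-label support this compiles to a patched
 * nop/jump, so symmetric systems pay essentially nothing. */
static int wake_cap_sketch(void)
{
	if (!static_branch_unlikely(&sched_asym_cpucapacity))
		return 0;	/* symmetric system: skip the extra work */

	/* ... asymmetry-aware heuristics only run past this point ... */
	return 1;
}

/* Slow path: flipped from build_sched_domains() when an
 * SD_ASYM_CPUCAPACITY level was detected; that path runs with the cpus
 * read lock held, hence the _cpuslocked variant. */
static void enable_asym_sketch(void)
{
	static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
}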
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-2-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 +++ kernel/sched/sched.h | 1 + kernel/sched/topology.c | 9 ++++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f808ddf2a868..3e5071aeb117 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6188,6 +6188,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) { long min_cap, max_cap; + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return 0; + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4a2e8cae63c4..0f36adc31ba5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1185,6 +1185,7 @@ DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_asym); +extern struct static_key_false sched_asym_cpucapacity; struct sched_group_capacity { atomic_t ref; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5c4d583d53ee..b0cdf5e95bda 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -398,6 +398,7 @@ DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain *, sd_numa); DEFINE_PER_CPU(struct sched_domain *, sd_asym); +DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); static void update_top_cache_domain(int cpu) { @@ -1705,6 +1706,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct rq *rq = NULL; int i, ret = -ENOMEM; struct sched_domain_topology_level *tl_asym; + bool has_asym = false; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) @@ -1720,8 +1722,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att for_each_sd_topology(tl) { int dflags = 0; - if (tl == tl_asym) + if (tl == tl_asym) { dflags |= SD_ASYM_CPUCAPACITY; + has_asym = true; + } sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); @@ -1773,6 +1777,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } rcu_read_unlock(); + if (has_asym) + static_branch_enable_cpuslocked(&sched_asym_cpucapacity); + if (rq && sched_debug_enabled) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); From 3b1baa6496e6b7ad016342a9d256bdfb072ce902 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:40 +0100 Subject: [PATCH 06/28] sched/fair: Add 'group_misfit_task' load-balance type To maximize throughput in systems with asymmetric CPU capacities (e.g. ARM big.LITTLE) load-balancing has to consider task and CPU utilization as well as per-CPU compute capacity when load-balancing in addition to the current average load based load-balancing policy. Tasks with high utilization that are scheduled on a lower capacity CPU need to be identified and migrated to a higher capacity CPU if possible to maximize throughput. 
To implement this additional policy an additional group_type (load-balance scenario) is added: 'group_misfit_task'. This represents scenarios where a sched_group has one or more tasks that are not suitable for its per-CPU capacity. 'group_misfit_task' is only considered if the system is not overloaded or imbalanced ('group_imbalanced' or 'group_overloaded'). Identifying misfit tasks requires the rq lock to be held. To avoid taking remote rq locks to examine source sched_groups for misfit tasks, each CPU is responsible for tracking misfit tasks themselves and update the rq->misfit_task flag. This means checking task utilization when tasks are scheduled and on sched_tick. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 54 +++++++++++++++++++++++++++++++++++++------- kernel/sched/sched.h | 2 ++ 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3e5071aeb117..6e04bea5b11a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -693,6 +693,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); +static unsigned long capacity_of(int cpu); /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) @@ -1446,7 +1447,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, static unsigned long weighted_cpuload(struct rq *rq); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long capacity_of(int cpu); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -3647,6 +3647,29 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) WRITE_ONCE(p->se.avg.util_est, ue); } +static inline int task_fits_capacity(struct task_struct *p, long capacity) +{ + return capacity * 1024 > task_util_est(p) * capacity_margin; +} + +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) +{ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return; + + if (!p) { + rq->misfit_task_load = 0; + return; + } + + if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { + rq->misfit_task_load = 0; + return; + } + + rq->misfit_task_load = task_h_load(p); +} + #else /* CONFIG_SMP */ #define UPDATE_TG 0x0 @@ -3676,6 +3699,7 @@ util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) {} +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -6201,7 +6225,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) /* Bring task utilization in sync with prev_cpu */ sync_entity_load_avg(&p->se); - return min_cap * 1024 < task_util(p) * capacity_margin; + return !task_fits_capacity(p, min_cap); } /* @@ -6618,9 +6642,12 @@ done: __maybe_unused; if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + update_misfit_status(p, rq); + return p; idle: + 
update_misfit_status(NULL, rq); new_tasks = idle_balance(rq, rf); /* @@ -6826,6 +6853,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -7399,12 +7433,6 @@ static unsigned long task_h_load(struct task_struct *p) /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -7420,6 +7448,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -7712,6 +7741,9 @@ group_type group_classify(struct sched_group *group, if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task_load) + return group_misfit_task; + return group_other; } @@ -7786,6 +7818,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, */ if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; + + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + sgs->group_misfit_task_load < rq->misfit_task_load) + sgs->group_misfit_task_load = rq->misfit_task_load; } /* Adjust by relative CPU capacity of the group */ @@ -9567,6 +9603,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + + update_misfit_status(curr, rq); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0f36adc31ba5..7dbf67d147a2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -842,6 +842,8 @@ struct rq { unsigned char idle_balance; + unsigned long misfit_task_load; + /* For active balancing */ int active_balance; int push_cpu; From e3d6d0cb66f2351cbfd09fbae04eb9804afe9577 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:41 +0100 Subject: [PATCH 07/28] sched/fair: Add sched_group per-CPU max capacity The current sg->min_capacity tracks the lowest per-CPU compute capacity available in the sched_group when rt/irq pressure is taken into account. Minimum capacity isn't the ideal metric for tracking if a sched_group needs offloading to another sched_group in some scenarios, e.g. a sched_group with multiple CPUs where only one is under heavy pressure. Tracking maximum capacity isn't perfect either but a better choice for some situations as it indicates that the sched_group is definitely compute capacity constrained either due to rt/irq pressure on all CPUs or asymmetric CPU capacities (e.g. big.LITTLE).
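To make the margin arithmetic behind min_capacity/max_capacity concrete, here is an illustrative restatement with example numbers; it is not part of the patch, and it assumes capacity_margin = 1280, the kernel/sched/fair.c default of this era.

#include <linux/types.h>

static const unsigned long capacity_margin = 1280;	/* fair.c default at the time */

/* Mirrors group_smaller_max_cpu_capacity() from the diff below: true iff
 * sg's best CPU sits below roughly 80% (1024/1280) of ref's best CPU. */
static bool group_smaller_max_cpu_capacity_sketch(unsigned long sg_max,
						  unsigned long ref_max)
{
	return sg_max * capacity_margin < ref_max * 1024;
}

/*
 * Example, big.LITTLE with LITTLE max_capacity ~430 and big max 1024:
 *   430 * 1280 = 550400  <  1024 * 1024 = 1048576   -> true,
 * so a LITTLE group counts as capacity-constrained relative to a big
 * group, which is what gates the misfit pulls in the next patch.
 */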
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-4-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 24 ++++++++++++++++++++---- kernel/sched/sched.h | 1 + kernel/sched/topology.c | 2 ++ 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e04bea5b11a..fe04315d57b3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7557,13 +7557,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; + sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, min_capacity; + unsigned long capacity, min_capacity, max_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -7577,6 +7578,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity = 0; min_capacity = ULONG_MAX; + max_capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -7607,6 +7609,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } min_capacity = min(capacity, min_capacity); + max_capacity = max(capacity, max_capacity); } } else { /* @@ -7620,12 +7623,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity += sgc->capacity; min_capacity = min(sgc->min_capacity, min_capacity); + max_capacity = max(sgc->max_capacity, max_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = min_capacity; + sdg->sgc->max_capacity = max_capacity; } /* @@ -7721,16 +7726,27 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) } /* - * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller * per-CPU capacity than sched_group ref. */ static inline bool -group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { return sg->sgc->min_capacity * capacity_margin < ref->sgc->min_capacity * 1024; } +/* + * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller + * per-CPU capacity_orig than sched_group ref. + */ +static inline bool +group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->max_capacity * capacity_margin < + ref->sgc->max_capacity * 1024; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -7876,7 +7892,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * power/energy consequences are not considered. 
*/ if (sgs->sum_nr_running <= sgs->group_weight && - group_smaller_cpu_capacity(sds->local, sg)) + group_smaller_min_cpu_capacity(sds->local, sg)) return false; asym_packing: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7dbf67d147a2..fe17e0be2d7b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1197,6 +1197,7 @@ struct sched_group_capacity { */ unsigned long capacity; unsigned long min_capacity; /* Min per-CPU capacity in group */ + unsigned long max_capacity; /* Max per-CPU capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b0cdf5e95bda..2536e1b938f9 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -693,6 +693,7 @@ static void init_overlap_sched_group(struct sched_domain *sd, sg_span = sched_group_span(sg); sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; } static int @@ -852,6 +853,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; return sg; } From cad68e552e7774b68ae6a2c5fedb792936098b72 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:42 +0100 Subject: [PATCH 08/28] sched/fair: Consider misfit tasks when load-balancing On asymmetric CPU capacity systems load intensive tasks can end up on CPUs that don't suit their compute demand. In these scenarios 'misfit' tasks should be migrated to CPUs with higher compute capacity to ensure better throughput. group_misfit_task indicates this scenario, but tweaks to the load-balance code are needed to make the migrations happen. Misfit balancing only makes sense between a source group of lower per-CPU capacity and a destination group of higher compute capacity. Otherwise, misfit balancing is ignored. group_misfit_task has the lowest priority so any imbalance due to overload is dealt with first. The modifications are: 1. Only pick a group containing misfit tasks as the busiest group if the destination group has higher capacity and has spare capacity. 2. When the busiest group is a 'misfit' group, skip the usual average load and group capacity checks. 3. Set the imbalance for 'misfit' balancing sufficiently high for a task to be pulled ignoring average load. 4. Pick the CPU with the highest misfit load as the source CPU. 5. If the misfit task is alone on the source CPU, go for active balancing.
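As a worked illustration of what 'misfit' means numerically in the modifications above (the helper is a stand-in, and capacity_margin = 1280 is assumed, the fair.c default of this era):

#define CAPACITY_MARGIN	1280

/* Stand-in for task_fits_capacity() from the earlier misfit patch:
 * a task fits iff its estimated utilization stays below ~80%
 * (1024/1280) of the CPU's capacity. */
static int task_fits_capacity_sketch(unsigned long task_util,
				     unsigned long cpu_capacity)
{
	return cpu_capacity * 1024 > task_util * CAPACITY_MARGIN;
}

/*
 * Example: a task with util_est ~800 on a LITTLE CPU of capacity 430:
 *   430 * 1024 = 440320  <  800 * 1280 = 1024000   -> does not fit.
 * update_misfit_status() then records task_h_load(p) in
 * rq->misfit_task_load, group_classify() turns that into
 * group_misfit_task, and modification 3 above raises env->imbalance to
 * at least that load, so the task is pulled even though plain average
 * load would report the system as balanced.
 */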
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe04315d57b3..24fe39e57bc3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6890,6 +6890,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + enum group_type src_grp_type; struct list_head tasks; }; @@ -7873,6 +7874,17 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + /* + * Don't try to pull misfit tasks we can't help. + * We can use max_capacity here as reduction in capacity on some + * CPUs in the group should either be possible to resolve + * internally or be covered by avg_load imbalance (eventually). + */ + if (sgs->group_type == group_misfit_task && + (!group_smaller_max_cpu_capacity(sg, sds->local) || + !group_has_capacity(env, &sds->local_stat))) + return false; + if (sgs->group_type > busiest->group_type) return true; @@ -7895,6 +7907,13 @@ static bool update_sd_pick_busiest(struct lb_env *env, group_smaller_min_cpu_capacity(sds->local, sg)) return false; + /* + * If we have more than one misfit sg go with the biggest misfit. + */ + if (sgs->group_type == group_misfit_task && + sgs->group_misfit_task_load < busiest->group_misfit_task_load) + return false; + asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) @@ -8192,8 +8211,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * factors in sg capacity and sgs with smaller group_type are * skipped when updating the busiest sg: */ - if (busiest->avg_load <= sds->avg_load || - local->avg_load >= sds->avg_load) { + if (busiest->group_type != group_misfit_task && + (busiest->avg_load <= sds->avg_load || + local->avg_load >= sds->avg_load)) { env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -8227,6 +8247,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost imbalance to allow misfit task to be balanced. */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task_load); + } + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -8293,6 +8319,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; + /* Misfit tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) + goto force_balance; + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -8330,6 +8360,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ + env->src_grp_type = busiest->group_type; calculate_imbalance(env, &sds); return env->imbalance ? 
sds.busiest : NULL; @@ -8377,6 +8408,19 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; + /* + * For ASYM_CPUCAPACITY domains with misfit tasks we simply + * seek the "biggest" misfit task. + */ + if (env->src_grp_type == group_misfit_task) { + if (rq->misfit_task_load > busiest_load) { + busiest_load = rq->misfit_task_load; + busiest = rq; + } + + continue; + } + capacity = capacity_of(i); wl = weighted_cpuload(rq); @@ -8446,6 +8490,9 @@ static int need_active_balance(struct lb_env *env) return 1; } + if (env->src_grp_type == group_misfit_task) + return 1; + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } From 5fbdfae5221a5208ed8e7653fc1c4b31de420f74 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:43 +0100 Subject: [PATCH 09/28] sched/fair: Kick nohz balance if rq->misfit_task_load There already are a few conditions in nohz_kick_needed() to ensure a nohz kick is triggered, but they are not enough for some misfit task scenarios. Excluding asym packing, those are: - rq->nr_running >=2: Not relevant here because we are running a misfit task, it needs to be migrated regardless and potentially through active balance. - sds->nr_busy_cpus > 1: If there is only the misfit task being run on a group of low capacity CPUs, this will be evaluated to False. - rq->cfs.h_nr_running >=1 && check_cpu_capacity(): Not relevant here, misfit task needs to be migrated regardless of rt/IRQ pressure As such, this commit adds an rq->misfit_task_load condition to trigger a nohz kick. The idea to kick a nohz balance for misfit tasks originally came from Leo Yan , and a similar patch was submitted for the Android Common Kernel - see: https://lists.linaro.org/pipermail/eas-dev/2016-September/000551.html Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-6-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 24fe39e57bc3..e08287d3806f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9135,7 +9135,7 @@ static void nohz_balancer_kick(struct rq *rq) if (time_before(now, nohz.next_balance)) goto out; - if (rq->nr_running >= 2) { + if (rq->nr_running >= 2 || rq->misfit_task_load) { flags = NOHZ_KICK_MASK; goto out; } From dbbad719449e06d73db21598d6eee178f7a54b3b Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:44 +0100 Subject: [PATCH 10/28] sched/fair: Change 'prefer_sibling' type to bool This variable is entirely local to update_sd_lb_stats, so we can safely change its type and slightly clean up its initialisation. 
Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-7-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e08287d3806f..23017939ecab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7982,11 +7982,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - int load_idx, prefer_sibling = 0; + int load_idx; bool overload = false; - - if (child && child->flags & SD_PREFER_SIBLING) - prefer_sibling = 1; + bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; #ifdef CONFIG_NO_HZ_COMMON if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) From 575638d1047eb057a5cdf95cc0b3c084e1279508 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:45 +0100 Subject: [PATCH 11/28] sched/core: Change root_domain->overload type to int sizeof(_Bool) is implementation defined, so let's just go with 'int' as is done for other structures e.g. sched_domain_shared->has_idle_cores. The local 'overload' variable used in update_sd_lb_stats can remain bool, as it won't impact any struct layout and can be assigned to the root_domain field. Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-8-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fe17e0be2d7b..4d181478c5b8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -716,7 +716,7 @@ struct root_domain { cpumask_var_t online; /* Indicate more than one runnable task for any CPU */ - bool overload; + int overload; /* * The bit corresponding to a CPU gets set here if such CPU has more @@ -1698,7 +1698,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (prev_nr < 2 && rq->nr_running >= 2) { #ifdef CONFIG_SMP if (!rq->rd->overload) - rq->rd->overload = true; + rq->rd->overload = 1; #endif } From e90c8fe15a3bf93a23088bcf1a56a0fa391d4e50 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:46 +0100 Subject: [PATCH 12/28] sched/fair: Wrap rq->rd->overload accesses with READ/WRITE_ONCE() This variable can be read and set locklessly within update_sd_lb_stats(). As such, READ/WRITE_ONCE() are added to make sure nothing terribly wrong can happen because of the compiler. 
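A minimal illustration of what the annotations buy for a flag that is read and written without a lock; the struct is a stand-in, not the kernel's root_domain.

#include <linux/compiler.h>

struct rd_sketch {
	int overload;
};

static void mark_overloaded(struct rd_sketch *rd)
{
	/* WRITE_ONCE() forces a single store and keeps the compiler from
	 * tearing it or inventing extra stores around it. */
	if (!READ_ONCE(rd->overload))
		WRITE_ONCE(rd->overload, 1);
}

static int overloaded(struct rd_sketch *rd)
{
	/* READ_ONCE() forces exactly one load here; a plain read could be
	 * fused with, or repeated relative to, other plain reads. */
	return READ_ONCE(rd->overload);
}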
Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-9-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 +++--- kernel/sched/sched.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 23017939ecab..d9c4e97bfebd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8058,8 +8058,8 @@ next_group: if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - if (env->dst_rq->rd->overload != overload) - env->dst_rq->rd->overload = overload; + if (READ_ONCE(env->dst_rq->rd->overload) != overload) + WRITE_ONCE(env->dst_rq->rd->overload, overload); } } @@ -9502,7 +9502,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) rq_unpin_lock(this_rq, rf); if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + !READ_ONCE(this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4d181478c5b8..938063639793 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1697,8 +1697,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (prev_nr < 2 && rq->nr_running >= 2) { #ifdef CONFIG_SMP - if (!rq->rd->overload) - rq->rd->overload = 1; + if (!READ_ONCE(rq->rd->overload)) + WRITE_ONCE(rq->rd->overload, 1); #endif } From 757ffdd705ee942fc8150b17942d968601d2a15b Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:47 +0100 Subject: [PATCH 13/28] sched/fair: Set rq->rd->overload when misfit Idle balance is a great opportunity to pull a misfit task. However, there are scenarios where misfit tasks are present but idle balance is prevented by the overload flag. A good example of this is a workload of n identical tasks. Let's suppose we have a 2+2 Arm big.LITTLE system. We then spawn 4 fairly CPU-intensive tasks - for the sake of simplicity let's say they are just CPU hogs, even when running on big CPUs. They are identical tasks, so on an SMP system they should all end at (roughly) the same time. However, in our case the LITTLE CPUs are less performing than the big CPUs, so tasks running on the LITTLEs will have a longer completion time. This means that the big CPUs will complete their work earlier, at which point they should pull the tasks from the LITTLEs. What we want to happen is summarized as follows: a,b,c,d are our CPU-hogging tasks _ signifies idling LITTLE_0 | a a a a _ _ LITTLE_1 | b b b b _ _ ---------|------------- big_0 | c c c c a a big_1 | d d d d b b ^ ^ Tasks end on the big CPUs, idle balance happens and the misfit tasks are pulled straight away This however won't happen, because currently the overload flag is only set when there is any CPU that has more than one runnable task - which may very well not be the case here if our CPU-hogging workload is all there is to run. As such, this commit sets the overload flag in update_sg_lb_stats when a group is flagged as having a misfit task. 
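To connect the pieces, a condensed stand-in sketch of the producer and consumer of the flag follows; the real hunks are in this patch's diff and in the previous patches, and the types here are illustrative only.

struct root_domain_stub { int overload; };
struct rq_stub { unsigned long misfit_task_load; struct root_domain_stub *rd; };

/* Producer (this patch): a lone misfit task on a LITTLE CPU is now
 * enough to mark the root domain as having pullable load. */
static void stats_side_stub(struct rq_stub *little_rq)
{
	if (little_rq->misfit_task_load)
		little_rq->rd->overload = 1;	/* propagated with WRITE_ONCE() in the kernel */
}

/* Consumer (existing code): a newly idle big CPU checks this flag
 * before doing idle balance, so without the producer hook the pull in
 * the diagram above would never be attempted. */
static int worth_idle_balancing_stub(struct rq_stub *big_rq)
{
	return big_rq->rd->overload;		/* READ_ONCE() in the kernel */
}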
Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-10-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++++-- kernel/sched/sched.h | 6 +++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d9c4e97bfebd..8b228c5b3eb4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7793,7 +7793,7 @@ static bool update_nohz_stats(struct rq *rq, bool force) * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. - * @overload: Indicate more than one runnable task for any CPU. + * @overload: Indicate pullable load (e.g. >1 runnable task). */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, @@ -7837,8 +7837,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; if (env->sd->flags & SD_ASYM_CPUCAPACITY && - sgs->group_misfit_task_load < rq->misfit_task_load) + sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; + *overload = 1; + } } /* Adjust by relative CPU capacity of the group */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 938063639793..85b3a2bf6c2b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -715,7 +715,11 @@ struct root_domain { cpumask_var_t span; cpumask_var_t online; - /* Indicate more than one runnable task for any CPU */ + /* + * Indicate pullable load on at least one CPU, e.g: + * - More than one runnable task + * - Running task is misfit + */ int overload; /* From 4ad3831a9d4af5e36da5d44a3b9c6522d0353cee Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Wed, 4 Jul 2018 11:17:48 +0100 Subject: [PATCH 14/28] sched/fair: Don't move tasks to lower capacity CPUs unless necessary When lower capacity CPUs are load balancing and considering to pull something from a higher capacity group, we should not pull tasks from a CPU with only one task running as this is guaranteed to impede progress for that task. If there is more than one task running, load balance in the higher capacity group would have already made any possible moves to resolve imbalance and we should make better use of system compute capacity by moving a task if we still have more than one running. Signed-off-by: Chris Redpath Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-11-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8b228c5b3eb4..06ff75f4ac7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8423,6 +8423,17 @@ static struct rq *find_busiest_queue(struct lb_env *env, capacity = capacity_of(i); + /* + * For ASYM_CPUCAPACITY domains, don't pick a CPU that could + * eventually lead to active_balancing high->low capacity. 
+ * Higher per-CPU capacity is considered better than balancing + * average load. + */ + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + capacity_of(env->dst_cpu) < capacity && + rq->nr_running == 1) + continue; + wl = weighted_cpuload(rq); /* From 9c63e84db29bcf584040931ad97c2edd11e35f6c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:50 +0100 Subject: [PATCH 15/28] sched/core: Disable SD_PREFER_SIBLING on asymmetric CPU capacity domains The 'prefer sibling' sched_domain flag is intended to encourage spreading tasks to sibling sched_domain to take advantage of more caches and core for SMT systems. It has recently been changed to be on all non-NUMA topology level. However, spreading across domains with CPU capacity asymmetry isn't desirable, e.g. spreading from high capacity to low capacity CPUs even if high capacity CPUs aren't overutilized might give access to more cache but the CPU will be slower and possibly lead to worse overall throughput. To prevent this, we need to remove SD_PREFER_SIBLING on the sched_domain level immediately below SD_ASYM_CPUCAPACITY. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-13-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 2536e1b938f9..7ffad0d3a4eb 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1126,7 +1126,7 @@ sd_init(struct sched_domain_topology_level *tl, | 0*SD_SHARE_CPUCAPACITY | 0*SD_SHARE_PKG_RESOURCES | 0*SD_SERIALIZE - | 0*SD_PREFER_SIBLING + | 1*SD_PREFER_SIBLING | 0*SD_NUMA | sd_flags , @@ -1152,17 +1152,21 @@ sd_init(struct sched_domain_topology_level *tl, if (sd->flags & SD_ASYM_CPUCAPACITY) { struct sched_domain *t = sd; + /* + * Don't attempt to spread across CPUs of different capacities. + */ + if (sd->child) + sd->child->flags &= ~SD_PREFER_SIBLING; + for_each_lower_domain(t) t->flags |= SD_BALANCE_WAKE; } if (sd->flags & SD_SHARE_CPUCAPACITY) { - sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { - sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 117; sd->cache_nice_tries = 1; sd->busy_idx = 2; @@ -1173,6 +1177,7 @@ sd_init(struct sched_domain_topology_level *tl, sd->busy_idx = 3; sd->idle_idx = 2; + sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { sd->flags &= ~(SD_BALANCE_EXEC | @@ -1182,7 +1187,6 @@ sd_init(struct sched_domain_topology_level *tl, #endif } else { - sd->flags |= SD_PREFER_SIBLING; sd->cache_nice_tries = 1; sd->busy_idx = 2; sd->idle_idx = 1; From 7e6f4c5d600c1c8e2a1d900e65cab319d9b6782e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Aug 2018 11:45:21 +0200 Subject: [PATCH 16/28] sched/debug: Explicitly cast sched_feat() to bool LLVM has a warning that tags expressions like: if (foo && non-bool-const) This pattern triggers for CONFIG_SCHED_DEBUG=n where sched_feat() ends up being whatever bit we select. Avoid the warning with an explicit cast to bool. 
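A self-contained reproduction of the warning class and the shape of the fix; the feature mask and macro names are made up, and the warning is, to the best of my knowledge, clang's -Wconstant-logical-operand.

#define FEATURES	0x12UL	/* stands in for the constant sysctl_sched_features */

/* Old shape: expands to a non-boolean constant, so an expression like
 * "cond && feat_old(4)" makes clang complain about using a constant
 * operand with logical '&&'. */
#define feat_old(bit)	(FEATURES & (1UL << (bit)))

/* New shape: '!!' collapses the mask to 0/1, silencing the warning
 * without changing the truth value. */
#define feat_new(bit)	!!(FEATURES & (1UL << (bit)))

static int check(int cond)
{
	return cond && feat_new(4);
}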
Reported-by: Philipp Klocke Tested-by: Nick Desaulniers Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 85b3a2bf6c2b..3a4ef8f73f08 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1401,7 +1401,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = 0; #undef SCHED_FEAT -#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ From d90707ebebe03596e19de3abbf79b766e72a3465 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 29 Aug 2018 15:19:09 +0200 Subject: [PATCH 17/28] sched/numa: Remove unused code from update_numa_stats() With: commit 2d4056fafa19 ("sched/numa: Remove numa_has_capacity()") the local variables 'smt', 'cpus' and 'capacity' and their results are not used anymore in numa_has_capacity() Remove this unused code. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Srikar Dronamraju Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1535548752-4434-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 06ff75f4ac7b..b65596fae06b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1463,8 +1463,7 @@ struct numa_stats { */ static void update_numa_stats(struct numa_stats *ns, int nid) { - int smt, cpu, cpus = 0; - unsigned long capacity; + int cpu; memset(ns, 0, sizeof(*ns)); for_each_cpu(cpu, cpumask_of_node(nid)) { @@ -1473,26 +1472,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(rq); ns->compute_capacity += capacity_of(cpu); - - cpus++; } - /* - * If we raced with hotplug and there are no CPUs left in our mask - * the @ns structure is NULL'ed and task_numa_compare() will - * not find this node attractive. - * - * We'll detect a huge imbalance and bail there. - */ - if (!cpus) - return; - - /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ - smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); - capacity = cpus / smt; /* cores */ - - capacity = min_t(unsigned, capacity, - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); } struct task_numa_env { From 7477a3504e619768c9e972dafe2907e6b8ed9823 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 29 Aug 2018 15:19:10 +0200 Subject: [PATCH 18/28] sched/numa: Remove unused numa_stats::nr_running field nr_running in struct numa_stats is not used anywhere in the code. Remove it. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Srikar Dronamraju Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1535548752-4434-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b65596fae06b..6bd142d19549 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1454,8 +1454,6 @@ struct numa_stats { /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; - - unsigned int nr_running; }; /* @@ -1469,7 +1467,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid) for_each_cpu(cpu, cpumask_of_node(nid)) { struct rq *rq = cpu_rq(cpu); - ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(rq); ns->compute_capacity += capacity_of(cpu); } From ff28915fd31ccafc0d38e6f84b66df280ed9e86a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 5 Sep 2018 11:36:36 +0200 Subject: [PATCH 19/28] sched/debug: Use symbolic names for task state constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit include/trace/events/sched.h includes (via ) and so knows about the TASK_* constants used to interpret .prev_state. So instead of duplicating the magic numbers make use of the defined macros to ease understanding the mapping from state bits to letters which isn't completely intuitive for an outsider. Signed-off-by: Uwe Kleine-König Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: kernel@pengutronix.de Link: http://lkml.kernel.org/r/20180905093636.24068-1-u.kleine-koenig@pengutronix.de Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 0be866c91f62..f07b270d4fc4 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -159,9 +159,14 @@ TRACE_EVENT(sched_switch, (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", - { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" }, - { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }, - { 0x40, "P" }, { 0x80, "I" }) : + { TASK_INTERRUPTIBLE, "S" }, + { TASK_UNINTERRUPTIBLE, "D" }, + { __TASK_STOPPED, "T" }, + { __TASK_TRACED, "t" }, + { EXIT_DEAD, "X" }, + { EXIT_ZOMBIE, "Z" }, + { TASK_PARKED, "P" }, + { TASK_DEAD, "I" }) : "R", __entry->prev_state & TASK_REPORT_MAX ? "+" : "", From ace8031099f91480799b5929b4cccf2dcacc5136 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 3 Aug 2018 20:37:32 +0800 Subject: [PATCH 20/28] sched/topology: Make local variables static Fix the following warnings: kernel/sched/topology.c:10:15: warning: symbol 'sched_domains_tmpmask' was not declared. Should it be static? kernel/sched/topology.c:11:15: warning: symbol 'sched_domains_tmpmask2' was not declared. Should it be static? 
Signed-off-by: zhong jiang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1533299852-26941-1-git-send-email-zhongjiang@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 7ffad0d3a4eb..9d74371e4aad 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -7,8 +7,8 @@ DEFINE_MUTEX(sched_domains_mutex); /* Protected by sched_domains_mutex: */ -cpumask_var_t sched_domains_tmpmask; -cpumask_var_t sched_domains_tmpmask2; +static cpumask_var_t sched_domains_tmpmask; +static cpumask_var_t sched_domains_tmpmask2; #ifdef CONFIG_SCHED_DEBUG From 11d4afd4ff667f9b6178ee8c142c36cb78bd84db Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 25 Sep 2018 11:17:42 +0200 Subject: [PATCH 21/28] sched/pelt: Fix warning and clean up IRQ PELT config Create a config for enabling irq load tracking in the scheduler. irq load tracking is useful only when irq or paravirtual time is accounted but it's only possible with SMP for now. Also use __maybe_unused to remove the compilation warning in update_rq_clock_task() that has been introduced by: 2e62c4743adc ("sched/fair: Remove #ifdefs from scale_rt_capacity()") Suggested-by: Ingo Molnar Reported-by: Dou Liyang Reported-by: Miguel Ojeda Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@alien8.de Cc: dou_liyang@163.com Fixes: 2e62c4743adc ("sched/fair: Remove #ifdefs from scale_rt_capacity()") Link: http://lkml.kernel.org/r/1537867062-27285-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- init/Kconfig | 5 +++++ kernel/sched/core.c | 7 +++---- kernel/sched/fair.c | 2 +- kernel/sched/pelt.c | 2 +- kernel/sched/pelt.h | 2 +- kernel/sched/sched.h | 5 ++--- 6 files changed, 13 insertions(+), 10 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 1e234e2f1cba..317d5ccb5191 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -415,6 +415,11 @@ config IRQ_TIME_ACCOUNTING If in doubt, say N here. +config HAVE_SCHED_AVG_IRQ + def_bool y + depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING + depends on SMP + config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ad97f3ba5ec5..f2caf1bae4a3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -135,9 +135,8 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) * In theory, the compile should just see 0 here, and optimize out the call * to sched_rt_avg_update. But I don't trust it... 
*/ -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - s64 steal = 0, irq_delta = 0; -#endif + s64 __maybe_unused steal = 0, irq_delta = 0; + #ifdef CONFIG_IRQ_TIME_ACCOUNTING irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; @@ -177,7 +176,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->clock_task += delta; -#ifdef HAVE_SCHED_AVG_IRQ +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) update_irq_load_avg(rq, irq_delta + steal); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1d92ed2eca8b..d59307ecd67d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7317,7 +7317,7 @@ static inline bool others_have_blocked(struct rq *rq) if (READ_ONCE(rq->avg_dl.util_avg)) return true; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ if (READ_ONCE(rq->avg_irq.util_avg)) return true; #endif diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 35475c0c5419..48a126486435 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -358,7 +358,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* * irq: * diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index d2894db28955..7e56b489ff32 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -6,7 +6,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ int update_irq_load_avg(struct rq *rq, u64 running); #else static inline int diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 632804fa0b12..798b1afd5092 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -862,8 +862,7 @@ struct rq { struct sched_avg avg_rt; struct sched_avg avg_dl; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) -#define HAVE_SCHED_AVG_IRQ +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif u64 idle_stamp; @@ -2223,7 +2222,7 @@ static inline unsigned long cpu_util_rt(struct rq *rq) } #endif -#ifdef HAVE_SCHED_AVG_IRQ +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) { return rq->avg_irq.util_avg; From fdf5f315d5cfaefb7bb8a62ec4bf37b9891837aa Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 9 Aug 2018 14:57:53 +0100 Subject: [PATCH 22/28] sched/fair: Disable LB_BIAS by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LB_BIAS allows the adjustment on how conservative load should be balanced. The rq->cpu_load[idx] array is used for this functionality. It contains weighted CPU load decayed average values over different intervals (idx = 1..4). Idx = 0 is the weighted CPU load itself. The values are updated during scheduler_tick, before idle balance and at nohz exit. There are 5 different types of idx's per sched domain (sd). Each of them is used to index into the rq->cpu_load[idx] array in a specific scenario (busy, idle and newidle for load balancing, forkexec for wake-up slow-path load balancing and wake for affine wakeup based on weight). 
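[ For reference, the LB_BIAS-controlled biasing described below lives in
  source_load()/target_load(); a simplified sketch of those helpers as they
  look around this kernel version (kernel/sched/fair.c, details may vary
  between releases):

      static unsigned long source_load(int cpu, int type)
      {
              struct rq *rq = cpu_rq(cpu);
              unsigned long total = weighted_cpuload(rq);

              /* idx 0 or LB_BIAS off: use the instantaneous load only */
              if (type == 0 || !sched_feat(LB_BIAS))
                      return total;

              /* underestimate the source CPU: min of current and history */
              return min(rq->cpu_load[type-1], total);
      }

      static unsigned long target_load(int cpu, int type)
      {
              struct rq *rq = cpu_rq(cpu);
              unsigned long total = weighted_cpuload(rq);

              if (type == 0 || !sched_feat(LB_BIAS))
                      return total;

              /* overestimate the target CPU: max of current and history */
              return max(rq->cpu_load[type-1], total);
      }

  With LB_BIAS false, both helpers reduce to weighted_cpuload(), which is what
  would eventually allow the rq->cpu_load[] history to be removed entirely. ]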
Only the sd idx's for busy and idle load balancing are set to 2,3 or 1,2 respectively. All the other sd idx's are set to 0. Conservative load balancing is achieved for sd idx's >= 1 by using the min/max (source_load()/target_load()) value between the current weighted CPU load and the rq->cpu_load[sd idx -1] for the busiest(idlest)/local CPU load in load balancing or vice versa in the wake-up slow-path load balancing. There is no conservative balancing for sd idx = 0 since only current weighted CPU load is used in this case. It is very likely that LB_BIAS' influence on load balancing can be neglected (see test results below). This is further supported by: (1) Weighted CPU load today is by itself a decayed average value (PELT) (cfs_rq->avg->runnable_load_avg) and not the instantaneous load (rq->load.weight) it was when LB_BIAS was introduced. (2) Sd imbalance_pct is used for CPU_NEWLY_IDLE and CPU_NOT_IDLE (relate to sd's newidle and busy idx) in find_busiest_group() when comparing busiest and local avg load to make load balancing even more conservative. (3) The sd forkexec and newidle idx are always set to 0 so there is no adjustment on how conservatively load balancing is done here. (4) Affine wakeup based on weight (wake_affine_weight()) will not be impacted since the sd wake idx is always set to 0. Let's disable LB_BIAS by default for a few kernel releases to make sure that no workload and no scheduler topology is affected. The benefit of being able to remove the LB_BIAS dependency from source_load() and target_load() is that the entire rq->cpu_load[idx] code could be removed in this case. It is really hard to say if there is no regression w/o testing this with a lot of different workloads on a lot of different platforms, especially NUMA machines. The following 104 LKP (Linux Kernel Performance) tests were run by the 0-Day guys mostly on multi-socket hosts with a larger number of logical cpus (88, 192). The base for the test was commit b3dae109fa89 ("sched/swait: Rename to exclusive") (tip/sched/core v4.18-rc1). Only 2 out of the 104 tests had a significant change in one of the metrics (fsmark/1x-1t-1HDD-btrfs-nfsv4-4M-60G-NoSync-performance +7% files_per_sec, unixbench/300s-100%-syscall-performance -11% score). Tests which showed a change in one of the metrics are marked with a '*' and this change is listed as well. 
(a) lkp-bdw-ep3: 88 threads Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz 64G dd-write/10m-1HDD-cfq-btrfs-100dd-performance fsmark/1x-1t-1HDD-xfs-nfsv4-4M-60G-NoSync-performance * fsmark/1x-1t-1HDD-btrfs-nfsv4-4M-60G-NoSync-performance 7.50 7% 8.00 ± 6% fsmark.files_per_sec fsmark/1x-1t-1HDD-btrfs-nfsv4-4M-60G-fsyncBeforeClose-performance fsmark/1x-1t-1HDD-btrfs-4M-60G-NoSync-performance fsmark/1x-1t-1HDD-btrfs-4M-60G-fsyncBeforeClose-performance kbuild/300s-50%-vmlinux_prereq-performance kbuild/300s-200%-vmlinux_prereq-performance kbuild/300s-50%-vmlinux_prereq-performance-1HDD-ext4 kbuild/300s-200%-vmlinux_prereq-performance-1HDD-ext4 (b) lkp-skl-4sp1: 192 threads Intel(R) Xeon(R) Platinum 8160 768G dbench/100%-performance ebizzy/200%-100x-10s-performance hackbench/1600%-process-pipe-performance iperf/300s-cs-localhost-tcp-performance iperf/300s-cs-localhost-udp-performance perf-bench-numa-mem/2t-300M-performance perf-bench-sched-pipe/10000000ops-process-performance perf-bench-sched-pipe/10000000ops-threads-performance schbench/2-16-300-30000-30000-performance tbench/100%-cs-localhost-performance (c) lkp-bdw-ep6: 88 threads Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz 128G stress-ng/100%-60s-pipe-performance unixbench/300s-1-whetstone-double-performance unixbench/300s-1-shell1-performance unixbench/300s-1-shell8-performance unixbench/300s-1-pipe-performance * unixbench/300s-1-context1-performance 312 315 unixbench.score unixbench/300s-1-spawn-performance unixbench/300s-1-syscall-performance unixbench/300s-1-dhry2reg-performance unixbench/300s-1-fstime-performance unixbench/300s-1-fsbuffer-performance unixbench/300s-1-fsdisk-performance unixbench/300s-100%-whetstone-double-performance unixbench/300s-100%-shell1-performance unixbench/300s-100%-shell8-performance unixbench/300s-100%-pipe-performance unixbench/300s-100%-context1-performance unixbench/300s-100%-spawn-performance * unixbench/300s-100%-syscall-performance 3571 ± 3% -11% 3183 ± 4% unixbench.score unixbench/300s-100%-dhry2reg-performance unixbench/300s-100%-fstime-performance unixbench/300s-100%-fsbuffer-performance unixbench/300s-100%-fsdisk-performance unixbench/300s-1-execl-performance unixbench/300s-100%-execl-performance * will-it-scale/brk1-performance 365004 360387 will-it-scale.per_thread_ops * will-it-scale/dup1-performance 432401 437596 will-it-scale.per_thread_ops will-it-scale/eventfd1-performance will-it-scale/futex1-performance will-it-scale/futex2-performance will-it-scale/futex3-performance will-it-scale/futex4-performance will-it-scale/getppid1-performance will-it-scale/lock1-performance will-it-scale/lseek1-performance will-it-scale/lseek2-performance * will-it-scale/malloc1-performance 47025 45817 will-it-scale.per_thread_ops 77499 76529 will-it-scale.per_process_ops will-it-scale/malloc2-performance * will-it-scale/mmap1-performance 123399 120815 will-it-scale.per_thread_ops 152219 149833 will-it-scale.per_process_ops * will-it-scale/mmap2-performance 107327 104714 will-it-scale.per_thread_ops 136405 133765 will-it-scale.per_process_ops will-it-scale/open1-performance * will-it-scale/open2-performance 171570 168805 will-it-scale.per_thread_ops 532644 526202 will-it-scale.per_process_ops will-it-scale/page_fault1-performance will-it-scale/page_fault2-performance will-it-scale/page_fault3-performance will-it-scale/pipe1-performance will-it-scale/poll1-performance * will-it-scale/poll2-performance 176134 172848 will-it-scale.per_thread_ops 281361 275053 will-it-scale.per_process_ops 
will-it-scale/posix_semaphore1-performance will-it-scale/pread1-performance will-it-scale/pread2-performance will-it-scale/pread3-performance will-it-scale/pthread_mutex1-performance will-it-scale/pthread_mutex2-performance will-it-scale/pwrite1-performance will-it-scale/pwrite2-performance will-it-scale/pwrite3-performance * will-it-scale/read1-performance 1190563 1174833 will-it-scale.per_thread_ops * will-it-scale/read2-performance 1105369 1080427 will-it-scale.per_thread_ops will-it-scale/readseek1-performance * will-it-scale/readseek2-performance 261818 259040 will-it-scale.per_thread_ops will-it-scale/readseek3-performance * will-it-scale/sched_yield-performance 2408059 2382034 will-it-scale.per_thread_ops will-it-scale/signal1-performance will-it-scale/unix1-performance will-it-scale/unlink1-performance will-it-scale/unlink2-performance * will-it-scale/write1-performance 976701 961588 will-it-scale.per_thread_ops * will-it-scale/writeseek1-performance 831898 822448 will-it-scale.per_thread_ops * will-it-scale/writeseek2-performance 228248 225065 will-it-scale.per_thread_ops * will-it-scale/writeseek3-performance 226670 224058 will-it-scale.per_thread_ops will-it-scale/context_switch1-performance aim7/performance-fork_test-2000 * aim7/performance-brk_test-3000 74869 76676 aim7.jobs-per-min aim7/performance-disk_cp-3000 aim7/performance-disk_rd-3000 aim7/performance-sieve-3000 aim7/performance-page_test-3000 aim7/performance-creat-clo-3000 aim7/performance-mem_rtns_1-8000 aim7/performance-disk_wrt-8000 aim7/performance-pipe_cpy-8000 aim7/performance-ram_copy-8000 (d) lkp-avoton3: 8 threads Intel(R) Atom(TM) CPU C2750 @ 2.40GHz 16G netperf/ipv4-900s-200%-cs-localhost-TCP_STREAM-performance Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Fengguang Wu Cc: Li Zhijian Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180809135753.21077-1-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 85ae8488039c..858589b83377 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -39,7 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) -SCHED_FEAT(LB_BIAS, true) +SCHED_FEAT(LB_BIAS, false) /* * Decrement CPU capacity based on time not spent running tasks From 4a465e3ebbc8004ce4f7f08f6022ee8315a94edf Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 3 Aug 2018 15:05:38 +0100 Subject: [PATCH 23/28] sched/fair: Remove setting task's se->runnable_weight during PELT update A CFS (SCHED_OTHER, SCHED_BATCH or SCHED_IDLE policy) task's se->runnable_weight must always be in sync with its se->load.weight. se->runnable_weight is set to se->load.weight when the task is forked (init_entity_runnable_average()) or reniced (reweight_entity()). There are two cases in set_load_weight() which since they currently only set se->load.weight could lead to a situation in which se->load.weight is different to se->runnable_weight for a CFS task: (1) A task switches to SCHED_IDLE. (2) A SCHED_FIFO, SCHED_RR or SCHED_DEADLINE task which has been reniced (during which only its static priority gets set) switches to SCHED_OTHER or SCHED_BATCH. Set se->runnable_weight to se->load.weight in these two cases to prevent this. This eliminates the need to explicitly set it to se->load.weight during PELT updates in the CFS scheduler fastpath. 
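[ For context: the PELT fast path below feeds both fields into
  ___update_load_avg() via two helpers that must therefore stay in sync;
  roughly (kernel/sched/sched.h around this kernel version):

      static inline long se_weight(struct sched_entity *se)
      {
              return scale_load_down(se->load.weight);
      }

      static inline long se_runnable(struct sched_entity *se)
      {
              return scale_load_down(se->runnable_weight);
      }

  If se->runnable_weight drifts from se->load.weight for a plain CFS task, its
  runnable_load_avg silently diverges from its load_avg - hence keeping the two
  in sync in set_load_weight() instead of patching it up on every PELT update. ]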
Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Joel Fernandes Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20180803140538.1178-1-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ kernel/sched/pelt.c | 6 ------ 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f2caf1bae4a3..56b3c1781276 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -700,6 +700,7 @@ static void set_load_weight(struct task_struct *p, bool update_load) if (idle_policy(p->policy)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; + p->se.runnable_weight = load->weight; return; } @@ -712,6 +713,7 @@ static void set_load_weight(struct task_struct *p, bool update_load) } else { load->weight = scale_load(sched_prio_to_weight[prio]); load->inv_weight = sched_prio_to_wmult[prio]; + p->se.runnable_weight = load->weight; } } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 48a126486435..90fb5bc12ad4 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -269,9 +269,6 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) { - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; @@ -282,9 +279,6 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, cfs_rq->curr == se)) { From 9c2298aad355d8c1957df3015448fef333526934 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 4 Oct 2018 11:05:14 +0200 Subject: [PATCH 24/28] sched/core: Fix comment regarding nr_iowait_cpu() and get_iowait_load() The comment related to nr_iowait_cpu() and get_iowait_load() confuses cpufreq with cpuidle and is not very useful for this reason, so fix it. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Linux PM Cc: Tejun Heo Cc: Thomas Gleixner Fixes: e33a9bba85a8 "sched/core: move IO scheduling accounting from io_schedule_timeout() into scheduler" Link: http://lkml.kernel.org/r/3803514.xkx7zY50tF@aspire.rjw.lan Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 56b3c1781276..fe0223121883 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2916,10 +2916,10 @@ unsigned long nr_iowait(void) } /* - * Consumers of these two interfaces, like for example the cpufreq menu - * governor are using nonsensical data. Boosting frequency for a CPU that has - * IO-wait which might not even end up running the task when it does become - * runnable. + * Consumers of these two interfaces, like for example the cpuidle menu + * governor, are using nonsensical data. Preferring shallow idle state selection + * for a CPU that has IO-wait which might not even end up running the task when + * it does become runnable. 
*/ unsigned long nr_iowait_cpu(int cpu) From d0e7d14455d41163126afecd0fcce935463cc512 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Oct 2018 19:22:27 +0200 Subject: [PATCH 25/28] cpu/SMT: State SMT is disabled even with nosmt and without "=force" When booting with "nosmt=force" a message is issued into dmesg to confirm that SMT has been force-disabled but such a message is not issued when only "nosmt" is on the kernel command line. Fix that. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20181004172227.10094-1-bp@alien8.de Signed-off-by: Ingo Molnar --- kernel/cpu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index 0097acec1c71..f1338452d998 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -362,6 +362,7 @@ void __init cpu_smt_disable(bool force) pr_info("SMT: Force disabled\n"); cpu_smt_control = CPU_SMT_FORCE_DISABLED; } else { + pr_info("SMT: disabled\n"); cpu_smt_control = CPU_SMT_DISABLED; } } From 7b6abce7e1e69b6d8dc5d40a8cb9ddaeb400427c Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 10 Oct 2018 22:56:32 +0800 Subject: [PATCH 26/28] sched/completions/Documentation: Fix a couple of punctuation nits This patch fixes a couple of punctuation nits which can make the document more correct and readable. Also missing "()" are added to some function references for consistency. Signed-off-by: John Garry Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: corbet@lwn.net Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/1539183392-239389-1-git-send-email-john.garry@huawei.com Signed-off-by: Ingo Molnar --- Documentation/scheduler/completion.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/scheduler/completion.txt b/Documentation/scheduler/completion.txt index 656cf803c006..108bd0f264b3 100644 --- a/Documentation/scheduler/completion.txt +++ b/Documentation/scheduler/completion.txt @@ -116,7 +116,7 @@ A typical usage scenario is: This is not implying any temporal order on wait_for_completion() and the call to complete() - if the call to complete() happened before the call to wait_for_completion() then the waiting side simply will continue -immediately as all dependencies are satisfied if not it will block until +immediately as all dependencies are satisfied; if not, it will block until completion is signaled by complete(). Note that wait_for_completion() is calling spin_lock_irq()/spin_unlock_irq(), @@ -131,7 +131,7 @@ wait_for_completion(): The default behavior is to wait without a timeout and to mark the task as uninterruptible. wait_for_completion() and its variants are only safe in process context (as they can sleep) but not in atomic context, -interrupt context, with disabled irqs. or preemption is disabled - see also +interrupt context, with disabled irqs, or preemption is disabled - see also try_wait_for_completion() below for handling completion in atomic/interrupt context. @@ -224,7 +224,7 @@ queue spinlock. Any such concurrent calls to complete() or complete_all() probably are a design bug. Signaling completion from hard-irq context is fine as it will appropriately -lock with spin_lock_irqsave/spin_unlock_irqrestore and it will never sleep. +lock with spin_lock_irqsave()/spin_unlock_irqrestore() and it will never sleep. 
try_wait_for_completion()/completion_done(): From 0c373344b5c1eaa9e186368a32a169a2802be3ca Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Oct 2018 10:36:23 +0200 Subject: [PATCH 27/28] sched/completions/Documentation: Clean up the document some more Refresh the document: - Remove unnecessary liguistic complexity and improve the clarity of the text - Improve the explanations all around - Remove unnecessary and stale version info - Fix whitespace noise - Make pseudo-code match kernel style - Fix minor syntax errors in pseudo-code - Use consistent denotation - Mark multi-CPU sequences more explicitly - Unbreak line breaks - Use quotes to refer to 'struct completion' - Use 'IRQ context' and 'IRQs' consistently - Improve grammar - etc. Cc: John Garry Cc: Linus Torvalds Cc: Nicholas Mc Guire Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: corbet@lwn.net Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/1539183392-239389-1-git-send-email-john.garry@huawei.com Signed-off-by: Ingo Molnar --- Documentation/scheduler/completion.txt | 229 +++++++++++++------------ 1 file changed, 123 insertions(+), 106 deletions(-) diff --git a/Documentation/scheduler/completion.txt b/Documentation/scheduler/completion.txt index 108bd0f264b3..91a11a668354 100644 --- a/Documentation/scheduler/completion.txt +++ b/Documentation/scheduler/completion.txt @@ -1,146 +1,161 @@ -completions - wait for completion handling -========================================== - -This document was originally written based on 3.18.0 (linux-next) +Completions - "wait for completion" barrier APIs +================================================ Introduction: ------------- -If you have one or more threads of execution that must wait for some process +If you have one or more threads that must wait for some kernel activity to have reached a point or a specific state, completions can provide a race-free solution to this problem. Semantically they are somewhat like a -pthread_barrier and have similar use-cases. +pthread_barrier() and have similar use-cases. Completions are a code synchronization mechanism which is preferable to any -misuse of locks. Any time you think of using yield() or some quirky -msleep(1) loop to allow something else to proceed, you probably want to -look into using one of the wait_for_completion*() calls instead. The -advantage of using completions is clear intent of the code, but also more -efficient code as both threads can continue until the result is actually -needed. +misuse of locks/semaphores and busy-loops. Any time you think of using +yield() or some quirky msleep(1) loop to allow something else to proceed, +you probably want to look into using one of the wait_for_completion*() +calls and complete() instead. -Completions are built on top of the generic event infrastructure in Linux, -with the event reduced to a simple flag (appropriately called "done") in -struct completion that tells the waiting threads of execution if they -can continue safely. +The advantage of using completions is that they have a well defined, focused +purpose which makes it very easy to see the intent of the code, but they +also result in more efficient code as all threads can continue execution +until the result is actually needed, and both the waiting and the signalling +is highly efficient using low level scheduler sleep/wakeup facilities. -As completions are scheduling related, the code is found in +Completions are built on top of the waitqueue and wakeup infrastructure of +the Linux scheduler. 
The event the threads on the waitqueue are waiting for +is reduced to a simple flag in 'struct completion', appropriately called "done". + +As completions are scheduling related, the code can be found in kernel/sched/completion.c. Usage: ------ -There are three parts to using completions, the initialization of the -struct completion, the waiting part through a call to one of the variants of -wait_for_completion() and the signaling side through a call to complete() -or complete_all(). Further there are some helper functions for checking the -state of completions. +There are three main parts to using completions: -To use completions one needs to include and -create a variable of type struct completion. The structure used for -handling of completions is: + - the initialization of the 'struct completion' synchronization object + - the waiting part through a call to one of the variants of wait_for_completion(), + - the signaling side through a call to complete() or complete_all(). + +There are also some helper functions for checking the state of completions. +Note that while initialization must happen first, the waiting and signaling +part can happen in any order. I.e. it's entirely normal for a thread +to have marked a completion as 'done' before another thread checks whether +it has to wait for it. + +To use completions you need to #include and +create a static or dynamic variable of type 'struct completion', +which has only two fields: struct completion { unsigned int done; wait_queue_head_t wait; }; -providing the wait queue to place tasks on for waiting and the flag for -indicating the state of affairs. +This provides the ->wait waitqueue to place tasks on for waiting (if any), and +the ->done completion flag for indicating whether it's completed or not. -Completions should be named to convey the intent of the waiter. A good -example is: +Completions should be named to refer to the event that is being synchronized on. +A good example is: wait_for_completion(&early_console_added); complete(&early_console_added); -Good naming (as always) helps code readability. +Good, intuitive naming (as always) helps code readability. Naming a completion +'complete' is not helpful unless the purpose is super obvious... Initializing completions: ------------------------- -Initialization of dynamically allocated completions, often embedded in -other structures, is done with: +Initialization of dynamically allocated completion objects, often embedded in +other structures, is done via a call to init_completion(): - void init_completion(&done); + init_completion(&dynamic_object->done); -Initialization is accomplished by initializing the wait queue and setting -the default state to "not available", that is, "done" is set to 0. +In this call we initialize the waitqueue and set ->done to 0, i.e. "not completed" +or "not done". The re-initialization function, reinit_completion(), simply resets the -done element to "not available", thus again to 0, without touching the -wait queue. Calling init_completion() twice on the same completion object is +->done field to 0 ("not done"), without touching the waitqueue. +Callers of this function must make sure that there are no racy +wait_for_completion() calls going on in parallel. + +Calling init_completion() on the same completion object twice is most likely a bug as it re-initializes the queue to an empty queue and -enqueued tasks could get "lost" - use reinit_completion() in that case. 
+enqueued tasks could get "lost" - use reinit_completion() in that case, +but be aware of other races. -For static declaration and initialization, macros are available. These are: +For static declaration and initialization, macros are available. - static DECLARE_COMPLETION(setup_done) +For static (or global) declarations in file scope you can use DECLARE_COMPLETION(): -used for static declarations in file scope. Within functions the static -initialization should always use: + static DECLARE_COMPLETION(setup_done); + DECLARE_COMPLETION(setup_done); + +Note that in this case the completion is boot time (or module load time) +initialized to 'not done' and doesn't require an init_completion() call. + +When a completion is declared as a local variable within a function, +then the initialization should always use: DECLARE_COMPLETION_ONSTACK(setup_done) -suitable for automatic/local variables on the stack and will make lockdep -happy. Note also that one needs to make *sure* the completion passed to -work threads remains in-scope, and no references remain to on-stack data -when the initiating function returns. - -Using on-stack completions for code that calls any of the _timeout or -_interruptible/_killable variants is not advisable as they will require -additional synchronization to prevent the on-stack completion object in -the timeout/signal cases from going out of scope. Consider using dynamically -allocated completions when intending to use the _interruptible/_killable -or _timeout variants of wait_for_completion(). +A simple DECLARE_COMPLETION() on the stack makes lockdep unhappy. +Note that when using completion objects as local variables you must be +aware of the short life time of the function stack: the function must +not return to a calling context until all activities (such as waiting +threads) have ceased and the completion is ... completely unused. Waiting for completions: ------------------------ -For a thread of execution to wait for some concurrent work to finish, it -calls wait_for_completion() on the initialized completion structure. +For a thread to wait for some concurrent activity to finish, it +calls wait_for_completion() on the initialized completion structure: + + void wait_for_completion(struct completion *done) + A typical usage scenario is: + CPU#1 CPU#2 + struct completion setup_done; + init_completion(&setup_done); - initialize_work(...,&setup_done,...) + initialize_work(...,&setup_done,...); - /* run non-dependent code */ /* do setup */ + /* run non-dependent code */ /* do setup */ - wait_for_completion(&setup_done); complete(setup_done) + wait_for_completion(&setup_done); complete(setup_done); -This is not implying any temporal order on wait_for_completion() and the -call to complete() - if the call to complete() happened before the call +This is not implying any particular order between wait_for_completion() and +the call to complete() - if the call to complete() happened before the call to wait_for_completion() then the waiting side simply will continue immediately as all dependencies are satisfied; if not, it will block until completion is signaled by complete(). Note that wait_for_completion() is calling spin_lock_irq()/spin_unlock_irq(), so it can only be called safely when you know that interrupts are enabled. -Calling it from hard-irq or irqs-off atomic contexts will result in -hard-to-detect spurious enabling of interrupts. 
- -wait_for_completion(): - - void wait_for_completion(struct completion *done): +Calling it from IRQs-off atomic contexts will result in hard-to-detect +spurious enabling of interrupts. The default behavior is to wait without a timeout and to mark the task as uninterruptible. wait_for_completion() and its variants are only safe in process context (as they can sleep) but not in atomic context, -interrupt context, with disabled irqs, or preemption is disabled - see also +interrupt context, with disabled IRQs, or preemption is disabled - see also try_wait_for_completion() below for handling completion in atomic/interrupt context. As all variants of wait_for_completion() can (obviously) block for a long -time, you probably don't want to call this with held mutexes. +time depending on the nature of the activity they are waiting for, so in +most cases you probably don't want to call this with held mutexes. -Variants available: -------------------- +wait_for_completion*() variants available: +------------------------------------------ The below variants all return status and this status should be checked in most(/all) cases - in cases where the status is deliberately not checked you @@ -148,51 +163,53 @@ probably want to make a note explaining this (e.g. see arch/arm/kernel/smp.c:__cpu_up()). A common problem that occurs is to have unclean assignment of return types, -so care should be taken with assigning return-values to variables of proper -type. Checking for the specific meaning of return values also has been found -to be quite inaccurate e.g. constructs like -if (!wait_for_completion_interruptible_timeout(...)) would execute the same -code path for successful completion and for the interrupted case - which is -probably not what you want. +so take care to assign return-values to variables of the proper type. + +Checking for the specific meaning of return values also has been found +to be quite inaccurate, e.g. constructs like: + + if (!wait_for_completion_interruptible_timeout(...)) + +... would execute the same code path for successful completion and for the +interrupted case - which is probably not what you want. int wait_for_completion_interruptible(struct completion *done) -This function marks the task TASK_INTERRUPTIBLE. If a signal was received -while waiting it will return -ERESTARTSYS; 0 otherwise. +This function marks the task TASK_INTERRUPTIBLE while it is waiting. +If a signal was received while waiting it will return -ERESTARTSYS; 0 otherwise. - unsigned long wait_for_completion_timeout(struct completion *done, - unsigned long timeout) + unsigned long wait_for_completion_timeout(struct completion *done, unsigned long timeout) The task is marked as TASK_UNINTERRUPTIBLE and will wait at most 'timeout' -(in jiffies). If timeout occurs it returns 0 else the remaining time in -jiffies (but at least 1). Timeouts are preferably calculated with -msecs_to_jiffies() or usecs_to_jiffies(). If the returned timeout value is -deliberately ignored a comment should probably explain why (e.g. see -drivers/mfd/wm8350-core.c wm8350_read_auxadc()) +jiffies. If a timeout occurs it returns 0, else the remaining time in +jiffies (but at least 1). - long wait_for_completion_interruptible_timeout( - struct completion *done, unsigned long timeout) +Timeouts are preferably calculated with msecs_to_jiffies() or usecs_to_jiffies(), +to make the code largely HZ-invariant. + +If the returned timeout value is deliberately ignored a comment should probably explain +why (e.g. 
see drivers/mfd/wm8350-core.c wm8350_read_auxadc()). + + long wait_for_completion_interruptible_timeout(struct completion *done, unsigned long timeout) This function passes a timeout in jiffies and marks the task as TASK_INTERRUPTIBLE. If a signal was received it will return -ERESTARTSYS; -otherwise it returns 0 if the completion timed out or the remaining time in +otherwise it returns 0 if the completion timed out, or the remaining time in jiffies if completion occurred. Further variants include _killable which uses TASK_KILLABLE as the -designated tasks state and will return -ERESTARTSYS if it is interrupted or -else 0 if completion was achieved. There is a _timeout variant as well: +designated tasks state and will return -ERESTARTSYS if it is interrupted, +or 0 if completion was achieved. There is a _timeout variant as well: long wait_for_completion_killable(struct completion *done) - long wait_for_completion_killable_timeout(struct completion *done, - unsigned long timeout) + long wait_for_completion_killable_timeout(struct completion *done, unsigned long timeout) The _io variants wait_for_completion_io() behave the same as the non-_io -variants, except for accounting waiting time as waiting on IO, which has -an impact on how the task is accounted in scheduling stats. +variants, except for accounting waiting time as 'waiting on IO', which has +an impact on how the task is accounted in scheduling/IO stats: void wait_for_completion_io(struct completion *done) - unsigned long wait_for_completion_io_timeout(struct completion *done - unsigned long timeout) + unsigned long wait_for_completion_io_timeout(struct completion *done, unsigned long timeout) Signaling completions: @@ -200,30 +217,30 @@ Signaling completions: A thread that wants to signal that the conditions for continuation have been achieved calls complete() to signal exactly one of the waiters that it can -continue. +continue: void complete(struct completion *done) -or calls complete_all() to signal all current and future waiters. +... or calls complete_all() to signal all current and future waiters: void complete_all(struct completion *done) The signaling will work as expected even if completions are signaled before a thread starts waiting. This is achieved by the waiter "consuming" -(decrementing) the done element of struct completion. Waiting threads +(decrementing) the done field of 'struct completion'. Waiting threads wakeup order is the same in which they were enqueued (FIFO order). If complete() is called multiple times then this will allow for that number of waiters to continue - each call to complete() will simply increment the -done element. Calling complete_all() multiple times is a bug though. Both -complete() and complete_all() can be called in hard-irq/atomic context safely. +done field. Calling complete_all() multiple times is a bug though. Both +complete() and complete_all() can be called in IRQ/atomic context safely. -There only can be one thread calling complete() or complete_all() on a -particular struct completion at any time - serialized through the wait +There can only be one thread calling complete() or complete_all() on a +particular 'struct completion' at any time - serialized through the wait queue spinlock. Any such concurrent calls to complete() or complete_all() probably are a design bug. 
-Signaling completion from hard-irq context is fine as it will appropriately +Signaling completion from IRQ context is fine as it will appropriately lock with spin_lock_irqsave()/spin_unlock_irqrestore() and it will never sleep. @@ -236,7 +253,7 @@ else it consumes one posted completion and returns true. bool try_wait_for_completion(struct completion *done) -Finally, to check the state of a completion without changing it in any way, +Finally, to check the state of a completion without changing it in any way, call completion_done(), which returns false if there are no posted completions that were not yet consumed by waiters (implying that there are waiters) and true otherwise; @@ -244,4 +261,4 @@ waiters) and true otherwise; bool completion_done(struct completion *done) Both try_wait_for_completion() and completion_done() are safe to be called in -hard-irq or atomic context. +IRQ or atomic context. From 11e13696a08e838ba48c72404c2b3f41429b5b20 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Tue, 16 Oct 2018 15:45:39 +0200 Subject: [PATCH 28/28] sched/completions/Documentation: Add recommendation for dynamic and ONSTACK completions To prevent dynamic completion objects from being de-allocated while still in use, add a recommendation to embed them in long lived data structures. Also add a note for the on-stack case that emphasizes the dangers of the limited scope, and recommends dynamic allocation if scope limitations are not clearly understood. [ mingo: Minor touch-ups of the text, expanded it a bit to make the warnings Nicholas added more prominent. ] Signed-off-by: Nicholas Mc Guire Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: john.garry@huawei.com Link: http://lkml.kernel.org/r/1539697539-24055-1-git-send-email-hofrat@osadl.org Signed-off-by: Ingo Molnar --- Documentation/scheduler/completion.txt | 42 +++++++++++++++++++++----- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/Documentation/scheduler/completion.txt b/Documentation/scheduler/completion.txt index 91a11a668354..2dbff579f957 100644 --- a/Documentation/scheduler/completion.txt +++ b/Documentation/scheduler/completion.txt @@ -70,8 +70,18 @@ Good, intuitive naming (as always) helps code readability. Naming a completion Initializing completions: ------------------------- -Initialization of dynamically allocated completion objects, often embedded in -other structures, is done via a call to init_completion(): +Dynamically allocated completion objects should preferably be embedded in data +structures that are assured to be alive for the life-time of the function/driver, +to prevent races with asynchronous complete() calls from occurring. + +Particular care should be taken when using the _timeout() or _killable()/_interruptible() +variants of wait_for_completion(), as it must be assured that memory de-allocation +does not happen until all related activities (complete() or reinit_completion()) +have taken place, even if these wait functions return prematurely due to a timeout +or a signal triggering. + +Initializing of dynamically allocated completion objects is done via a call to +init_completion(): init_completion(&dynamic_object->done); @@ -99,16 +109,32 @@ Note that in this case the completion is boot time (or module load time) initialized to 'not done' and doesn't require an init_completion() call. 
When a completion is declared as a local variable within a function, -then the initialization should always use: +then the initialization should always use DECLARE_COMPLETION_ONSTACK() +explicitly, not just to make lockdep happy, but also to make it clear +that limited scope had been considered and is intentional: DECLARE_COMPLETION_ONSTACK(setup_done) -A simple DECLARE_COMPLETION() on the stack makes lockdep unhappy. - Note that when using completion objects as local variables you must be -aware of the short life time of the function stack: the function must -not return to a calling context until all activities (such as waiting -threads) have ceased and the completion is ... completely unused. +acutely aware of the short life time of the function stack: the function +must not return to a calling context until all activities (such as waiting +threads) have ceased and the completion object is completely unused. + +To emphasise this again: in particular when using some of the waiting API variants +with more complex outcomes, such as the timeout or signalling (_timeout(), +_killable() and _interruptible()) variants, the wait might complete +prematurely while the object might still be in use by another thread - and a return +from the wait_on_completion*() caller function will deallocate the function +stack and cause subtle data corruption if a complete() is done in some +other thread. Simple testing might not trigger these kinds of races. + +If unsure, use dynamically allocated completion objects, preferably embedded +in some other long lived object that has a boringly long life time which +exceeds the life time of any helper threads using the completion object, +or has a lock or other synchronization mechanism to make sure complete() +is not called on a freed object. + +A naive DECLARE_COMPLETION() on the stack triggers a lockdep warning. Waiting for completions: ------------------------
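[ As a practical complement to the completion documentation updates in the
  last three patches, here is a minimal, hypothetical usage sketch (the foo_*
  names and the 100 ms timeout are illustrative only, not taken from the
  document) showing the recommended pattern of embedding the completion in a
  long-lived object and signalling it from IRQ context:

      #include <linux/completion.h>
      #include <linux/interrupt.h>
      #include <linux/jiffies.h>
      #include <linux/errno.h>

      /* Completion embedded in a long-lived per-device object, so it cannot
       * go out of scope while the IRQ handler might still complete() it. */
      struct foo_device {
              struct completion cmd_done;
              /* ... */
      };

      static void foo_device_init(struct foo_device *dev)
      {
              init_completion(&dev->cmd_done);
      }

      /* IRQ context: complete() never sleeps, so signalling here is fine. */
      static irqreturn_t foo_irq(int irq, void *data)
      {
              struct foo_device *dev = data;

              complete(&dev->cmd_done);
              return IRQ_HANDLED;
      }

      /* Process context: issue a command and wait for the IRQ, bounded by a
       * timeout computed with msecs_to_jiffies() as the document suggests. */
      static int foo_do_cmd(struct foo_device *dev)
      {
              reinit_completion(&dev->cmd_done);
              /* ... kick off the hardware command here ... */

              if (!wait_for_completion_timeout(&dev->cmd_done,
                                               msecs_to_jiffies(100)))
                      return -ETIMEDOUT;

              return 0;
      }

  The waiting side checks the return value of wait_for_completion_timeout()
  explicitly, as recommended above, rather than ignoring it. ]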