From 05484e0984487d42e97c417cbb0697fa9d16e7e9 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 20 Jul 2018 14:32:31 +0100 Subject: [PATCH 01/28] sched/topology: Add SD_ASYM_CPUCAPACITY flag detection The SD_ASYM_CPUCAPACITY sched_domain flag is supposed to mark the sched_domain in the hierarchy where all CPU capacities are visible from any CPU's point of view on asymmetric CPU capacity systems. The scheduler can then take capacity asymmetry into account when balancing at this level. It also serves as an indicator for how wide task placement heuristics have to search to consider all available CPU capacities, as asymmetric systems might often appear symmetric at the smallest level(s) of the sched_domain hierarchy. The flag has been around for a while but has so far only been set by out-of-tree code in Android kernels. One solution is to let each architecture provide the flag through a custom sched_domain topology array and associated mask and flag functions. However, SD_ASYM_CPUCAPACITY is special in the sense that it depends on the capacity and presence of all CPUs in the system, i.e. when hotplugging all CPUs out except those with one particular CPU capacity the flag should disappear even if the sched_domains don't collapse. Similarly, the flag is affected by cpusets where load-balancing is turned off. Detecting when the flag should be set therefore depends not only on topology information but also on the cpuset configuration and hotplug state. The arch code doesn't have easy access to the cpuset configuration. Instead, this patch implements the flag detection in generic code where cpusets and hotplug state are already taken care of. All the arch is responsible for is implementing arch_scale_cpu_capacity() and forcing a full rebuild of the sched_domain hierarchy if capacities are updated, e.g. later in the boot process when cpufreq has initialized. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1532093554-30504-2-git-send-email-morten.rasmussen@arm.com [ Fixed 'CPU' capitalization.
] Signed-off-by: Ingo Molnar --- include/linux/sched/topology.h | 6 +-- kernel/sched/topology.c | 81 +++++++++++++++++++++++++++++++--- 2 files changed, 78 insertions(+), 9 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 26347741ba50..6b9976180c1e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -23,10 +23,10 @@ #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */ -#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */ +#define SD_ASYM_CPUCAPACITY 0x0040 /* Domain members have different CPU capacities */ +#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share CPU capacity */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ -#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ +#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share CPU pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 505a41c42b96..5c4d583d53ee 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1061,7 +1061,6 @@ static struct cpumask ***sched_domains_numa_masks; * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain - * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: @@ -1073,13 +1072,12 @@ static struct cpumask ***sched_domains_numa_masks; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ - SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, - struct sched_domain *child, int cpu) + struct sched_domain *child, int dflags, int cpu) { struct sd_data *sdd = &tl->data; struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); @@ -1100,6 +1098,9 @@ sd_init(struct sched_domain_topology_level *tl, "wrong sd_flags in topology description\n")) sd_flags &= ~TOPOLOGY_SD_FLAGS; + /* Apply detected topology flags */ + sd_flags |= dflags; + *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, @@ -1604,9 +1605,9 @@ static void __sdt_free(const struct cpumask *cpu_map) static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *child, int cpu) + struct sched_domain *child, int dflags, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); + struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu); if (child) { sd->level = child->level + 1; @@ -1632,6 +1633,65 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve return sd; } +/* + * Find the sched_domain_topology_level where all CPU capacities are visible + * for all CPUs. 
+ */ +static struct sched_domain_topology_level +*asym_cpu_capacity_level(const struct cpumask *cpu_map) +{ + int i, j, asym_level = 0; + bool asym = false; + struct sched_domain_topology_level *tl, *asym_tl = NULL; + unsigned long cap; + + /* Is there any asymmetry? */ + cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); + + for_each_cpu(i, cpu_map) { + if (arch_scale_cpu_capacity(NULL, i) != cap) { + asym = true; + break; + } + } + + if (!asym) + return NULL; + + /* + * Examine topology from all CPU's point of views to detect the lowest + * sched_domain_topology_level where a highest capacity CPU is visible + * to everyone. + */ + for_each_cpu(i, cpu_map) { + unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); + int tl_id = 0; + + for_each_sd_topology(tl) { + if (tl_id < asym_level) + goto next_level; + + for_each_cpu_and(j, tl->mask(i), cpu_map) { + unsigned long capacity; + + capacity = arch_scale_cpu_capacity(NULL, j); + + if (capacity <= max_capacity) + continue; + + max_capacity = capacity; + asym_level = tl_id; + asym_tl = tl; + } +next_level: + tl_id++; + } + } + + return asym_tl; +} + + /* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs @@ -1644,18 +1704,27 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct s_data d; struct rq *rq = NULL; int i, ret = -ENOMEM; + struct sched_domain_topology_level *tl_asym; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; + tl_asym = asym_cpu_capacity_level(cpu_map); + /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; sd = NULL; for_each_sd_topology(tl) { - sd = build_sched_domain(tl, cpu_map, attr, sd, i); + int dflags = 0; + + if (tl == tl_asym) + dflags |= SD_ASYM_CPUCAPACITY; + + sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); + if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP) From bb1fbdd3c3fd12b612c7d8cdf13bd6bfeebdefa3 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 20 Jul 2018 14:32:32 +0100 Subject: [PATCH 02/28] sched/topology, drivers/base/arch_topology: Rebuild the sched_domain hierarchy when capacities change The setting of SD_ASYM_CPUCAPACITY depends on the per-CPU capacities. These might not have their final values when the hierarchy is initially built as the values depend on cpufreq to be initialized or the values being set through sysfs. To ensure that the flags are set correctly we need to rebuild the sched_domain hierarchy whenever the reported per-CPU capacity (arch_scale_cpu_capacity()) changes. This patch ensure that a full sched_domain rebuild happens when CPU capacity changes occur. 
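For readers skimming the series, the mechanism this patch adds to drivers/base/arch_topology.c condenses to the sketch below. Identifiers match the diff that follows; cpu_capacity_changed() is a made-up stand-in for the sysfs/cpufreq paths that notice a capacity update, and error handling is omitted.

#include <linux/workqueue.h>
#include <linux/cpuset.h>

static int update_topology;

/* Exposed to the arch via its arch_update_cpu_topology() hook, so a
 * rebuild triggered from here is treated as a real topology change. */
int topology_update_cpu_topology(void)
{
	return update_topology;
}

/* Locking prevents calling rebuild_sched_domains() straight from the
 * cpufreq/sysfs paths that observe the capacity change, so the rebuild
 * is deferred to a work item. */
static void update_topology_flags_workfn(struct work_struct *work)
{
	update_topology = 1;
	rebuild_sched_domains();
	update_topology = 0;
}
static DECLARE_WORK(update_topology_flags_work, update_topology_flags_workfn);

/* Hypothetical caller: anything that notices a capacity change only
 * needs to queue the work. */
static void cpu_capacity_changed(void)
{
	schedule_work(&update_topology_flags_work);
}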
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1532093554-30504-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- drivers/base/arch_topology.c | 26 ++++++++++++++++++++++++++ include/linux/arch_topology.h | 1 + 2 files changed, 27 insertions(+) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index e7cb0c6ade81..edfcf8d982e4 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -15,6 +15,7 @@ #include #include #include +#include DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; @@ -47,6 +48,9 @@ static ssize_t cpu_capacity_show(struct device *dev, return sprintf(buf, "%lu\n", topology_get_cpu_scale(NULL, cpu->dev.id)); } +static void update_topology_flags_workfn(struct work_struct *work); +static DECLARE_WORK(update_topology_flags_work, update_topology_flags_workfn); + static ssize_t cpu_capacity_store(struct device *dev, struct device_attribute *attr, const char *buf, @@ -72,6 +76,8 @@ static ssize_t cpu_capacity_store(struct device *dev, topology_set_cpu_scale(i, new_capacity); mutex_unlock(&cpu_scale_mutex); + schedule_work(&update_topology_flags_work); + return count; } @@ -96,6 +102,25 @@ static int register_cpu_capacity_sysctl(void) } subsys_initcall(register_cpu_capacity_sysctl); +static int update_topology; + +int topology_update_cpu_topology(void) +{ + return update_topology; +} + +/* + * Updating the sched_domains can't be done directly from cpufreq callbacks + * due to locking, so queue the work for later. + */ +static void update_topology_flags_workfn(struct work_struct *work) +{ + update_topology = 1; + rebuild_sched_domains(); + pr_debug("sched_domain hierarchy rebuilt, flags updated\n"); + update_topology = 0; +} + static u32 capacity_scale; static u32 *raw_capacity; @@ -201,6 +226,7 @@ init_cpu_capacity_callback(struct notifier_block *nb, if (cpumask_empty(cpus_to_visit)) { topology_normalize_cpu_scale(); + schedule_work(&update_topology_flags_work); free_raw_capacity(); pr_debug("cpu_capacity: parsing done\n"); schedule_work(&parsing_done_work); diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 2b709416de05..d9bdc1a7f4e7 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -9,6 +9,7 @@ #include void topology_normalize_cpu_scale(void); +int topology_update_cpu_topology(void); struct device_node; bool topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu); From 3ba09df4b8b6e3f01ed6381e8fb890840fd0bca3 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 20 Jul 2018 14:32:33 +0100 Subject: [PATCH 03/28] sched/topology, arch/arm64: Rebuild the sched_domain hierarchy when the CPU capacity changes Asymmetric CPU capacity can not necessarily be determined accurately at the time the initial sched_domain hierarchy is built during boot. It is therefore necessary to be able to force a full rebuild of the hierarchy later triggered by the arch_topology driver. A full rebuild requires the arch-code to implement arch_update_cpu_topology() which isn't yet implemented for arm64. This patch points the arm64 implementation to arch_topology driver to ensure that full hierarchy rebuild happens when needed. 
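The arm64 change below is a single #define, so as context here is a condensed sketch of what the two hooks an architecture wires up mean to the generic scheduler code. The comments reflect my reading of kernel/sched/topology.c of this era and should be taken as a hedged summary, not as authoritative documentation.

#include <linux/arch_topology.h>

/* Per-CPU capacity as seen by the scheduler and by
 * asym_cpu_capacity_level() from patch 01. */
#define arch_scale_cpu_capacity	topology_get_cpu_scale

/* Returns non-zero while the arch_topology flag-update work runs;
 * partition_sched_domains() treats that as "the topology changed" and
 * rebuilds every sched_domain instead of reusing matching ones, which
 * is what lets SD_ASYM_CPUCAPACITY appear or disappear late. */
#define arch_update_cpu_topology	topology_update_cpu_topology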
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: dietmar.eggemann@arm.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1532093554-30504-4-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- arch/arm64/include/asm/topology.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 49a0fee4f89b..0524f2438649 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -45,6 +45,9 @@ int pcibus_to_node(struct pci_bus *bus); /* Replace task scheduler's default cpu-invariant accounting */ #define arch_scale_cpu_capacity topology_get_cpu_scale +/* Enable topology flag updates */ +#define arch_update_cpu_topology topology_update_cpu_topology + #include #endif /* _ASM_ARM_TOPOLOGY_H */ From e1799a80a4f5a463f252b7325da8bb66dfd55471 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 20 Jul 2018 14:32:34 +0100 Subject: [PATCH 04/28] sched/topology, arch/arm: Rebuild sched_domain hierarchy when CPU capacity changes Asymmetric CPU capacity can not necessarily be determined accurately at the time the initial sched_domain hierarchy is built during boot. It is therefore necessary to be able to force a full rebuild of the hierarchy later triggered by the arch_topology driver. A full rebuild requires the arch-code to implement arch_update_cpu_topology() which isn't yet implemented for arm. This patch points the arm implementation to arch_topology driver to ensure that full hierarchy rebuild happens when needed. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1532093554-30504-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- arch/arm/include/asm/topology.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 5d88d2f22b2c..2a786f54d8b8 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -33,6 +33,9 @@ const struct cpumask *cpu_coregroup_mask(int cpu); /* Replace task scheduler's default cpu-invariant accounting */ #define arch_scale_cpu_capacity topology_get_cpu_scale +/* Enable topology flag updates */ +#define arch_update_cpu_topology topology_update_cpu_topology + #else static inline void init_cpu_topology(void) { } From df054e8445a4011e3d693c2268129c0456108663 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:39 +0100 Subject: [PATCH 05/28] sched/topology: Add static_key for asymmetric CPU capacity optimizations The existing asymmetric CPU capacity code should cause minimal overhead for others. Putting it behind a static_key, it has been done for SMT optimizations, would make it easier to extend and improve without causing harm to others moving forward. 
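Since the commit message only gestures at the SMT precedent, a minimal sketch of the static_key pattern being introduced follows. Functions suffixed _sketch are made up for illustration; the real guard and enable sites are in the diff below.

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);

/* Fast path: with jump-label support this compiles to a patched
 * nop/jump, so symmetric systems pay essentially nothing. */
static int wake_cap_sketch(void)
{
	if (!static_branch_unlikely(&sched_asym_cpucapacity))
		return 0;	/* symmetric system: skip the extra work */

	/* ... asymmetry-aware heuristics only run past this point ... */
	return 1;
}

/* Slow path: flipped from build_sched_domains() when an
 * SD_ASYM_CPUCAPACITY level was detected; that path runs with the cpus
 * read lock held, hence the _cpuslocked variant. */
static void enable_asym_sketch(void)
{
	static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
}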
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-2-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 +++ kernel/sched/sched.h | 1 + kernel/sched/topology.c | 9 ++++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f808ddf2a868..3e5071aeb117 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6188,6 +6188,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) { long min_cap, max_cap; + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return 0; + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4a2e8cae63c4..0f36adc31ba5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1185,6 +1185,7 @@ DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_asym); +extern struct static_key_false sched_asym_cpucapacity; struct sched_group_capacity { atomic_t ref; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5c4d583d53ee..b0cdf5e95bda 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -398,6 +398,7 @@ DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain *, sd_numa); DEFINE_PER_CPU(struct sched_domain *, sd_asym); +DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); static void update_top_cache_domain(int cpu) { @@ -1705,6 +1706,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct rq *rq = NULL; int i, ret = -ENOMEM; struct sched_domain_topology_level *tl_asym; + bool has_asym = false; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) @@ -1720,8 +1722,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att for_each_sd_topology(tl) { int dflags = 0; - if (tl == tl_asym) + if (tl == tl_asym) { dflags |= SD_ASYM_CPUCAPACITY; + has_asym = true; + } sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); @@ -1773,6 +1777,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } rcu_read_unlock(); + if (has_asym) + static_branch_enable_cpuslocked(&sched_asym_cpucapacity); + if (rq && sched_debug_enabled) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); From 3b1baa6496e6b7ad016342a9d256bdfb072ce902 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:40 +0100 Subject: [PATCH 06/28] sched/fair: Add 'group_misfit_task' load-balance type To maximize throughput in systems with asymmetric CPU capacities (e.g. ARM big.LITTLE) load-balancing has to consider task and CPU utilization as well as per-CPU compute capacity when load-balancing in addition to the current average load based load-balancing policy. Tasks with high utilization that are scheduled on a lower capacity CPU need to be identified and migrated to a higher capacity CPU if possible to maximize throughput. 
To implement this additional policy an additional group_type (load-balance scenario) is added: 'group_misfit_task'. This represents scenarios where a sched_group has one or more tasks that are not suitable for its per-CPU capacity. 'group_misfit_task' is only considered if the system is not overloaded or imbalanced ('group_imbalanced' or 'group_overloaded'). Identifying misfit tasks requires the rq lock to be held. To avoid taking remote rq locks to examine source sched_groups for misfit tasks, each CPU is responsible for tracking misfit tasks themselves and update the rq->misfit_task flag. This means checking task utilization when tasks are scheduled and on sched_tick. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 54 +++++++++++++++++++++++++++++++++++++------- kernel/sched/sched.h | 2 ++ 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3e5071aeb117..6e04bea5b11a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -693,6 +693,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); +static unsigned long capacity_of(int cpu); /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) @@ -1446,7 +1447,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, static unsigned long weighted_cpuload(struct rq *rq); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long capacity_of(int cpu); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -3647,6 +3647,29 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) WRITE_ONCE(p->se.avg.util_est, ue); } +static inline int task_fits_capacity(struct task_struct *p, long capacity) +{ + return capacity * 1024 > task_util_est(p) * capacity_margin; +} + +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) +{ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return; + + if (!p) { + rq->misfit_task_load = 0; + return; + } + + if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { + rq->misfit_task_load = 0; + return; + } + + rq->misfit_task_load = task_h_load(p); +} + #else /* CONFIG_SMP */ #define UPDATE_TG 0x0 @@ -3676,6 +3699,7 @@ util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) {} +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -6201,7 +6225,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) /* Bring task utilization in sync with prev_cpu */ sync_entity_load_avg(&p->se); - return min_cap * 1024 < task_util(p) * capacity_margin; + return !task_fits_capacity(p, min_cap); } /* @@ -6618,9 +6642,12 @@ done: __maybe_unused; if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + update_misfit_status(p, rq); + return p; idle: + 
update_misfit_status(NULL, rq); new_tasks = idle_balance(rq, rf); /* @@ -6826,6 +6853,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -7399,12 +7433,6 @@ static unsigned long task_h_load(struct task_struct *p) /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -7420,6 +7448,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -7712,6 +7741,9 @@ group_type group_classify(struct sched_group *group, if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task_load) + return group_misfit_task; + return group_other; } @@ -7786,6 +7818,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, */ if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; + + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + sgs->group_misfit_task_load < rq->misfit_task_load) + sgs->group_misfit_task_load = rq->misfit_task_load; } /* Adjust by relative CPU capacity of the group */ @@ -9567,6 +9603,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + + update_misfit_status(curr, rq); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0f36adc31ba5..7dbf67d147a2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -842,6 +842,8 @@ struct rq { unsigned char idle_balance; + unsigned long misfit_task_load; + /* For active balancing */ int active_balance; int push_cpu; From e3d6d0cb66f2351cbfd09fbae04eb9804afe9577 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:41 +0100 Subject: [PATCH 07/28] sched/fair: Add sched_group per-CPU max capacity The current sg->min_capacity tracks the lowest per-CPU compute capacity available in the sched_group when rt/irq pressure is taken into account. Minimum capacity isn't the ideal metric for tracking if a sched_group needs offloading to another sched_group in some scenarios, e.g. a sched_group with multiple CPUs where only one is under heavy pressure. Tracking maximum capacity isn't perfect either but a better choice for some situations as it indicates that the sched_group is definitely compute capacity constrained either due to rt/irq pressure on all CPUs or asymmetric CPU capacities (e.g. big.LITTLE).
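To make the margin arithmetic behind min_capacity/max_capacity concrete, here is an illustrative restatement with example numbers; it is not part of the patch, and it assumes capacity_margin = 1280, the kernel/sched/fair.c default of this era.

#include <linux/types.h>

static const unsigned long capacity_margin = 1280;	/* fair.c default at the time */

/* Mirrors group_smaller_max_cpu_capacity() from the diff below: true iff
 * sg's best CPU sits below roughly 80% (1024/1280) of ref's best CPU. */
static bool group_smaller_max_cpu_capacity_sketch(unsigned long sg_max,
						  unsigned long ref_max)
{
	return sg_max * capacity_margin < ref_max * 1024;
}

/*
 * Example, big.LITTLE with LITTLE max_capacity ~430 and big max 1024:
 *   430 * 1280 = 550400  <  1024 * 1024 = 1048576   -> true,
 * so a LITTLE group counts as capacity-constrained relative to a big
 * group, which is what gates the misfit pulls in the next patch.
 */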
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-4-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 24 ++++++++++++++++++++---- kernel/sched/sched.h | 1 + kernel/sched/topology.c | 2 ++ 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e04bea5b11a..fe04315d57b3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7557,13 +7557,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; + sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, min_capacity; + unsigned long capacity, min_capacity, max_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -7577,6 +7578,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity = 0; min_capacity = ULONG_MAX; + max_capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -7607,6 +7609,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } min_capacity = min(capacity, min_capacity); + max_capacity = max(capacity, max_capacity); } } else { /* @@ -7620,12 +7623,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity += sgc->capacity; min_capacity = min(sgc->min_capacity, min_capacity); + max_capacity = max(sgc->max_capacity, max_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = min_capacity; + sdg->sgc->max_capacity = max_capacity; } /* @@ -7721,16 +7726,27 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) } /* - * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller * per-CPU capacity than sched_group ref. */ static inline bool -group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { return sg->sgc->min_capacity * capacity_margin < ref->sgc->min_capacity * 1024; } +/* + * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller + * per-CPU capacity_orig than sched_group ref. + */ +static inline bool +group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->max_capacity * capacity_margin < + ref->sgc->max_capacity * 1024; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -7876,7 +7892,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * power/energy consequences are not considered. 
*/ if (sgs->sum_nr_running <= sgs->group_weight && - group_smaller_cpu_capacity(sds->local, sg)) + group_smaller_min_cpu_capacity(sds->local, sg)) return false; asym_packing: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7dbf67d147a2..fe17e0be2d7b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1197,6 +1197,7 @@ struct sched_group_capacity { */ unsigned long capacity; unsigned long min_capacity; /* Min per-CPU capacity in group */ + unsigned long max_capacity; /* Max per-CPU capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b0cdf5e95bda..2536e1b938f9 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -693,6 +693,7 @@ static void init_overlap_sched_group(struct sched_domain *sd, sg_span = sched_group_span(sg); sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; } static int @@ -852,6 +853,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; return sg; } From cad68e552e7774b68ae6a2c5fedb792936098b72 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:42 +0100 Subject: [PATCH 08/28] sched/fair: Consider misfit tasks when load-balancing On asymmetric CPU capacity systems load intensive tasks can end up on CPUs that don't suit their compute demand. In these scenarios 'misfit' tasks should be migrated to CPUs with higher compute capacity to ensure better throughput. group_misfit_task indicates this scenario, but tweaks to the load-balance code are needed to make the migrations happen. Misfit balancing only makes sense between a source group of lower per-CPU capacity and a destination group of higher compute capacity. Otherwise, misfit balancing is ignored. group_misfit_task has the lowest priority so any imbalance due to overload is dealt with first. The modifications are: 1. Only pick a group containing misfit tasks as the busiest group if the destination group has higher capacity and has spare capacity. 2. When the busiest group is a 'misfit' group, skip the usual average load and group capacity checks. 3. Set the imbalance for 'misfit' balancing sufficiently high for a task to be pulled ignoring average load. 4. Pick the CPU with the highest misfit load as the source CPU. 5. If the misfit task is alone on the source CPU, go for active balancing.
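As a worked illustration of what 'misfit' means numerically in the modifications above (the helper is a stand-in, and capacity_margin = 1280 is assumed, the fair.c default of this era):

#define CAPACITY_MARGIN	1280

/* Stand-in for task_fits_capacity() from the earlier misfit patch:
 * a task fits iff its estimated utilization stays below ~80%
 * (1024/1280) of the CPU's capacity. */
static int task_fits_capacity_sketch(unsigned long task_util,
				     unsigned long cpu_capacity)
{
	return cpu_capacity * 1024 > task_util * CAPACITY_MARGIN;
}

/*
 * Example: a task with util_est ~800 on a LITTLE CPU of capacity 430:
 *   430 * 1024 = 440320  <  800 * 1280 = 1024000   -> does not fit.
 * update_misfit_status() then records task_h_load(p) in
 * rq->misfit_task_load, group_classify() turns that into
 * group_misfit_task, and modification 3 above raises env->imbalance to
 * at least that load, so the task is pulled even though plain average
 * load would report the system as balanced.
 */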
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe04315d57b3..24fe39e57bc3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6890,6 +6890,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + enum group_type src_grp_type; struct list_head tasks; }; @@ -7873,6 +7874,17 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + /* + * Don't try to pull misfit tasks we can't help. + * We can use max_capacity here as reduction in capacity on some + * CPUs in the group should either be possible to resolve + * internally or be covered by avg_load imbalance (eventually). + */ + if (sgs->group_type == group_misfit_task && + (!group_smaller_max_cpu_capacity(sg, sds->local) || + !group_has_capacity(env, &sds->local_stat))) + return false; + if (sgs->group_type > busiest->group_type) return true; @@ -7895,6 +7907,13 @@ static bool update_sd_pick_busiest(struct lb_env *env, group_smaller_min_cpu_capacity(sds->local, sg)) return false; + /* + * If we have more than one misfit sg go with the biggest misfit. + */ + if (sgs->group_type == group_misfit_task && + sgs->group_misfit_task_load < busiest->group_misfit_task_load) + return false; + asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) @@ -8192,8 +8211,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * factors in sg capacity and sgs with smaller group_type are * skipped when updating the busiest sg: */ - if (busiest->avg_load <= sds->avg_load || - local->avg_load >= sds->avg_load) { + if (busiest->group_type != group_misfit_task && + (busiest->avg_load <= sds->avg_load || + local->avg_load >= sds->avg_load)) { env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -8227,6 +8247,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost imbalance to allow misfit task to be balanced. */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task_load); + } + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -8293,6 +8319,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; + /* Misfit tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) + goto force_balance; + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -8330,6 +8360,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ + env->src_grp_type = busiest->group_type; calculate_imbalance(env, &sds); return env->imbalance ? 
sds.busiest : NULL; @@ -8377,6 +8408,19 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; + /* + * For ASYM_CPUCAPACITY domains with misfit tasks we simply + * seek the "biggest" misfit task. + */ + if (env->src_grp_type == group_misfit_task) { + if (rq->misfit_task_load > busiest_load) { + busiest_load = rq->misfit_task_load; + busiest = rq; + } + + continue; + } + capacity = capacity_of(i); wl = weighted_cpuload(rq); @@ -8446,6 +8490,9 @@ static int need_active_balance(struct lb_env *env) return 1; } + if (env->src_grp_type == group_misfit_task) + return 1; + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } From 5fbdfae5221a5208ed8e7653fc1c4b31de420f74 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:43 +0100 Subject: [PATCH 09/28] sched/fair: Kick nohz balance if rq->misfit_task_load There already are a few conditions in nohz_kick_needed() to ensure a nohz kick is triggered, but they are not enough for some misfit task scenarios. Excluding asym packing, those are: - rq->nr_running >=2: Not relevant here because we are running a misfit task, it needs to be migrated regardless and potentially through active balance. - sds->nr_busy_cpus > 1: If there is only the misfit task being run on a group of low capacity CPUs, this will be evaluated to False. - rq->cfs.h_nr_running >=1 && check_cpu_capacity(): Not relevant here, misfit task needs to be migrated regardless of rt/IRQ pressure As such, this commit adds an rq->misfit_task_load condition to trigger a nohz kick. The idea to kick a nohz balance for misfit tasks originally came from Leo Yan , and a similar patch was submitted for the Android Common Kernel - see: https://lists.linaro.org/pipermail/eas-dev/2016-September/000551.html Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-6-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 24fe39e57bc3..e08287d3806f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9135,7 +9135,7 @@ static void nohz_balancer_kick(struct rq *rq) if (time_before(now, nohz.next_balance)) goto out; - if (rq->nr_running >= 2) { + if (rq->nr_running >= 2 || rq->misfit_task_load) { flags = NOHZ_KICK_MASK; goto out; } From dbbad719449e06d73db21598d6eee178f7a54b3b Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:44 +0100 Subject: [PATCH 10/28] sched/fair: Change 'prefer_sibling' type to bool This variable is entirely local to update_sd_lb_stats, so we can safely change its type and slightly clean up its initialisation. 
Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-7-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e08287d3806f..23017939ecab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7982,11 +7982,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - int load_idx, prefer_sibling = 0; + int load_idx; bool overload = false; - - if (child && child->flags & SD_PREFER_SIBLING) - prefer_sibling = 1; + bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; #ifdef CONFIG_NO_HZ_COMMON if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) From 575638d1047eb057a5cdf95cc0b3c084e1279508 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:45 +0100 Subject: [PATCH 11/28] sched/core: Change root_domain->overload type to int sizeof(_Bool) is implementation defined, so let's just go with 'int' as is done for other structures e.g. sched_domain_shared->has_idle_cores. The local 'overload' variable used in update_sd_lb_stats can remain bool, as it won't impact any struct layout and can be assigned to the root_domain field. Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-8-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fe17e0be2d7b..4d181478c5b8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -716,7 +716,7 @@ struct root_domain { cpumask_var_t online; /* Indicate more than one runnable task for any CPU */ - bool overload; + int overload; /* * The bit corresponding to a CPU gets set here if such CPU has more @@ -1698,7 +1698,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (prev_nr < 2 && rq->nr_running >= 2) { #ifdef CONFIG_SMP if (!rq->rd->overload) - rq->rd->overload = true; + rq->rd->overload = 1; #endif } From e90c8fe15a3bf93a23088bcf1a56a0fa391d4e50 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:46 +0100 Subject: [PATCH 12/28] sched/fair: Wrap rq->rd->overload accesses with READ/WRITE_ONCE() This variable can be read and set locklessly within update_sd_lb_stats(). As such, READ/WRITE_ONCE() are added to make sure nothing terribly wrong can happen because of the compiler. 
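A minimal illustration of what the annotations buy for a flag that is read and written without a lock; the struct is a stand-in, not the kernel's root_domain.

#include <linux/compiler.h>

struct rd_sketch {
	int overload;
};

static void mark_overloaded(struct rd_sketch *rd)
{
	/* WRITE_ONCE() forces a single store and keeps the compiler from
	 * tearing it or inventing extra stores around it. */
	if (!READ_ONCE(rd->overload))
		WRITE_ONCE(rd->overload, 1);
}

static int overloaded(struct rd_sketch *rd)
{
	/* READ_ONCE() forces exactly one load here; a plain read could be
	 * fused with, or repeated relative to, other plain reads. */
	return READ_ONCE(rd->overload);
}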
Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-9-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 +++--- kernel/sched/sched.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 23017939ecab..d9c4e97bfebd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8058,8 +8058,8 @@ next_group: if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - if (env->dst_rq->rd->overload != overload) - env->dst_rq->rd->overload = overload; + if (READ_ONCE(env->dst_rq->rd->overload) != overload) + WRITE_ONCE(env->dst_rq->rd->overload, overload); } } @@ -9502,7 +9502,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) rq_unpin_lock(this_rq, rf); if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + !READ_ONCE(this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4d181478c5b8..938063639793 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1697,8 +1697,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (prev_nr < 2 && rq->nr_running >= 2) { #ifdef CONFIG_SMP - if (!rq->rd->overload) - rq->rd->overload = 1; + if (!READ_ONCE(rq->rd->overload)) + WRITE_ONCE(rq->rd->overload, 1); #endif } From 757ffdd705ee942fc8150b17942d968601d2a15b Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:47 +0100 Subject: [PATCH 13/28] sched/fair: Set rq->rd->overload when misfit Idle balance is a great opportunity to pull a misfit task. However, there are scenarios where misfit tasks are present but idle balance is prevented by the overload flag. A good example of this is a workload of n identical tasks. Let's suppose we have a 2+2 Arm big.LITTLE system. We then spawn 4 fairly CPU-intensive tasks - for the sake of simplicity let's say they are just CPU hogs, even when running on big CPUs. They are identical tasks, so on an SMP system they should all end at (roughly) the same time. However, in our case the LITTLE CPUs are less performing than the big CPUs, so tasks running on the LITTLEs will have a longer completion time. This means that the big CPUs will complete their work earlier, at which point they should pull the tasks from the LITTLEs. What we want to happen is summarized as follows: a,b,c,d are our CPU-hogging tasks _ signifies idling LITTLE_0 | a a a a _ _ LITTLE_1 | b b b b _ _ ---------|------------- big_0 | c c c c a a big_1 | d d d d b b ^ ^ Tasks end on the big CPUs, idle balance happens and the misfit tasks are pulled straight away This however won't happen, because currently the overload flag is only set when there is any CPU that has more than one runnable task - which may very well not be the case here if our CPU-hogging workload is all there is to run. As such, this commit sets the overload flag in update_sg_lb_stats when a group is flagged as having a misfit task. 
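To connect the pieces, a condensed stand-in sketch of the producer and consumer of the flag follows; the real hunks are in this patch's diff and in the previous patches, and the types here are illustrative only.

struct root_domain_stub { int overload; };
struct rq_stub { unsigned long misfit_task_load; struct root_domain_stub *rd; };

/* Producer (this patch): a lone misfit task on a LITTLE CPU is now
 * enough to mark the root domain as having pullable load. */
static void stats_side_stub(struct rq_stub *little_rq)
{
	if (little_rq->misfit_task_load)
		little_rq->rd->overload = 1;	/* propagated with WRITE_ONCE() in the kernel */
}

/* Consumer (existing code): a newly idle big CPU checks this flag
 * before doing idle balance, so without the producer hook the pull in
 * the diagram above would never be attempted. */
static int worth_idle_balancing_stub(struct rq_stub *big_rq)
{
	return big_rq->rd->overload;		/* READ_ONCE() in the kernel */
}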
Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-10-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++++-- kernel/sched/sched.h | 6 +++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d9c4e97bfebd..8b228c5b3eb4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7793,7 +7793,7 @@ static bool update_nohz_stats(struct rq *rq, bool force) * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. - * @overload: Indicate more than one runnable task for any CPU. + * @overload: Indicate pullable load (e.g. >1 runnable task). */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, @@ -7837,8 +7837,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; if (env->sd->flags & SD_ASYM_CPUCAPACITY && - sgs->group_misfit_task_load < rq->misfit_task_load) + sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; + *overload = 1; + } } /* Adjust by relative CPU capacity of the group */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 938063639793..85b3a2bf6c2b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -715,7 +715,11 @@ struct root_domain { cpumask_var_t span; cpumask_var_t online; - /* Indicate more than one runnable task for any CPU */ + /* + * Indicate pullable load on at least one CPU, e.g: + * - More than one runnable task + * - Running task is misfit + */ int overload; /* From 4ad3831a9d4af5e36da5d44a3b9c6522d0353cee Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Wed, 4 Jul 2018 11:17:48 +0100 Subject: [PATCH 14/28] sched/fair: Don't move tasks to lower capacity CPUs unless necessary When lower capacity CPUs are load balancing and considering to pull something from a higher capacity group, we should not pull tasks from a CPU with only one task running as this is guaranteed to impede progress for that task. If there is more than one task running, load balance in the higher capacity group would have already made any possible moves to resolve imbalance and we should make better use of system compute capacity by moving a task if we still have more than one running. Signed-off-by: Chris Redpath Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-11-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8b228c5b3eb4..06ff75f4ac7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8423,6 +8423,17 @@ static struct rq *find_busiest_queue(struct lb_env *env, capacity = capacity_of(i); + /* + * For ASYM_CPUCAPACITY domains, don't pick a CPU that could + * eventually lead to active_balancing high->low capacity. 
+ * Higher per-CPU capacity is considered better than balancing + * average load. + */ + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + capacity_of(env->dst_cpu) < capacity && + rq->nr_running == 1) + continue; + wl = weighted_cpuload(rq); /* From 9c63e84db29bcf584040931ad97c2edd11e35f6c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:50 +0100 Subject: [PATCH 15/28] sched/core: Disable SD_PREFER_SIBLING on asymmetric CPU capacity domains The 'prefer sibling' sched_domain flag is intended to encourage spreading tasks to sibling sched_domain to take advantage of more caches and core for SMT systems. It has recently been changed to be on all non-NUMA topology level. However, spreading across domains with CPU capacity asymmetry isn't desirable, e.g. spreading from high capacity to low capacity CPUs even if high capacity CPUs aren't overutilized might give access to more cache but the CPU will be slower and possibly lead to worse overall throughput. To prevent this, we need to remove SD_PREFER_SIBLING on the sched_domain level immediately below SD_ASYM_CPUCAPACITY. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-13-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 2536e1b938f9..7ffad0d3a4eb 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1126,7 +1126,7 @@ sd_init(struct sched_domain_topology_level *tl, | 0*SD_SHARE_CPUCAPACITY | 0*SD_SHARE_PKG_RESOURCES | 0*SD_SERIALIZE - | 0*SD_PREFER_SIBLING + | 1*SD_PREFER_SIBLING | 0*SD_NUMA | sd_flags , @@ -1152,17 +1152,21 @@ sd_init(struct sched_domain_topology_level *tl, if (sd->flags & SD_ASYM_CPUCAPACITY) { struct sched_domain *t = sd; + /* + * Don't attempt to spread across CPUs of different capacities. + */ + if (sd->child) + sd->child->flags &= ~SD_PREFER_SIBLING; + for_each_lower_domain(t) t->flags |= SD_BALANCE_WAKE; } if (sd->flags & SD_SHARE_CPUCAPACITY) { - sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { - sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 117; sd->cache_nice_tries = 1; sd->busy_idx = 2; @@ -1173,6 +1177,7 @@ sd_init(struct sched_domain_topology_level *tl, sd->busy_idx = 3; sd->idle_idx = 2; + sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { sd->flags &= ~(SD_BALANCE_EXEC | @@ -1182,7 +1187,6 @@ sd_init(struct sched_domain_topology_level *tl, #endif } else { - sd->flags |= SD_PREFER_SIBLING; sd->cache_nice_tries = 1; sd->busy_idx = 2; sd->idle_idx = 1; From 7e6f4c5d600c1c8e2a1d900e65cab319d9b6782e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Aug 2018 11:45:21 +0200 Subject: [PATCH 16/28] sched/debug: Explicitly cast sched_feat() to bool LLVM has a warning that tags expressions like: if (foo && non-bool-const) This pattern triggers for CONFIG_SCHED_DEBUG=n where sched_feat() ends up being whatever bit we select. Avoid the warning with an explicit cast to bool. 
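A self-contained reproduction of the warning class and the shape of the fix; the feature mask and macro names are made up, and the warning is, to the best of my knowledge, clang's -Wconstant-logical-operand.

#define FEATURES	0x12UL	/* stands in for the constant sysctl_sched_features */

/* Old shape: expands to a non-boolean constant, so an expression like
 * "cond && feat_old(4)" makes clang complain about using a constant
 * operand with logical '&&'. */
#define feat_old(bit)	(FEATURES & (1UL << (bit)))

/* New shape: '!!' collapses the mask to 0/1, silencing the warning
 * without changing the truth value. */
#define feat_new(bit)	!!(FEATURES & (1UL << (bit)))

static int check(int cond)
{
	return cond && feat_new(4);
}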
Reported-by: Philipp Klocke Tested-by: Nick Desaulniers Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 85b3a2bf6c2b..3a4ef8f73f08 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1401,7 +1401,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = 0; #undef SCHED_FEAT -#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ From d90707ebebe03596e19de3abbf79b766e72a3465 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 29 Aug 2018 15:19:09 +0200 Subject: [PATCH 17/28] sched/numa: Remove unused code from update_numa_stats() With: commit 2d4056fafa19 ("sched/numa: Remove numa_has_capacity()") the local variables 'smt', 'cpus' and 'capacity' and their results are not used anymore in numa_has_capacity() Remove this unused code. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Srikar Dronamraju Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1535548752-4434-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 06ff75f4ac7b..b65596fae06b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1463,8 +1463,7 @@ struct numa_stats { */ static void update_numa_stats(struct numa_stats *ns, int nid) { - int smt, cpu, cpus = 0; - unsigned long capacity; + int cpu; memset(ns, 0, sizeof(*ns)); for_each_cpu(cpu, cpumask_of_node(nid)) { @@ -1473,26 +1472,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(rq); ns->compute_capacity += capacity_of(cpu); - - cpus++; } - /* - * If we raced with hotplug and there are no CPUs left in our mask - * the @ns structure is NULL'ed and task_numa_compare() will - * not find this node attractive. - * - * We'll detect a huge imbalance and bail there. - */ - if (!cpus) - return; - - /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ - smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); - capacity = cpus / smt; /* cores */ - - capacity = min_t(unsigned, capacity, - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); } struct task_numa_env { From 7477a3504e619768c9e972dafe2907e6b8ed9823 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 29 Aug 2018 15:19:10 +0200 Subject: [PATCH 18/28] sched/numa: Remove unused numa_stats::nr_running field nr_running in struct numa_stats is not used anywhere in the code. Remove it. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Srikar Dronamraju Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1535548752-4434-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b65596fae06b..6bd142d19549 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1454,8 +1454,6 @@ struct numa_stats { /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; - - unsigned int nr_running; }; /* @@ -1469,7 +1467,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid) for_each_cpu(cpu, cpumask_of_node(nid)) { struct rq *rq = cpu_rq(cpu); - ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(rq); ns->compute_capacity += capacity_of(cpu); } From ff28915fd31ccafc0d38e6f84b66df280ed9e86a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 5 Sep 2018 11:36:36 +0200 Subject: [PATCH 19/28] sched/debug: Use symbolic names for task state constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit include/trace/events/sched.h includes (via ) and so knows about the TASK_* constants used to interpret .prev_state. So instead of duplicating the magic numbers make use of the defined macros to ease understanding the mapping from state bits to letters which isn't completely intuitive for an outsider. Signed-off-by: Uwe Kleine-König Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: kernel@pengutronix.de Link: http://lkml.kernel.org/r/20180905093636.24068-1-u.kleine-koenig@pengutronix.de Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 0be866c91f62..f07b270d4fc4 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -159,9 +159,14 @@ TRACE_EVENT(sched_switch, (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", - { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" }, - { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }, - { 0x40, "P" }, { 0x80, "I" }) : + { TASK_INTERRUPTIBLE, "S" }, + { TASK_UNINTERRUPTIBLE, "D" }, + { __TASK_STOPPED, "T" }, + { __TASK_TRACED, "t" }, + { EXIT_DEAD, "X" }, + { EXIT_ZOMBIE, "Z" }, + { TASK_PARKED, "P" }, + { TASK_DEAD, "I" }) : "R", __entry->prev_state & TASK_REPORT_MAX ? "+" : "", From ace8031099f91480799b5929b4cccf2dcacc5136 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 3 Aug 2018 20:37:32 +0800 Subject: [PATCH 20/28] sched/topology: Make local variables static Fix the following warnings: kernel/sched/topology.c:10:15: warning: symbol 'sched_domains_tmpmask' was not declared. Should it be static? kernel/sched/topology.c:11:15: warning: symbol 'sched_domains_tmpmask2' was not declared. Should it be static? 
Signed-off-by: zhong jiang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1533299852-26941-1-git-send-email-zhongjiang@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 7ffad0d3a4eb..9d74371e4aad 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -7,8 +7,8 @@ DEFINE_MUTEX(sched_domains_mutex); /* Protected by sched_domains_mutex: */ -cpumask_var_t sched_domains_tmpmask; -cpumask_var_t sched_domains_tmpmask2; +static cpumask_var_t sched_domains_tmpmask; +static cpumask_var_t sched_domains_tmpmask2; #ifdef CONFIG_SCHED_DEBUG From 11d4afd4ff667f9b6178ee8c142c36cb78bd84db Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 25 Sep 2018 11:17:42 +0200 Subject: [PATCH 21/28] sched/pelt: Fix warning and clean up IRQ PELT config Create a config for enabling irq load tracking in the scheduler. irq load tracking is useful only when irq or paravirtual time is accounted but it's only possible with SMP for now. Also use __maybe_unused to remove the compilation warning in update_rq_clock_task() that has been introduced by: 2e62c4743adc ("sched/fair: Remove #ifdefs from scale_rt_capacity()") Suggested-by: Ingo Molnar Reported-by: Dou Liyang Reported-by: Miguel Ojeda Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@alien8.de Cc: dou_liyang@163.com Fixes: 2e62c4743adc ("sched/fair: Remove #ifdefs from scale_rt_capacity()") Link: http://lkml.kernel.org/r/1537867062-27285-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- init/Kconfig | 5 +++++ kernel/sched/core.c | 7 +++---- kernel/sched/fair.c | 2 +- kernel/sched/pelt.c | 2 +- kernel/sched/pelt.h | 2 +- kernel/sched/sched.h | 5 ++--- 6 files changed, 13 insertions(+), 10 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 1e234e2f1cba..317d5ccb5191 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -415,6 +415,11 @@ config IRQ_TIME_ACCOUNTING If in doubt, say N here. +config HAVE_SCHED_AVG_IRQ + def_bool y + depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING + depends on SMP + config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ad97f3ba5ec5..f2caf1bae4a3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -135,9 +135,8 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) * In theory, the compile should just see 0 here, and optimize out the call * to sched_rt_avg_update. But I don't trust it... 
*/ -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - s64 steal = 0, irq_delta = 0; -#endif + s64 __maybe_unused steal = 0, irq_delta = 0; + #ifdef CONFIG_IRQ_TIME_ACCOUNTING irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; @@ -177,7 +176,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->clock_task += delta; -#ifdef HAVE_SCHED_AVG_IRQ +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) update_irq_load_avg(rq, irq_delta + steal); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1d92ed2eca8b..d59307ecd67d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7317,7 +7317,7 @@ static inline bool others_have_blocked(struct rq *rq) if (READ_ONCE(rq->avg_dl.util_avg)) return true; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ if (READ_ONCE(rq->avg_irq.util_avg)) return true; #endif diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 35475c0c5419..48a126486435 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -358,7 +358,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* * irq: * diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index d2894db28955..7e56b489ff32 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -6,7 +6,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ int update_irq_load_avg(struct rq *rq, u64 running); #else static inline int diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 632804fa0b12..798b1afd5092 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -862,8 +862,7 @@ struct rq { struct sched_avg avg_rt; struct sched_avg avg_dl; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) -#define HAVE_SCHED_AVG_IRQ +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif u64 idle_stamp; @@ -2223,7 +2222,7 @@ static inline unsigned long cpu_util_rt(struct rq *rq) } #endif -#ifdef HAVE_SCHED_AVG_IRQ +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) { return rq->avg_irq.util_avg; From fdf5f315d5cfaefb7bb8a62ec4bf37b9891837aa Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 9 Aug 2018 14:57:53 +0100 Subject: [PATCH 22/28] sched/fair: Disable LB_BIAS by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LB_BIAS allows the adjustment on how conservative load should be balanced. The rq->cpu_load[idx] array is used for this functionality. It contains weighted CPU load decayed average values over different intervals (idx = 1..4). Idx = 0 is the weighted CPU load itself. The values are updated during scheduler_tick, before idle balance and at nohz exit. There are 5 different types of idx's per sched domain (sd). Each of them is used to index into the rq->cpu_load[idx] array in a specific scenario (busy, idle and newidle for load balancing, forkexec for wake-up slow-path load balancing and wake for affine wakeup based on weight). 
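[ For reference, the LB_BIAS-controlled biasing described below lives in
  source_load()/target_load(); a simplified sketch of those helpers as they
  look around this kernel version (kernel/sched/fair.c, details may vary
  between releases):

      static unsigned long source_load(int cpu, int type)
      {
              struct rq *rq = cpu_rq(cpu);
              unsigned long total = weighted_cpuload(rq);

              /* idx 0 or LB_BIAS off: use the instantaneous load only */
              if (type == 0 || !sched_feat(LB_BIAS))
                      return total;

              /* underestimate the source CPU: min of current and history */
              return min(rq->cpu_load[type-1], total);
      }

      static unsigned long target_load(int cpu, int type)
      {
              struct rq *rq = cpu_rq(cpu);
              unsigned long total = weighted_cpuload(rq);

              if (type == 0 || !sched_feat(LB_BIAS))
                      return total;

              /* overestimate the target CPU: max of current and history */
              return max(rq->cpu_load[type-1], total);
      }

  With LB_BIAS false, both helpers reduce to weighted_cpuload(), which is what
  would eventually allow the rq->cpu_load[] history to be removed entirely. ]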
Only the sd idx's for busy and idle load balancing are set to 2,3 or 1,2 respectively. All the other sd idx's are set to 0. Conservative load balancing is achieved for sd idx's >= 1 by using the min/max (source_load()/target_load()) value between the current weighted CPU load and the rq->cpu_load[sd idx -1] for the busiest(idlest)/local CPU load in load balancing or vice versa in the wake-up slow-path load balancing. There is no conservative balancing for sd idx = 0 since only current weighted CPU load is used in this case. It is very likely that LB_BIAS' influence on load balancing can be neglected (see test results below). This is further supported by: (1) Weighted CPU load today is by itself a decayed average value (PELT) (cfs_rq->avg->runnable_load_avg) and not the instantaneous load (rq->load.weight) it was when LB_BIAS was introduced. (2) Sd imbalance_pct is used for CPU_NEWLY_IDLE and CPU_NOT_IDLE (relate to sd's newidle and busy idx) in find_busiest_group() when comparing busiest and local avg load to make load balancing even more conservative. (3) The sd forkexec and newidle idx are always set to 0 so there is no adjustment on how conservatively load balancing is done here. (4) Affine wakeup based on weight (wake_affine_weight()) will not be impacted since the sd wake idx is always set to 0. Let's disable LB_BIAS by default for a few kernel releases to make sure that no workload and no scheduler topology is affected. The benefit of being able to remove the LB_BIAS dependency from source_load() and target_load() is that the entire rq->cpu_load[idx] code could be removed in this case. It is really hard to say if there is no regression w/o testing this with a lot of different workloads on a lot of different platforms, especially NUMA machines. The following 104 LKP (Linux Kernel Performance) tests were run by the 0-Day guys mostly on multi-socket hosts with a larger number of logical cpus (88, 192). The base for the test was commit b3dae109fa89 ("sched/swait: Rename to exclusive") (tip/sched/core v4.18-rc1). Only 2 out of the 104 tests had a significant change in one of the metrics (fsmark/1x-1t-1HDD-btrfs-nfsv4-4M-60G-NoSync-performance +7% files_per_sec, unixbench/300s-100%-syscall-performance -11% score). Tests which showed a change in one of the metrics are marked with a '*' and this change is listed as well. 
(a) lkp-bdw-ep3: 88 threads Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz 64G dd-write/10m-1HDD-cfq-btrfs-100dd-performance fsmark/1x-1t-1HDD-xfs-nfsv4-4M-60G-NoSync-performance * fsmark/1x-1t-1HDD-btrfs-nfsv4-4M-60G-NoSync-performance 7.50 7% 8.00 ± 6% fsmark.files_per_sec fsmark/1x-1t-1HDD-btrfs-nfsv4-4M-60G-fsyncBeforeClose-performance fsmark/1x-1t-1HDD-btrfs-4M-60G-NoSync-performance fsmark/1x-1t-1HDD-btrfs-4M-60G-fsyncBeforeClose-performance kbuild/300s-50%-vmlinux_prereq-performance kbuild/300s-200%-vmlinux_prereq-performance kbuild/300s-50%-vmlinux_prereq-performance-1HDD-ext4 kbuild/300s-200%-vmlinux_prereq-performance-1HDD-ext4 (b) lkp-skl-4sp1: 192 threads Intel(R) Xeon(R) Platinum 8160 768G dbench/100%-performance ebizzy/200%-100x-10s-performance hackbench/1600%-process-pipe-performance iperf/300s-cs-localhost-tcp-performance iperf/300s-cs-localhost-udp-performance perf-bench-numa-mem/2t-300M-performance perf-bench-sched-pipe/10000000ops-process-performance perf-bench-sched-pipe/10000000ops-threads-performance schbench/2-16-300-30000-30000-performance tbench/100%-cs-localhost-performance (c) lkp-bdw-ep6: 88 threads Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz 128G stress-ng/100%-60s-pipe-performance unixbench/300s-1-whetstone-double-performance unixbench/300s-1-shell1-performance unixbench/300s-1-shell8-performance unixbench/300s-1-pipe-performance * unixbench/300s-1-context1-performance 312 315 unixbench.score unixbench/300s-1-spawn-performance unixbench/300s-1-syscall-performance unixbench/300s-1-dhry2reg-performance unixbench/300s-1-fstime-performance unixbench/300s-1-fsbuffer-performance unixbench/300s-1-fsdisk-performance unixbench/300s-100%-whetstone-double-performance unixbench/300s-100%-shell1-performance unixbench/300s-100%-shell8-performance unixbench/300s-100%-pipe-performance unixbench/300s-100%-context1-performance unixbench/300s-100%-spawn-performance * unixbench/300s-100%-syscall-performance 3571 ± 3% -11% 3183 ± 4% unixbench.score unixbench/300s-100%-dhry2reg-performance unixbench/300s-100%-fstime-performance unixbench/300s-100%-fsbuffer-performance unixbench/300s-100%-fsdisk-performance unixbench/300s-1-execl-performance unixbench/300s-100%-execl-performance * will-it-scale/brk1-performance 365004 360387 will-it-scale.per_thread_ops * will-it-scale/dup1-performance 432401 437596 will-it-scale.per_thread_ops will-it-scale/eventfd1-performance will-it-scale/futex1-performance will-it-scale/futex2-performance will-it-scale/futex3-performance will-it-scale/futex4-performance will-it-scale/getppid1-performance will-it-scale/lock1-performance will-it-scale/lseek1-performance will-it-scale/lseek2-performance * will-it-scale/malloc1-performance 47025 45817 will-it-scale.per_thread_ops 77499 76529 will-it-scale.per_process_ops will-it-scale/malloc2-performance * will-it-scale/mmap1-performance 123399 120815 will-it-scale.per_thread_ops 152219 149833 will-it-scale.per_process_ops * will-it-scale/mmap2-performance 107327 104714 will-it-scale.per_thread_ops 136405 133765 will-it-scale.per_process_ops will-it-scale/open1-performance * will-it-scale/open2-performance 171570 168805 will-it-scale.per_thread_ops 532644 526202 will-it-scale.per_process_ops will-it-scale/page_fault1-performance will-it-scale/page_fault2-performance will-it-scale/page_fault3-performance will-it-scale/pipe1-performance will-it-scale/poll1-performance * will-it-scale/poll2-performance 176134 172848 will-it-scale.per_thread_ops 281361 275053 will-it-scale.per_process_ops 
will-it-scale/posix_semaphore1-performance will-it-scale/pread1-performance will-it-scale/pread2-performance will-it-scale/pread3-performance will-it-scale/pthread_mutex1-performance will-it-scale/pthread_mutex2-performance will-it-scale/pwrite1-performance will-it-scale/pwrite2-performance will-it-scale/pwrite3-performance * will-it-scale/read1-performance 1190563 1174833 will-it-scale.per_thread_ops * will-it-scale/read2-performance 1105369 1080427 will-it-scale.per_thread_ops will-it-scale/readseek1-performance * will-it-scale/readseek2-performance 261818 259040 will-it-scale.per_thread_ops will-it-scale/readseek3-performance * will-it-scale/sched_yield-performance 2408059 2382034 will-it-scale.per_thread_ops will-it-scale/signal1-performance will-it-scale/unix1-performance will-it-scale/unlink1-performance will-it-scale/unlink2-performance * will-it-scale/write1-performance 976701 961588 will-it-scale.per_thread_ops * will-it-scale/writeseek1-performance 831898 822448 will-it-scale.per_thread_ops * will-it-scale/writeseek2-performance 228248 225065 will-it-scale.per_thread_ops * will-it-scale/writeseek3-performance 226670 224058 will-it-scale.per_thread_ops will-it-scale/context_switch1-performance aim7/performance-fork_test-2000 * aim7/performance-brk_test-3000 74869 76676 aim7.jobs-per-min aim7/performance-disk_cp-3000 aim7/performance-disk_rd-3000 aim7/performance-sieve-3000 aim7/performance-page_test-3000 aim7/performance-creat-clo-3000 aim7/performance-mem_rtns_1-8000 aim7/performance-disk_wrt-8000 aim7/performance-pipe_cpy-8000 aim7/performance-ram_copy-8000 (d) lkp-avoton3: 8 threads Intel(R) Atom(TM) CPU C2750 @ 2.40GHz 16G netperf/ipv4-900s-200%-cs-localhost-TCP_STREAM-performance Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Fengguang Wu Cc: Li Zhijian Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180809135753.21077-1-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 85ae8488039c..858589b83377 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -39,7 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) -SCHED_FEAT(LB_BIAS, true) +SCHED_FEAT(LB_BIAS, false) /* * Decrement CPU capacity based on time not spent running tasks From 4a465e3ebbc8004ce4f7f08f6022ee8315a94edf Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 3 Aug 2018 15:05:38 +0100 Subject: [PATCH 23/28] sched/fair: Remove setting task's se->runnable_weight during PELT update A CFS (SCHED_OTHER, SCHED_BATCH or SCHED_IDLE policy) task's se->runnable_weight must always be in sync with its se->load.weight. se->runnable_weight is set to se->load.weight when the task is forked (init_entity_runnable_average()) or reniced (reweight_entity()). There are two cases in set_load_weight() which since they currently only set se->load.weight could lead to a situation in which se->load.weight is different to se->runnable_weight for a CFS task: (1) A task switches to SCHED_IDLE. (2) A SCHED_FIFO, SCHED_RR or SCHED_DEADLINE task which has been reniced (during which only its static priority gets set) switches to SCHED_OTHER or SCHED_BATCH. Set se->runnable_weight to se->load.weight in these two cases to prevent this. This eliminates the need to explicitly set it to se->load.weight during PELT updates in the CFS scheduler fastpath. 
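[ For context: the PELT fast path below feeds both fields into
  ___update_load_avg() via two helpers that must therefore stay in sync;
  roughly (kernel/sched/sched.h around this kernel version):

      static inline long se_weight(struct sched_entity *se)
      {
              return scale_load_down(se->load.weight);
      }

      static inline long se_runnable(struct sched_entity *se)
      {
              return scale_load_down(se->runnable_weight);
      }

  If se->runnable_weight drifts from se->load.weight for a plain CFS task, its
  runnable_load_avg silently diverges from its load_avg - hence keeping the two
  in sync in set_load_weight() instead of patching it up on every PELT update. ]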
Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Joel Fernandes Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20180803140538.1178-1-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ kernel/sched/pelt.c | 6 ------ 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f2caf1bae4a3..56b3c1781276 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -700,6 +700,7 @@ static void set_load_weight(struct task_struct *p, bool update_load) if (idle_policy(p->policy)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; + p->se.runnable_weight = load->weight; return; } @@ -712,6 +713,7 @@ static void set_load_weight(struct task_struct *p, bool update_load) } else { load->weight = scale_load(sched_prio_to_weight[prio]); load->inv_weight = sched_prio_to_wmult[prio]; + p->se.runnable_weight = load->weight; } } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 48a126486435..90fb5bc12ad4 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -269,9 +269,6 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) { - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; @@ -282,9 +279,6 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, cfs_rq->curr == se)) { From 9c2298aad355d8c1957df3015448fef333526934 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 4 Oct 2018 11:05:14 +0200 Subject: [PATCH 24/28] sched/core: Fix comment regarding nr_iowait_cpu() and get_iowait_load() The comment related to nr_iowait_cpu() and get_iowait_load() confuses cpufreq with cpuidle and is not very useful for this reason, so fix it. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Linux PM Cc: Tejun Heo Cc: Thomas Gleixner Fixes: e33a9bba85a8 "sched/core: move IO scheduling accounting from io_schedule_timeout() into scheduler" Link: http://lkml.kernel.org/r/3803514.xkx7zY50tF@aspire.rjw.lan Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 56b3c1781276..fe0223121883 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2916,10 +2916,10 @@ unsigned long nr_iowait(void) } /* - * Consumers of these two interfaces, like for example the cpufreq menu - * governor are using nonsensical data. Boosting frequency for a CPU that has - * IO-wait which might not even end up running the task when it does become - * runnable. + * Consumers of these two interfaces, like for example the cpuidle menu + * governor, are using nonsensical data. Preferring shallow idle state selection + * for a CPU that has IO-wait which might not even end up running the task when + * it does become runnable. 
*/ unsigned long nr_iowait_cpu(int cpu) From d0e7d14455d41163126afecd0fcce935463cc512 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Oct 2018 19:22:27 +0200 Subject: [PATCH 25/28] cpu/SMT: State SMT is disabled even with nosmt and without "=force" When booting with "nosmt=force" a message is issued into dmesg to confirm that SMT has been force-disabled but such a message is not issued when only "nosmt" is on the kernel command line. Fix that. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20181004172227.10094-1-bp@alien8.de Signed-off-by: Ingo Molnar --- kernel/cpu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index 0097acec1c71..f1338452d998 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -362,6 +362,7 @@ void __init cpu_smt_disable(bool force) pr_info("SMT: Force disabled\n"); cpu_smt_control = CPU_SMT_FORCE_DISABLED; } else { + pr_info("SMT: disabled\n"); cpu_smt_control = CPU_SMT_DISABLED; } } From 7b6abce7e1e69b6d8dc5d40a8cb9ddaeb400427c Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 10 Oct 2018 22:56:32 +0800 Subject: [PATCH 26/28] sched/completions/Documentation: Fix a couple of punctuation nits This patch fixes a couple of punctuation nits which can make the document more correct and readable. Also missing "()" are added to some function references for consistency. Signed-off-by: John Garry Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: corbet@lwn.net Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/1539183392-239389-1-git-send-email-john.garry@huawei.com Signed-off-by: Ingo Molnar --- Documentation/scheduler/completion.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/scheduler/completion.txt b/Documentation/scheduler/completion.txt index 656cf803c006..108bd0f264b3 100644 --- a/Documentation/scheduler/completion.txt +++ b/Documentation/scheduler/completion.txt @@ -116,7 +116,7 @@ A typical usage scenario is: This is not implying any temporal order on wait_for_completion() and the call to complete() - if the call to complete() happened before the call to wait_for_completion() then the waiting side simply will continue -immediately as all dependencies are satisfied if not it will block until +immediately as all dependencies are satisfied; if not, it will block until completion is signaled by complete(). Note that wait_for_completion() is calling spin_lock_irq()/spin_unlock_irq(), @@ -131,7 +131,7 @@ wait_for_completion(): The default behavior is to wait without a timeout and to mark the task as uninterruptible. wait_for_completion() and its variants are only safe in process context (as they can sleep) but not in atomic context, -interrupt context, with disabled irqs. or preemption is disabled - see also +interrupt context, with disabled irqs, or preemption is disabled - see also try_wait_for_completion() below for handling completion in atomic/interrupt context. @@ -224,7 +224,7 @@ queue spinlock. Any such concurrent calls to complete() or complete_all() probably are a design bug. Signaling completion from hard-irq context is fine as it will appropriately -lock with spin_lock_irqsave/spin_unlock_irqrestore and it will never sleep. +lock with spin_lock_irqsave()/spin_unlock_irqrestore() and it will never sleep. 
try_wait_for_completion()/completion_done(): From 0c373344b5c1eaa9e186368a32a169a2802be3ca Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Oct 2018 10:36:23 +0200 Subject: [PATCH 27/28] sched/completions/Documentation: Clean up the document some more Refresh the document: - Remove unnecessary liguistic complexity and improve the clarity of the text - Improve the explanations all around - Remove unnecessary and stale version info - Fix whitespace noise - Make pseudo-code match kernel style - Fix minor syntax errors in pseudo-code - Use consistent denotation - Mark multi-CPU sequences more explicitly - Unbreak line breaks - Use quotes to refer to 'struct completion' - Use 'IRQ context' and 'IRQs' consistently - Improve grammar - etc. Cc: John Garry Cc: Linus Torvalds Cc: Nicholas Mc Guire Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: corbet@lwn.net Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/1539183392-239389-1-git-send-email-john.garry@huawei.com Signed-off-by: Ingo Molnar --- Documentation/scheduler/completion.txt | 229 +++++++++++++------------ 1 file changed, 123 insertions(+), 106 deletions(-) diff --git a/Documentation/scheduler/completion.txt b/Documentation/scheduler/completion.txt index 108bd0f264b3..91a11a668354 100644 --- a/Documentation/scheduler/completion.txt +++ b/Documentation/scheduler/completion.txt @@ -1,146 +1,161 @@ -completions - wait for completion handling -========================================== - -This document was originally written based on 3.18.0 (linux-next) +Completions - "wait for completion" barrier APIs +================================================ Introduction: ------------- -If you have one or more threads of execution that must wait for some process +If you have one or more threads that must wait for some kernel activity to have reached a point or a specific state, completions can provide a race-free solution to this problem. Semantically they are somewhat like a -pthread_barrier and have similar use-cases. +pthread_barrier() and have similar use-cases. Completions are a code synchronization mechanism which is preferable to any -misuse of locks. Any time you think of using yield() or some quirky -msleep(1) loop to allow something else to proceed, you probably want to -look into using one of the wait_for_completion*() calls instead. The -advantage of using completions is clear intent of the code, but also more -efficient code as both threads can continue until the result is actually -needed. +misuse of locks/semaphores and busy-loops. Any time you think of using +yield() or some quirky msleep(1) loop to allow something else to proceed, +you probably want to look into using one of the wait_for_completion*() +calls and complete() instead. -Completions are built on top of the generic event infrastructure in Linux, -with the event reduced to a simple flag (appropriately called "done") in -struct completion that tells the waiting threads of execution if they -can continue safely. +The advantage of using completions is that they have a well defined, focused +purpose which makes it very easy to see the intent of the code, but they +also result in more efficient code as all threads can continue execution +until the result is actually needed, and both the waiting and the signalling +is highly efficient using low level scheduler sleep/wakeup facilities. -As completions are scheduling related, the code is found in +Completions are built on top of the waitqueue and wakeup infrastructure of +the Linux scheduler. 
The event the threads on the waitqueue are waiting for +is reduced to a simple flag in 'struct completion', appropriately called "done". + +As completions are scheduling related, the code can be found in kernel/sched/completion.c. Usage: ------ -There are three parts to using completions, the initialization of the -struct completion, the waiting part through a call to one of the variants of -wait_for_completion() and the signaling side through a call to complete() -or complete_all(). Further there are some helper functions for checking the -state of completions. +There are three main parts to using completions: -To use completions one needs to include and -create a variable of type struct completion. The structure used for -handling of completions is: + - the initialization of the 'struct completion' synchronization object + - the waiting part through a call to one of the variants of wait_for_completion(), + - the signaling side through a call to complete() or complete_all(). + +There are also some helper functions for checking the state of completions. +Note that while initialization must happen first, the waiting and signaling +part can happen in any order. I.e. it's entirely normal for a thread +to have marked a completion as 'done' before another thread checks whether +it has to wait for it. + +To use completions you need to #include and +create a static or dynamic variable of type 'struct completion', +which has only two fields: struct completion { unsigned int done; wait_queue_head_t wait; }; -providing the wait queue to place tasks on for waiting and the flag for -indicating the state of affairs. +This provides the ->wait waitqueue to place tasks on for waiting (if any), and +the ->done completion flag for indicating whether it's completed or not. -Completions should be named to convey the intent of the waiter. A good -example is: +Completions should be named to refer to the event that is being synchronized on. +A good example is: wait_for_completion(&early_console_added); complete(&early_console_added); -Good naming (as always) helps code readability. +Good, intuitive naming (as always) helps code readability. Naming a completion +'complete' is not helpful unless the purpose is super obvious... Initializing completions: ------------------------- -Initialization of dynamically allocated completions, often embedded in -other structures, is done with: +Initialization of dynamically allocated completion objects, often embedded in +other structures, is done via a call to init_completion(): - void init_completion(&done); + init_completion(&dynamic_object->done); -Initialization is accomplished by initializing the wait queue and setting -the default state to "not available", that is, "done" is set to 0. +In this call we initialize the waitqueue and set ->done to 0, i.e. "not completed" +or "not done". The re-initialization function, reinit_completion(), simply resets the -done element to "not available", thus again to 0, without touching the -wait queue. Calling init_completion() twice on the same completion object is +->done field to 0 ("not done"), without touching the waitqueue. +Callers of this function must make sure that there are no racy +wait_for_completion() calls going on in parallel. + +Calling init_completion() on the same completion object twice is most likely a bug as it re-initializes the queue to an empty queue and -enqueued tasks could get "lost" - use reinit_completion() in that case. 
+enqueued tasks could get "lost" - use reinit_completion() in that case, +but be aware of other races. -For static declaration and initialization, macros are available. These are: +For static declaration and initialization, macros are available. - static DECLARE_COMPLETION(setup_done) +For static (or global) declarations in file scope you can use DECLARE_COMPLETION(): -used for static declarations in file scope. Within functions the static -initialization should always use: + static DECLARE_COMPLETION(setup_done); + DECLARE_COMPLETION(setup_done); + +Note that in this case the completion is boot time (or module load time) +initialized to 'not done' and doesn't require an init_completion() call. + +When a completion is declared as a local variable within a function, +then the initialization should always use: DECLARE_COMPLETION_ONSTACK(setup_done) -suitable for automatic/local variables on the stack and will make lockdep -happy. Note also that one needs to make *sure* the completion passed to -work threads remains in-scope, and no references remain to on-stack data -when the initiating function returns. - -Using on-stack completions for code that calls any of the _timeout or -_interruptible/_killable variants is not advisable as they will require -additional synchronization to prevent the on-stack completion object in -the timeout/signal cases from going out of scope. Consider using dynamically -allocated completions when intending to use the _interruptible/_killable -or _timeout variants of wait_for_completion(). +A simple DECLARE_COMPLETION() on the stack makes lockdep unhappy. +Note that when using completion objects as local variables you must be +aware of the short life time of the function stack: the function must +not return to a calling context until all activities (such as waiting +threads) have ceased and the completion is ... completely unused. Waiting for completions: ------------------------ -For a thread of execution to wait for some concurrent work to finish, it -calls wait_for_completion() on the initialized completion structure. +For a thread to wait for some concurrent activity to finish, it +calls wait_for_completion() on the initialized completion structure: + + void wait_for_completion(struct completion *done) + A typical usage scenario is: + CPU#1 CPU#2 + struct completion setup_done; + init_completion(&setup_done); - initialize_work(...,&setup_done,...) + initialize_work(...,&setup_done,...); - /* run non-dependent code */ /* do setup */ + /* run non-dependent code */ /* do setup */ - wait_for_completion(&setup_done); complete(setup_done) + wait_for_completion(&setup_done); complete(setup_done); -This is not implying any temporal order on wait_for_completion() and the -call to complete() - if the call to complete() happened before the call +This is not implying any particular order between wait_for_completion() and +the call to complete() - if the call to complete() happened before the call to wait_for_completion() then the waiting side simply will continue immediately as all dependencies are satisfied; if not, it will block until completion is signaled by complete(). Note that wait_for_completion() is calling spin_lock_irq()/spin_unlock_irq(), so it can only be called safely when you know that interrupts are enabled. -Calling it from hard-irq or irqs-off atomic contexts will result in -hard-to-detect spurious enabling of interrupts. 
- -wait_for_completion(): - - void wait_for_completion(struct completion *done): +Calling it from IRQs-off atomic contexts will result in hard-to-detect +spurious enabling of interrupts. The default behavior is to wait without a timeout and to mark the task as uninterruptible. wait_for_completion() and its variants are only safe in process context (as they can sleep) but not in atomic context, -interrupt context, with disabled irqs, or preemption is disabled - see also +interrupt context, with disabled IRQs, or preemption is disabled - see also try_wait_for_completion() below for handling completion in atomic/interrupt context. As all variants of wait_for_completion() can (obviously) block for a long -time, you probably don't want to call this with held mutexes. +time depending on the nature of the activity they are waiting for, so in +most cases you probably don't want to call this with held mutexes. -Variants available: -------------------- +wait_for_completion*() variants available: +------------------------------------------ The below variants all return status and this status should be checked in most(/all) cases - in cases where the status is deliberately not checked you @@ -148,51 +163,53 @@ probably want to make a note explaining this (e.g. see arch/arm/kernel/smp.c:__cpu_up()). A common problem that occurs is to have unclean assignment of return types, -so care should be taken with assigning return-values to variables of proper -type. Checking for the specific meaning of return values also has been found -to be quite inaccurate e.g. constructs like -if (!wait_for_completion_interruptible_timeout(...)) would execute the same -code path for successful completion and for the interrupted case - which is -probably not what you want. +so take care to assign return-values to variables of the proper type. + +Checking for the specific meaning of return values also has been found +to be quite inaccurate, e.g. constructs like: + + if (!wait_for_completion_interruptible_timeout(...)) + +... would execute the same code path for successful completion and for the +interrupted case - which is probably not what you want. int wait_for_completion_interruptible(struct completion *done) -This function marks the task TASK_INTERRUPTIBLE. If a signal was received -while waiting it will return -ERESTARTSYS; 0 otherwise. +This function marks the task TASK_INTERRUPTIBLE while it is waiting. +If a signal was received while waiting it will return -ERESTARTSYS; 0 otherwise. - unsigned long wait_for_completion_timeout(struct completion *done, - unsigned long timeout) + unsigned long wait_for_completion_timeout(struct completion *done, unsigned long timeout) The task is marked as TASK_UNINTERRUPTIBLE and will wait at most 'timeout' -(in jiffies). If timeout occurs it returns 0 else the remaining time in -jiffies (but at least 1). Timeouts are preferably calculated with -msecs_to_jiffies() or usecs_to_jiffies(). If the returned timeout value is -deliberately ignored a comment should probably explain why (e.g. see -drivers/mfd/wm8350-core.c wm8350_read_auxadc()) +jiffies. If a timeout occurs it returns 0, else the remaining time in +jiffies (but at least 1). - long wait_for_completion_interruptible_timeout( - struct completion *done, unsigned long timeout) +Timeouts are preferably calculated with msecs_to_jiffies() or usecs_to_jiffies(), +to make the code largely HZ-invariant. + +If the returned timeout value is deliberately ignored a comment should probably explain +why (e.g. 
see drivers/mfd/wm8350-core.c wm8350_read_auxadc()). + + long wait_for_completion_interruptible_timeout(struct completion *done, unsigned long timeout) This function passes a timeout in jiffies and marks the task as TASK_INTERRUPTIBLE. If a signal was received it will return -ERESTARTSYS; -otherwise it returns 0 if the completion timed out or the remaining time in +otherwise it returns 0 if the completion timed out, or the remaining time in jiffies if completion occurred. Further variants include _killable which uses TASK_KILLABLE as the -designated tasks state and will return -ERESTARTSYS if it is interrupted or -else 0 if completion was achieved. There is a _timeout variant as well: +designated tasks state and will return -ERESTARTSYS if it is interrupted, +or 0 if completion was achieved. There is a _timeout variant as well: long wait_for_completion_killable(struct completion *done) - long wait_for_completion_killable_timeout(struct completion *done, - unsigned long timeout) + long wait_for_completion_killable_timeout(struct completion *done, unsigned long timeout) The _io variants wait_for_completion_io() behave the same as the non-_io -variants, except for accounting waiting time as waiting on IO, which has -an impact on how the task is accounted in scheduling stats. +variants, except for accounting waiting time as 'waiting on IO', which has +an impact on how the task is accounted in scheduling/IO stats: void wait_for_completion_io(struct completion *done) - unsigned long wait_for_completion_io_timeout(struct completion *done - unsigned long timeout) + unsigned long wait_for_completion_io_timeout(struct completion *done, unsigned long timeout) Signaling completions: @@ -200,30 +217,30 @@ Signaling completions: A thread that wants to signal that the conditions for continuation have been achieved calls complete() to signal exactly one of the waiters that it can -continue. +continue: void complete(struct completion *done) -or calls complete_all() to signal all current and future waiters. +... or calls complete_all() to signal all current and future waiters: void complete_all(struct completion *done) The signaling will work as expected even if completions are signaled before a thread starts waiting. This is achieved by the waiter "consuming" -(decrementing) the done element of struct completion. Waiting threads +(decrementing) the done field of 'struct completion'. Waiting threads wakeup order is the same in which they were enqueued (FIFO order). If complete() is called multiple times then this will allow for that number of waiters to continue - each call to complete() will simply increment the -done element. Calling complete_all() multiple times is a bug though. Both -complete() and complete_all() can be called in hard-irq/atomic context safely. +done field. Calling complete_all() multiple times is a bug though. Both +complete() and complete_all() can be called in IRQ/atomic context safely. -There only can be one thread calling complete() or complete_all() on a -particular struct completion at any time - serialized through the wait +There can only be one thread calling complete() or complete_all() on a +particular 'struct completion' at any time - serialized through the wait queue spinlock. Any such concurrent calls to complete() or complete_all() probably are a design bug. 
-Signaling completion from hard-irq context is fine as it will appropriately +Signaling completion from IRQ context is fine as it will appropriately lock with spin_lock_irqsave()/spin_unlock_irqrestore() and it will never sleep. @@ -236,7 +253,7 @@ else it consumes one posted completion and returns true. bool try_wait_for_completion(struct completion *done) -Finally, to check the state of a completion without changing it in any way, +Finally, to check the state of a completion without changing it in any way, call completion_done(), which returns false if there are no posted completions that were not yet consumed by waiters (implying that there are waiters) and true otherwise; @@ -244,4 +261,4 @@ waiters) and true otherwise; bool completion_done(struct completion *done) Both try_wait_for_completion() and completion_done() are safe to be called in -hard-irq or atomic context. +IRQ or atomic context. From 11e13696a08e838ba48c72404c2b3f41429b5b20 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Tue, 16 Oct 2018 15:45:39 +0200 Subject: [PATCH 28/28] sched/completions/Documentation: Add recommendation for dynamic and ONSTACK completions To prevent dynamic completion objects from being de-allocated while still in use, add a recommendation to embed them in long lived data structures. Also add a note for the on-stack case that emphasizes the dangers of the limited scope, and recommends dynamic allocation if scope limitations are not clearly understood. [ mingo: Minor touch-ups of the text, expanded it a bit to make the warnings Nicholas added more prominent. ] Signed-off-by: Nicholas Mc Guire Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: john.garry@huawei.com Link: http://lkml.kernel.org/r/1539697539-24055-1-git-send-email-hofrat@osadl.org Signed-off-by: Ingo Molnar --- Documentation/scheduler/completion.txt | 42 +++++++++++++++++++++----- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/Documentation/scheduler/completion.txt b/Documentation/scheduler/completion.txt index 91a11a668354..2dbff579f957 100644 --- a/Documentation/scheduler/completion.txt +++ b/Documentation/scheduler/completion.txt @@ -70,8 +70,18 @@ Good, intuitive naming (as always) helps code readability. Naming a completion Initializing completions: ------------------------- -Initialization of dynamically allocated completion objects, often embedded in -other structures, is done via a call to init_completion(): +Dynamically allocated completion objects should preferably be embedded in data +structures that are assured to be alive for the life-time of the function/driver, +to prevent races with asynchronous complete() calls from occurring. + +Particular care should be taken when using the _timeout() or _killable()/_interruptible() +variants of wait_for_completion(), as it must be assured that memory de-allocation +does not happen until all related activities (complete() or reinit_completion()) +have taken place, even if these wait functions return prematurely due to a timeout +or a signal triggering. + +Initializing of dynamically allocated completion objects is done via a call to +init_completion(): init_completion(&dynamic_object->done); @@ -99,16 +109,32 @@ Note that in this case the completion is boot time (or module load time) initialized to 'not done' and doesn't require an init_completion() call. 
When a completion is declared as a local variable within a function, -then the initialization should always use: +then the initialization should always use DECLARE_COMPLETION_ONSTACK() +explicitly, not just to make lockdep happy, but also to make it clear +that limited scope had been considered and is intentional: DECLARE_COMPLETION_ONSTACK(setup_done) -A simple DECLARE_COMPLETION() on the stack makes lockdep unhappy. - Note that when using completion objects as local variables you must be -aware of the short life time of the function stack: the function must -not return to a calling context until all activities (such as waiting -threads) have ceased and the completion is ... completely unused. +acutely aware of the short life time of the function stack: the function +must not return to a calling context until all activities (such as waiting +threads) have ceased and the completion object is completely unused. + +To emphasise this again: in particular when using some of the waiting API variants +with more complex outcomes, such as the timeout or signalling (_timeout(), +_killable() and _interruptible()) variants, the wait might complete +prematurely while the object might still be in use by another thread - and a return +from the wait_on_completion*() caller function will deallocate the function +stack and cause subtle data corruption if a complete() is done in some +other thread. Simple testing might not trigger these kinds of races. + +If unsure, use dynamically allocated completion objects, preferably embedded +in some other long lived object that has a boringly long life time which +exceeds the life time of any helper threads using the completion object, +or has a lock or other synchronization mechanism to make sure complete() +is not called on a freed object. + +A naive DECLARE_COMPLETION() on the stack triggers a lockdep warning. Waiting for completions: ------------------------
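[ As a practical complement to the completion documentation updates in the
  last three patches, here is a minimal, hypothetical usage sketch (the foo_*
  names and the 100 ms timeout are illustrative only, not taken from the
  document) showing the recommended pattern of embedding the completion in a
  long-lived object and signalling it from IRQ context:

      #include <linux/completion.h>
      #include <linux/interrupt.h>
      #include <linux/jiffies.h>
      #include <linux/errno.h>

      /* Completion embedded in a long-lived per-device object, so it cannot
       * go out of scope while the IRQ handler might still complete() it. */
      struct foo_device {
              struct completion cmd_done;
              /* ... */
      };

      static void foo_device_init(struct foo_device *dev)
      {
              init_completion(&dev->cmd_done);
      }

      /* IRQ context: complete() never sleeps, so signalling here is fine. */
      static irqreturn_t foo_irq(int irq, void *data)
      {
              struct foo_device *dev = data;

              complete(&dev->cmd_done);
              return IRQ_HANDLED;
      }

      /* Process context: issue a command and wait for the IRQ, bounded by a
       * timeout computed with msecs_to_jiffies() as the document suggests. */
      static int foo_do_cmd(struct foo_device *dev)
      {
              reinit_completion(&dev->cmd_done);
              /* ... kick off the hardware command here ... */

              if (!wait_for_completion_timeout(&dev->cmd_done,
                                               msecs_to_jiffies(100)))
                      return -ETIMEDOUT;

              return 0;
      }

  The waiting side checks the return value of wait_for_completion_timeout()
  explicitly, as recommended above, rather than ignoring it. ]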