sched/numa: Replace runnable_load_avg by load_avg

Similarly to what has been done for the normal load balancer, we can
replace runnable_load_avg by load_avg in numa load balancing and track the
other statistics like the utilization and the number of running tasks to
get to better view of the current state of a node.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: "Dietmar Eggemann <dietmar.eggemann@arm.com>"
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Hillf Danton <hdanton@sina.com>
Link: https://lore.kernel.org/r/20200224095223.13361-6-mgorman@techsingularity.net
This commit is contained in:
Vincent Guittot 2020-02-24 09:52:15 +00:00 committed by Ingo Molnar
parent 6d4d22468d
commit 6499b1b2dd

View file

@ -1473,38 +1473,35 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
static unsigned long cpu_runnable_load(struct rq *rq)
{
return cfs_rq_runnable_load_avg(&rq->cfs);
}
/*
* 'numa_type' describes the node at the moment of load balancing.
*/
enum numa_type {
/* The node has spare capacity that can be used to run more tasks. */
node_has_spare = 0,
/*
* The node is fully used and the tasks don't compete for more CPU
* cycles. Nevertheless, some tasks might wait before running.
*/
node_fully_busy,
/*
* The node is overloaded and can't provide expected CPU cycles to all
* tasks.
*/
node_overloaded
};
/* Cached statistics for all CPUs within a node */
struct numa_stats {
unsigned long load;
unsigned long util;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
unsigned int nr_running;
unsigned int weight;
enum numa_type node_type;
};
/*
* XXX borrowed from update_sg_lb_stats
*/
static void update_numa_stats(struct numa_stats *ns, int nid)
{
int cpu;
memset(ns, 0, sizeof(*ns));
for_each_cpu(cpu, cpumask_of_node(nid)) {
struct rq *rq = cpu_rq(cpu);
ns->load += cpu_runnable_load(rq);
ns->compute_capacity += capacity_of(cpu);
}
}
struct task_numa_env {
struct task_struct *p;
@ -1521,6 +1518,47 @@ struct task_numa_env {
int best_cpu;
};
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_util(int cpu);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
struct numa_stats *ns)
{
if ((ns->nr_running > ns->weight) &&
((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
return node_overloaded;
if ((ns->nr_running < ns->weight) ||
((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
return node_has_spare;
return node_fully_busy;
}
/*
* XXX borrowed from update_sg_lb_stats
*/
static void update_numa_stats(struct task_numa_env *env,
struct numa_stats *ns, int nid)
{
int cpu;
memset(ns, 0, sizeof(*ns));
for_each_cpu(cpu, cpumask_of_node(nid)) {
struct rq *rq = cpu_rq(cpu);
ns->load += cpu_load(rq);
ns->util += cpu_util(cpu);
ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu);
}
ns->weight = cpumask_weight(cpumask_of_node(nid));
ns->node_type = numa_classify(env->imbalance_pct, ns);
}
static void task_numa_assign(struct task_numa_env *env,
struct task_struct *p, long imp)
{
@ -1556,6 +1594,11 @@ static bool load_too_imbalanced(long src_load, long dst_load,
long orig_src_load, orig_dst_load;
long src_capacity, dst_capacity;
/* If dst node has spare capacity, there is no real load imbalance */
if (env->dst_stats.node_type == node_has_spare)
return false;
/*
* The load is corrected for the CPU capacity available on each node.
*
@ -1788,10 +1831,10 @@ static int task_numa_migrate(struct task_struct *p)
dist = env.dist = node_distance(env.src_nid, env.dst_nid);
taskweight = task_weight(p, env.src_nid, dist);
groupweight = group_weight(p, env.src_nid, dist);
update_numa_stats(&env.src_stats, env.src_nid);
update_numa_stats(&env, &env.src_stats, env.src_nid);
taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);
update_numa_stats(&env, &env.dst_stats, env.dst_nid);
/* Try to find a spot on the preferred nid. */
task_numa_find_cpu(&env, taskimp, groupimp);
@ -1824,7 +1867,7 @@ static int task_numa_migrate(struct task_struct *p)
env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
update_numa_stats(&env, &env.dst_stats, env.dst_nid);
task_numa_find_cpu(&env, taskimp, groupimp);
}
}
@ -3686,11 +3729,6 @@ static void remove_entity_load_avg(struct sched_entity *se)
raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
{
return cfs_rq->avg.runnable_load_avg;
}
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
{
return cfs_rq->avg.load_avg;