|
|
|
@ -896,18 +896,24 @@ pid_t task_numa_group_id(struct task_struct *p)
|
|
|
|
|
return p->numa_group ? p->numa_group->gid : 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * task_faults_idx - flat array index of one NUMA hinting-fault counter.
 *
 * The three-argument form returns
 *   NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv
 * so each value of s selects a consecutive region holding
 * NR_NUMA_HINT_FAULT_TYPES slots for each of nr_node_ids nodes, and
 * (nid, priv) selects the slot inside that region.
 *
 * NOTE(review): this excerpt carries two signatures and two return
 * statements; they look like the before/after forms of one refactor.
 * Confirm which form applies before relying on either.
 */
static inline int task_faults_idx(int nid, int priv)
|
|
|
|
|
/*
|
|
|
|
|
* The averaged statistics, shared & private, memory & cpu,
|
|
|
|
|
* occupy the first half of the array. The second half of the
|
|
|
|
|
* array is for current counters, which are averaged into the
|
|
|
|
|
* first set by task_numa_placement.
|
|
|
|
|
*/
|
|
|
|
|
static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
|
|
|
|
|
{
|
|
|
|
|
/* Two-argument form: offset of (nid, priv) with no region term. */
return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
|
|
|
|
|
/* Three-argument form: s * nr_node_ids skips whole regions first. */
return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * task_faults - sum of the fault counters task p holds for node nid.
 *
 * Adds the counters at priv index 0 and priv index 1 for nid and
 * returns the total. Returns 0 when the task's fault array pointer
 * is NULL.
 *
 * NOTE(review): both the p->numa_faults_memory and the p->numa_faults
 * spellings appear below; they look like the before/after forms of one
 * refactor. Confirm which form applies.
 */
static inline unsigned long task_faults(struct task_struct *p, int nid)
|
|
|
|
|
{
|
|
|
|
|
if (!p->numa_faults_memory)
|
|
|
|
|
if (!p->numa_faults)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
|
/* Variant reading the dedicated numa_faults_memory array. */
return p->numa_faults_memory[task_faults_idx(nid, 0)] +
|
|
|
|
|
p->numa_faults_memory[task_faults_idx(nid, 1)];
|
|
|
|
|
/* Variant reading the NUMA_MEM region of the numa_faults array. */
return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
|
|
|
|
|
p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline unsigned long group_faults(struct task_struct *p, int nid)
|
|
|
|
@ -915,14 +921,14 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
|
|
|
|
|
if (!p->numa_group)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
return p->numa_group->faults[task_faults_idx(nid, 0)] +
|
|
|
|
|
p->numa_group->faults[task_faults_idx(nid, 1)];
|
|
|
|
|
return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
|
|
|
|
|
p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * group_faults_cpu - sum of a group's faults_cpu counters for node nid.
 *
 * Adds the faults_cpu entries at priv index 0 and priv index 1 for nid.
 * group is dereferenced without a NULL check in this function.
 *
 * NOTE(review): the three-argument variant indexes faults_cpu with
 * NUMA_MEM offsets; presumably faults_cpu is a separate per-region
 * array, so first-region offsets are the intended ones. Confirm
 * against the numa_group definition.
 */
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
|
|
|
|
|
{
|
|
|
|
|
return group->faults_cpu[task_faults_idx(nid, 0)] +
|
|
|
|
|
group->faults_cpu[task_faults_idx(nid, 1)];
|
|
|
|
|
return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
|
|
|
|
|
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Handle placement on systems where not all nodes are directly connected. */
|
|
|
|
@ -1001,7 +1007,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid,
|
|
|
|
|
{
|
|
|
|
|
unsigned long faults, total_faults;
|
|
|
|
|
|
|
|
|
|
if (!p->numa_faults_memory)
|
|
|
|
|
if (!p->numa_faults)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
total_faults = p->total_numa_faults;
|
|
|
|
@ -1517,7 +1523,7 @@ static void numa_migrate_preferred(struct task_struct *p)
|
|
|
|
|
unsigned long interval = HZ;
|
|
|
|
|
|
|
|
|
|
/* This task has no NUMA fault statistics yet */
|
|
|
|
|
if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
|
|
|
|
|
if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
/* Periodically retry migrating the task to the preferred node */
|
|
|
|
@ -1779,18 +1785,23 @@ static void task_numa_placement(struct task_struct *p)
|
|
|
|
|
|
|
|
|
|
/* Find the node with the highest number of faults */
|
|
|
|
|
for_each_online_node(nid) {
|
|
|
|
|
/* Keep track of the offsets in numa_faults array */
|
|
|
|
|
int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
|
|
|
|
|
unsigned long faults = 0, group_faults = 0;
|
|
|
|
|
int priv, i;
|
|
|
|
|
int priv;
|
|
|
|
|
|
|
|
|
|
for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
|
|
|
|
|
long diff, f_diff, f_weight;
|
|
|
|
|
|
|
|
|
|
i = task_faults_idx(nid, priv);
|
|
|
|
|
mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
|
|
|
|
|
membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
|
|
|
|
|
cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
|
|
|
|
|
cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
|
|
|
|
|
|
|
|
|
|
/* Decay existing window, copy faults since last scan */
|
|
|
|
|
diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
|
|
|
|
|
fault_types[priv] += p->numa_faults_buffer_memory[i];
|
|
|
|
|
p->numa_faults_buffer_memory[i] = 0;
|
|
|
|
|
diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
|
|
|
|
|
fault_types[priv] += p->numa_faults[membuf_idx];
|
|
|
|
|
p->numa_faults[membuf_idx] = 0;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Normalize the faults_from, so all tasks in a group
|
|
|
|
@ -1800,21 +1811,27 @@ static void task_numa_placement(struct task_struct *p)
|
|
|
|
|
* faults are less important.
|
|
|
|
|
*/
|
|
|
|
|
f_weight = div64_u64(runtime << 16, period + 1);
|
|
|
|
|
f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
|
|
|
|
|
f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
|
|
|
|
|
(total_faults + 1);
|
|
|
|
|
f_diff = f_weight - p->numa_faults_cpu[i] / 2;
|
|
|
|
|
p->numa_faults_buffer_cpu[i] = 0;
|
|
|
|
|
f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
|
|
|
|
|
p->numa_faults[cpubuf_idx] = 0;
|
|
|
|
|
|
|
|
|
|
p->numa_faults_memory[i] += diff;
|
|
|
|
|
p->numa_faults_cpu[i] += f_diff;
|
|
|
|
|
faults += p->numa_faults_memory[i];
|
|
|
|
|
p->numa_faults[mem_idx] += diff;
|
|
|
|
|
p->numa_faults[cpu_idx] += f_diff;
|
|
|
|
|
faults += p->numa_faults[mem_idx];
|
|
|
|
|
p->total_numa_faults += diff;
|
|
|
|
|
if (p->numa_group) {
|
|
|
|
|
/* safe because we can only change our own group */
|
|
|
|
|
p->numa_group->faults[i] += diff;
|
|
|
|
|
p->numa_group->faults_cpu[i] += f_diff;
|
|
|
|
|
/*
|
|
|
|
|
* safe because we can only change our own group
|
|
|
|
|
*
|
|
|
|
|
* mem_idx indexes the region at the beginning of the
|
|
|
|
|
* numa_faults array, so it is also the plain offset for
|
|
|
|
|
* a given nid and priv within any single region.
|
|
|
|
|
*/
|
|
|
|
|
p->numa_group->faults[mem_idx] += diff;
|
|
|
|
|
p->numa_group->faults_cpu[mem_idx] += f_diff;
|
|
|
|
|
p->numa_group->total_faults += diff;
|
|
|
|
|
group_faults += p->numa_group->faults[i];
|
|
|
|
|
group_faults += p->numa_group->faults[mem_idx];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -1886,7 +1903,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
|
|
|
|
|
node_set(task_node(current), grp->active_nodes);
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
|
|
|
|
|
grp->faults[i] = p->numa_faults_memory[i];
|
|
|
|
|
grp->faults[i] = p->numa_faults[i];
|
|
|
|
|
|
|
|
|
|
grp->total_faults = p->total_numa_faults;
|
|
|
|
|
|
|
|
|
@ -1945,8 +1962,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
|
|
|
|
|
double_lock_irq(&my_grp->lock, &grp->lock);
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
|
|
|
|
|
my_grp->faults[i] -= p->numa_faults_memory[i];
|
|
|
|
|
grp->faults[i] += p->numa_faults_memory[i];
|
|
|
|
|
my_grp->faults[i] -= p->numa_faults[i];
|
|
|
|
|
grp->faults[i] += p->numa_faults[i];
|
|
|
|
|
}
|
|
|
|
|
my_grp->total_faults -= p->total_numa_faults;
|
|
|
|
|
grp->total_faults += p->total_numa_faults;
|
|
|
|
@ -1971,14 +1988,14 @@ no_join:
|
|
|
|
|
void task_numa_free(struct task_struct *p)
|
|
|
|
|
{
|
|
|
|
|
struct numa_group *grp = p->numa_group;
|
|
|
|
|
void *numa_faults = p->numa_faults_memory;
|
|
|
|
|
void *numa_faults = p->numa_faults;
|
|
|
|
|
unsigned long flags;
|
|
|
|
|
int i;
|
|
|
|
|
|
|
|
|
|
if (grp) {
|
|
|
|
|
spin_lock_irqsave(&grp->lock, flags);
|
|
|
|
|
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
|
|
|
|
|
grp->faults[i] -= p->numa_faults_memory[i];
|
|
|
|
|
grp->faults[i] -= p->numa_faults[i];
|
|
|
|
|
grp->total_faults -= p->total_numa_faults;
|
|
|
|
|
|
|
|
|
|
list_del(&p->numa_entry);
|
|
|
|
@ -1988,10 +2005,7 @@ void task_numa_free(struct task_struct *p)
|
|
|
|
|
put_numa_group(grp);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
p->numa_faults_memory = NULL;
|
|
|
|
|
p->numa_faults_buffer_memory = NULL;
|
|
|
|
|
p->numa_faults_cpu= NULL;
|
|
|
|
|
p->numa_faults_buffer_cpu = NULL;
|
|
|
|
|
p->numa_faults = NULL;
|
|
|
|
|
kfree(numa_faults);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -2014,24 +2028,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
/* Allocate buffer to track faults on a per-node basis */
|
|
|
|
|
if (unlikely(!p->numa_faults_memory)) {
|
|
|
|
|
int size = sizeof(*p->numa_faults_memory) *
|
|
|
|
|
if (unlikely(!p->numa_faults)) {
|
|
|
|
|
int size = sizeof(*p->numa_faults) *
|
|
|
|
|
NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
|
|
|
|
|
|
|
|
|
|
p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
|
|
|
|
|
if (!p->numa_faults_memory)
|
|
|
|
|
p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
|
|
|
|
|
if (!p->numa_faults)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
BUG_ON(p->numa_faults_buffer_memory);
|
|
|
|
|
/*
|
|
|
|
|
* The averaged statistics, shared & private, memory & cpu,
|
|
|
|
|
* occupy the first half of the array. The second half of the
|
|
|
|
|
* array is for current counters, which are averaged into the
|
|
|
|
|
* first set by task_numa_placement.
|
|
|
|
|
*/
|
|
|
|
|
p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
|
|
|
|
|
p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
|
|
|
|
|
p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
|
|
|
|
|
p->total_numa_faults = 0;
|
|
|
|
|
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
|
|
|
|
|
}
|
|
|
|
@ -2071,8 +2075,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
|
|
|
|
|
if (migrated)
|
|
|
|
|
p->numa_pages_migrated += pages;
|
|
|
|
|
|
|
|
|
|
p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
|
|
|
|
|
p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
|
|
|
|
|
p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
|
|
|
|
|
p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
|
|
|
|
|
p->numa_faults_locality[local] += pages;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -5361,7 +5365,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
|
|
|
|
|
struct numa_group *numa_group = rcu_dereference(p->numa_group);
|
|
|
|
|
int src_nid, dst_nid;
|
|
|
|
|
|
|
|
|
|
if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
|
|
|
|
|
if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
|
|
|
|
|
!(env->sd->flags & SD_NUMA)) {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
@ -5400,7 +5404,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
|
|
|
|
|
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
|
|
|
|
|
if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
src_nid = cpu_to_node(env->src_cpu);
|
|
|
|
|