From 01c889cf4f2e16c9a21376f8583f4a2872d7d1da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: [PATCH 01/18] cpuset: remove unused cpuset_unlock() Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7bb63eea6eb8..854b8bfbc15c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2411,17 +2411,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) return 0; } -/** - * cpuset_unlock - release lock on cpuset changes - * - * Undo the lock taken in a previous cpuset_lock() call. - */ - -void cpuset_unlock(void) -{ - mutex_unlock(&callback_mutex); -} - /** * cpuset_mem_spread_node() - On which node to begin search for a file page * cpuset_slab_spread_node() - On which node to begin search for a slab page From 0772324ae669f787b42fdb9fc5ac2c3d1c0df509 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: [PATCH 02/18] cpuset: remove fast exit path from remove_tasks_in_empty_cpuset() The function isn't that hot, the overhead of missing the fast exit is low, the test itself depends heavily on cgroup internals, and it's gonna be a hindrance when trying to decouple cpuset locking from cgroup core. Remove the fast exit path. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 854b8bfbc15c..5372b6f5e5b3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1967,14 +1967,6 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) { struct cpuset *parent; - /* - * The cgroup's css_sets list is in use if there are tasks - * in the cpuset; the list is empty if there are none; - * the cs->css.refcnt seems always 0. - */ - if (list_empty(&cs->css.cgroup->css_sets)) - return; - /* * Find its next-highest non-empty parent, (top cpuset * has online cpus, so can't be empty). From c8f699bb56aeae951e02fe2a46c9ada022535770 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: [PATCH 03/18] cpuset: introduce ->css_on/offline() Add cpuset_css_on/offline() and rearrange css init/exit such that, * Allocation and clearing to the default values happen in css_alloc(). Allocation now uses kzalloc(). * Config inheritance and registration happen in css_online(). * css_offline() undoes what css_online() did. * css_free() frees. This doesn't introduce any visible behavior changes. This will help cleaning up locking. 
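In outline, the split of responsibilities reads as below. This is an illustrative sketch only, not part of the patch: the my_* names are placeholders and the bodies are stubs; the real cpuset callbacks are in the diff that follows.

#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/slab.h>

struct my_css {
	struct cgroup_subsys_state css;
	unsigned long flags;
};

/* css_alloc(): allocate and set context-free defaults, nothing else */
static struct cgroup_subsys_state *my_css_alloc(struct cgroup *cgrp)
{
	struct my_css *mc = kzalloc(sizeof(*mc), GFP_KERNEL);

	return mc ? &mc->css : ERR_PTR(-ENOMEM);
}

/* css_online(): inherit configuration from the parent and register */
static int my_css_online(struct cgroup *cgrp)
{
	return 0;
}

/* css_offline(): undo whatever my_css_online() did */
static void my_css_offline(struct cgroup *cgrp)
{
}

/* css_free(): release what my_css_alloc() allocated */
static void my_css_free(struct cgroup *cgrp)
{
	/* look up the my_css from @cgrp (a cgroup_cs()-style helper) and kfree() it */
}

The four hooks are wired up through struct cgroup_subsys, as the cpuset_subsys update at the end of this patch does.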
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 66 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 5372b6f5e5b3..1d7a611ff771 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1790,15 +1790,12 @@ static struct cftype files[] = { static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) { - struct cgroup *parent_cg = cont->parent; - struct cgroup *tmp_cg; - struct cpuset *parent, *cs; + struct cpuset *cs; - if (!parent_cg) + if (!cont->parent) return &top_cpuset.css; - parent = cgroup_cs(parent_cg); - cs = kmalloc(sizeof(*cs), GFP_KERNEL); + cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { @@ -1806,22 +1803,34 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) return ERR_PTR(-ENOMEM); } - cs->flags = 0; - if (is_spread_page(parent)) - set_bit(CS_SPREAD_PAGE, &cs->flags); - if (is_spread_slab(parent)) - set_bit(CS_SPREAD_SLAB, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; + cs->parent = cgroup_cs(cont->parent); + + return &cs->css; +} + +static int cpuset_css_online(struct cgroup *cgrp) +{ + struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *parent = cs->parent; + struct cgroup *tmp_cg; + + if (!parent) + return 0; + + if (is_spread_page(parent)) + set_bit(CS_SPREAD_PAGE, &cs->flags); + if (is_spread_slab(parent)) + set_bit(CS_SPREAD_SLAB, &cs->flags); - cs->parent = parent; number_of_cpusets++; - if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) - goto skip_clone; + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) + return 0; /* * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is @@ -1836,19 +1845,34 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) * changed to grant parent->cpus_allowed-sibling_cpus_exclusive * (and likewise for mems) to the new cgroup. 
*/ - list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { + list_for_each_entry(tmp_cg, &cgrp->parent->children, sibling) { struct cpuset *tmp_cs = cgroup_cs(tmp_cg); if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) - goto skip_clone; + return 0; } mutex_lock(&callback_mutex); cs->mems_allowed = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); mutex_unlock(&callback_mutex); -skip_clone: - return &cs->css; + + return 0; +} + +static void cpuset_css_offline(struct cgroup *cgrp) +{ + struct cpuset *cs = cgroup_cs(cgrp); + + /* css_offline is called w/o cgroup_mutex, grab it */ + cgroup_lock(); + + if (is_sched_load_balance(cs)) + update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + + number_of_cpusets--; + + cgroup_unlock(); } /* @@ -1861,10 +1885,6 @@ static void cpuset_css_free(struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); - if (is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - - number_of_cpusets--; free_cpumask_var(cs->cpus_allowed); kfree(cs); } @@ -1872,6 +1892,8 @@ static void cpuset_css_free(struct cgroup *cont) struct cgroup_subsys cpuset_subsys = { .name = "cpuset", .css_alloc = cpuset_css_alloc, + .css_online = cpuset_css_online, + .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .attach = cpuset_attach, From efeb77b2f13deb0503e65ad2b243495b6de75173 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: [PATCH 04/18] cpuset: introduce CS_ONLINE Add CS_ONLINE which is set from css_online() and cleared from css_offline(). This will enable using generic cgroup iterator while allowing decoupling cpuset from cgroup internal locking. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1d7a611ff771..e857887bd246 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -138,6 +138,7 @@ static inline bool task_has_mempolicy(struct task_struct *task) /* bits in struct cpuset flags field */ typedef enum { + CS_ONLINE, CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, CS_MEM_HARDWALL, @@ -154,6 +155,11 @@ enum hotplug_event { }; /* convenient tests for these bits */ +static inline bool is_cpuset_online(const struct cpuset *cs) +{ + return test_bit(CS_ONLINE, &cs->flags); +} + static inline int is_cpu_exclusive(const struct cpuset *cs) { return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); @@ -190,7 +196,8 @@ static inline int is_spread_slab(const struct cpuset *cs) } static struct cpuset top_cpuset = { - .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), + .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | + (1 << CS_MEM_EXCLUSIVE)), }; /* @@ -1822,6 +1829,7 @@ static int cpuset_css_online(struct cgroup *cgrp) if (!parent) return 0; + set_bit(CS_ONLINE, &cs->flags); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) @@ -1871,6 +1879,7 @@ static void cpuset_css_offline(struct cgroup *cgrp) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); number_of_cpusets--; + clear_bit(CS_ONLINE, &cs->flags); cgroup_unlock(); } From ae8086ce15fdab2b57599d7a3242a114ba4b8597 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: [PATCH 05/18] cpuset: introduce cpuset_for_each_child() Instead of iterating cgroup->children directly, introduce and use cpuset_for_each_child() which wraps cgroup_for_each_child() and performs online check. 
As it uses the generic iterator, it requires RCU read locking too. As cpuset is currently protected by cgroup_mutex, non-online cpusets aren't visible to all the iterations and this patch currently doesn't make any functional difference. This will be used to de-couple cpuset locking from cgroup core. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 85 +++++++++++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 31 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e857887bd246..4b054b9faf3d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -200,6 +200,19 @@ static struct cpuset top_cpuset = { (1 << CS_MEM_EXCLUSIVE)), }; +/** + * cpuset_for_each_child - traverse online children of a cpuset + * @child_cs: loop cursor pointing to the current child + * @pos_cgrp: used for iteration + * @parent_cs: target cpuset to walk children of + * + * Walk @child_cs through the online children of @parent_cs. Must be used + * with RCU read locked. + */ +#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ + cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ + if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) + /* * There are two global mutexes guarding cpuset structures. The first * is the main control groups cgroup_mutex, accessed via @@ -419,48 +432,55 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) { struct cgroup *cont; struct cpuset *c, *par; + int ret; + + rcu_read_lock(); /* Each of our child cpusets must be a subset of us */ - list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { - if (!is_cpuset_subset(cgroup_cs(cont), trial)) - return -EBUSY; - } + ret = -EBUSY; + cpuset_for_each_child(c, cont, cur) + if (!is_cpuset_subset(c, trial)) + goto out; /* Remaining checks don't apply to root cpuset */ + ret = 0; if (cur == &top_cpuset) - return 0; + goto out; par = cur->parent; /* We must be a subset of our parent cpuset */ + ret = -EACCES; if (!is_cpuset_subset(trial, par)) - return -EACCES; + goto out; /* * If either I or some sibling (!= me) is exclusive, we can't * overlap */ - list_for_each_entry(cont, &par->css.cgroup->children, sibling) { - c = cgroup_cs(cont); + ret = -EINVAL; + cpuset_for_each_child(c, cont, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) - return -EINVAL; + goto out; if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && nodes_intersects(trial->mems_allowed, c->mems_allowed)) - return -EINVAL; + goto out; } /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ - if (cgroup_task_count(cur->css.cgroup)) { - if (cpumask_empty(trial->cpus_allowed) || - nodes_empty(trial->mems_allowed)) { - return -ENOSPC; - } - } + ret = -ENOSPC; + if (cgroup_task_count(cur->css.cgroup) && + (cpumask_empty(trial->cpus_allowed) || + nodes_empty(trial->mems_allowed))) + goto out; - return 0; + ret = 0; +out: + rcu_read_unlock(); + return ret; } #ifdef CONFIG_SMP @@ -501,10 +521,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) if (is_sched_load_balance(cp)) update_domain_attr(dattr, cp); - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); + rcu_read_lock(); + cpuset_for_each_child(child, cont, cp) list_add_tail(&child->stack_list, &q); - } + rcu_read_unlock(); } } @@ -623,10 +643,10 @@ static int generate_sched_domains(cpumask_var_t **domains, continue; } - list_for_each_entry(cont, 
&cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); + rcu_read_lock(); + cpuset_for_each_child(child, cont, cp) list_add_tail(&child->stack_list, &q); - } + rcu_read_unlock(); } for (i = 0; i < csn; i++) @@ -1824,7 +1844,8 @@ static int cpuset_css_online(struct cgroup *cgrp) { struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *parent = cs->parent; - struct cgroup *tmp_cg; + struct cpuset *tmp_cs; + struct cgroup *pos_cg; if (!parent) return 0; @@ -1853,12 +1874,14 @@ static int cpuset_css_online(struct cgroup *cgrp) * changed to grant parent->cpus_allowed-sibling_cpus_exclusive * (and likewise for mems) to the new cgroup. */ - list_for_each_entry(tmp_cg, &cgrp->parent->children, sibling) { - struct cpuset *tmp_cs = cgroup_cs(tmp_cg); - - if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) + rcu_read_lock(); + cpuset_for_each_child(tmp_cs, pos_cg, parent) { + if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { + rcu_read_unlock(); return 0; + } } + rcu_read_unlock(); mutex_lock(&callback_mutex); cs->mems_allowed = parent->mems_allowed; @@ -2027,10 +2050,10 @@ static struct cpuset *cpuset_next(struct list_head *queue) cp = list_first_entry(queue, struct cpuset, stack_list); list_del(queue->next); - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); + rcu_read_lock(); + cpuset_for_each_child(child, cont, cp) list_add_tail(&child->stack_list, queue); - } + rcu_read_unlock(); return cp; } From 4e4c9a140fc2ecf5e086922ccd2022bdabe509b6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: [PATCH 06/18] cpuset: cleanup cpuset[_can]_attach() cpuset_can_attach() prepares the global variables cpus_attach and cpuset_attach_nodemask_{to|from} which are used by cpuset_attach(). There is no reason to prepare them in cpuset_can_attach(). The same information can be accessed from cpuset_attach(). Move the preparation logic from cpuset_can_attach() to cpuset_attach() and make the global variables static ones inside cpuset_attach(). With this change, there's no reason to keep cpuset_attach_nodemask_{from|to} global. Move them inside cpuset_attach(). Unfortunately, we need to keep cpus_attach global as it can't be allocated from cpuset_attach(). v2: cpus_attach not converted to cpumask_t as per Li Zefan and Rusty Russell. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Rusty Russell --- kernel/cpuset.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4b054b9faf3d..c5edc6b3eb28 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1395,15 +1395,6 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } -/* - * Protected by cgroup_lock. The nodemasks must be stored globally because - * dynamically allocating them is not allowed in can_attach, and they must - * persist until attach. - */ -static cpumask_var_t cpus_attach; -static nodemask_t cpuset_attach_nodemask_from; -static nodemask_t cpuset_attach_nodemask_to; - /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { @@ -1430,6 +1421,28 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) return ret; } + return 0; +} + +/* + * Protected by cgroup_mutex. cpus_attach is used only by cpuset_attach() + * but we can't allocate it dynamically there. Define it global and + * allocate from cpuset_init().
+ */ +static cpumask_var_t cpus_attach; + +static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +{ + /* static bufs protected by cgroup_mutex */ + static nodemask_t cpuset_attach_nodemask_from; + static nodemask_t cpuset_attach_nodemask_to; + struct mm_struct *mm; + struct task_struct *task; + struct task_struct *leader = cgroup_taskset_first(tset); + struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); + struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *oldcs = cgroup_cs(oldcgrp); + /* prepare for attach */ if (cs == &top_cpuset) cpumask_copy(cpus_attach, cpu_possible_mask); @@ -1438,18 +1451,6 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) guarantee_online_mems(cs, &cpuset_attach_nodemask_to); - return 0; -} - -static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -{ - struct mm_struct *mm; - struct task_struct *task; - struct task_struct *leader = cgroup_taskset_first(tset); - struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); - struct cpuset *cs = cgroup_cs(cgrp); - struct cpuset *oldcs = cgroup_cs(oldcgrp); - cgroup_taskset_for_each(task, cgrp, tset) { /* * can_attach beforehand should guarantee that this doesn't From deb7aa308ea264b374d1db970174f5728a2faa27 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: [PATCH 07/18] cpuset: reorganize CPU / memory hotplug handling Reorganize the hotplug path to prepare for async hotplug handling. * Both CPU and memory hotplug handling is collected into a single function - cpuset_handle_hotplug(). It doesn't take any argument but compares the current settings of top_cpuset against what's actually available to determine what happened. This function directly updates top_cpuset. If there are CPUs or memory nodes which are taken down, cpuset_propagate_hotplug() is invoked on all !root cpusets. * cpuset_propagate_hotplug() is responsible for updating the specified cpuset so that it doesn't include any resource which isn't available to top_cpuset. If no CPU or memory is left after update, all tasks are moved to the nearest ancestor with both resources. * update_tasks_cpumask() and update_tasks_nodemask() are now always called after cpus or mems masks are updated even if the cpuset doesn't have any task. This is for brevity and not expected to have any measurable effect. * cpu_active_mask and N_HIGH_MEMORY are read exactly once per cpuset_handle_hotplug() invocation, all cpusets share the same view of what resources are available, and cpuset_handle_hotplug() can handle multiple resources going up and down. These properties will allow async operation. The reorganization, while drastic, is equivalent and shouldn't cause any behavior difference. This will enable making hotplug handling async and remove get_online_cpus() -> cgroup_mutex nesting.
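Distilled down, the per-cpuset propagation step amounts to the sketch below. This is illustrative only: propagate_offline_to() is a placeholder name, and the callback_mutex sections and update_tasks_*() calls of the code added by this patch are omitted.

static void propagate_offline_to(struct cpuset *cs)
{
	static cpumask_t off_cpus;	/* scratch masks; the real code serializes access */
	static nodemask_t off_mems;

	/* which of @cs's resources did the system (i.e. top_cpuset) lose? */
	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);

	/* strip the lost resources from @cs */
	cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
	nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);

	/* nothing left to execute on, hand the tasks to an ancestor */
	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
		remove_tasks_in_empty_cpuset(cs);
}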
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 227 +++++++++++++++++++++++------------------------- 1 file changed, 107 insertions(+), 120 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c5edc6b3eb28..3d448e646a4a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -148,12 +148,6 @@ typedef enum { CS_SPREAD_SLAB, } cpuset_flagbits_t; -/* the type of hotplug event */ -enum hotplug_event { - CPUSET_CPU_OFFLINE, - CPUSET_MEM_OFFLINE, -}; - /* convenient tests for these bits */ static inline bool is_cpuset_online(const struct cpuset *cs) { @@ -2059,116 +2053,131 @@ static struct cpuset *cpuset_next(struct list_head *queue) return cp; } - -/* - * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory - * online/offline) and update the cpusets accordingly. - * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such - * cpuset must be moved to a parent cpuset. +/** + * cpuset_propagate_hotplug - propagate CPU/memory hotplug to a cpuset + * @cs: cpuset in interest * - * Called with cgroup_mutex held. We take callback_mutex to modify - * cpus_allowed and mems_allowed. + * Compare @cs's cpu and mem masks against top_cpuset and if some have gone + * offline, update @cs accordingly. If @cs ends up with no CPU or memory, + * all its tasks are moved to the nearest ancestor with both resources. * - * This walk processes the tree from top to bottom, completing one layer - * before dropping down to the next. It always processes a node before - * any of its children. - * - * In the case of memory hot-unplug, it will remove nodes from N_MEMORY - * if all present pages from a node are offlined. + * Should be called with cgroup_mutex held. */ -static void -scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) +static void cpuset_propagate_hotplug(struct cpuset *cs) { - LIST_HEAD(queue); - struct cpuset *cp; /* scans cpusets being updated */ - static nodemask_t oldmems; /* protected by cgroup_mutex */ + static cpumask_t off_cpus; + static nodemask_t off_mems, tmp_mems; - list_add_tail((struct list_head *)&root->stack_list, &queue); + WARN_ON_ONCE(!cgroup_lock_is_held()); - switch (event) { - case CPUSET_CPU_OFFLINE: - while ((cp = cpuset_next(&queue)) != NULL) { + cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); + nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); - /* Continue past cpusets with all cpus online */ - if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) - continue; + /* remove offline cpus from @cs */ + if (!cpumask_empty(&off_cpus)) { + mutex_lock(&callback_mutex); + cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); + mutex_unlock(&callback_mutex); + update_tasks_cpumask(cs, NULL); + } - /* Remove offline cpus from this cpuset. 
*/ - mutex_lock(&callback_mutex); - cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_active_mask); - mutex_unlock(&callback_mutex); + /* remove offline mems from @cs */ + if (!nodes_empty(off_mems)) { + tmp_mems = cs->mems_allowed; + mutex_lock(&callback_mutex); + nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); + mutex_unlock(&callback_mutex); + update_tasks_nodemask(cs, &tmp_mems, NULL); + } - /* Move tasks from the empty cpuset to a parent */ - if (cpumask_empty(cp->cpus_allowed)) - remove_tasks_in_empty_cpuset(cp); - else - update_tasks_cpumask(cp, NULL); - } - break; + if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + remove_tasks_in_empty_cpuset(cs); +} - case CPUSET_MEM_OFFLINE: - while ((cp = cpuset_next(&queue)) != NULL) { +/** + * cpuset_handle_hotplug - handle CPU/memory hot[un]plug + * + * This function is called after either CPU or memory configuration has + * changed and updates cpuset accordingly. The top_cpuset is always + * synchronized to cpu_active_mask and N_MEMORY, which is necessary in + * order to make cpusets transparent (of no affect) on systems that are + * actively using CPU hotplug but making no active use of cpusets. + * + * Non-root cpusets are only affected by offlining. If any CPUs or memory + * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all + * descendants. + * + * Note that CPU offlining during suspend is ignored. We don't modify + * cpusets across suspend/resume cycles at all. + */ +static void cpuset_handle_hotplug(void) +{ + static cpumask_t new_cpus, tmp_cpus; + static nodemask_t new_mems, tmp_mems; + bool cpus_updated, mems_updated; + bool cpus_offlined, mems_offlined; - /* Continue past cpusets with all mems online */ - if (nodes_subset(cp->mems_allowed, - node_states[N_MEMORY])) - continue; + cgroup_lock(); - oldmems = cp->mems_allowed; + /* fetch the available cpus/mems and find out which changed how */ + cpumask_copy(&new_cpus, cpu_active_mask); + new_mems = node_states[N_MEMORY]; - /* Remove offline mems from this cpuset. 
*/ - mutex_lock(&callback_mutex); - nodes_and(cp->mems_allowed, cp->mems_allowed, - node_states[N_MEMORY]); - mutex_unlock(&callback_mutex); + cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); + cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed, + &new_cpus); - /* Move tasks from the empty cpuset to a parent */ - if (nodes_empty(cp->mems_allowed)) - remove_tasks_in_empty_cpuset(cp); - else - update_tasks_nodemask(cp, &oldmems, NULL); - } + mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); + nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems); + mems_offlined = !nodes_empty(tmp_mems); + + /* synchronize cpus_allowed to cpu_active_mask */ + if (cpus_updated) { + mutex_lock(&callback_mutex); + cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); + mutex_unlock(&callback_mutex); + /* we don't mess with cpumasks of tasks in top_cpuset */ + } + + /* synchronize mems_allowed to N_MEMORY */ + if (mems_updated) { + tmp_mems = top_cpuset.mems_allowed; + mutex_lock(&callback_mutex); + top_cpuset.mems_allowed = new_mems; + mutex_unlock(&callback_mutex); + update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); + } + + /* if cpus or mems went down, we need to propagate to descendants */ + if (cpus_offlined || mems_offlined) { + struct cpuset *cs; + LIST_HEAD(queue); + + list_add_tail(&top_cpuset.stack_list, &queue); + while ((cs = cpuset_next(&queue))) + if (cs != &top_cpuset) + cpuset_propagate_hotplug(cs); + } + + cgroup_unlock(); + + /* rebuild sched domains if cpus_allowed has changed */ + if (cpus_updated) { + struct sched_domain_attr *attr; + cpumask_var_t *doms; + int ndoms; + + cgroup_lock(); + ndoms = generate_sched_domains(&doms, &attr); + cgroup_unlock(); + + partition_sched_domains(ndoms, doms, attr); } } -/* - * The top_cpuset tracks what CPUs and Memory Nodes are online, - * period. This is necessary in order to make cpusets transparent - * (of no affect) on systems that are actively using CPU hotplug - * but making no active use of cpusets. - * - * The only exception to this is suspend/resume, where we don't - * modify cpusets at all. - * - * This routine ensures that top_cpuset.cpus_allowed tracks - * cpu_active_mask on each CPU hotplug (cpuhp) event. - * - * Called within get_online_cpus(). Needs to call cgroup_lock() - * before calling generate_sched_domains(). - * - * @cpu_online: Indicates whether this is a CPU online event (true) or - * a CPU offline event (false). 
- */ void cpuset_update_active_cpus(bool cpu_online) { - struct sched_domain_attr *attr; - cpumask_var_t *doms; - int ndoms; - - cgroup_lock(); - mutex_lock(&callback_mutex); - cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - mutex_unlock(&callback_mutex); - - if (!cpu_online) - scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); - - ndoms = generate_sched_domains(&doms, &attr); - cgroup_unlock(); - - /* Have scheduler rebuild the domains */ - partition_sched_domains(ndoms, doms, attr); + cpuset_handle_hotplug(); } #ifdef CONFIG_MEMORY_HOTPLUG @@ -2180,29 +2189,7 @@ void cpuset_update_active_cpus(bool cpu_online) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - static nodemask_t oldmems; /* protected by cgroup_mutex */ - - cgroup_lock(); - switch (action) { - case MEM_ONLINE: - oldmems = top_cpuset.mems_allowed; - mutex_lock(&callback_mutex); - top_cpuset.mems_allowed = node_states[N_MEMORY]; - mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, &oldmems, NULL); - break; - case MEM_OFFLINE: - /* - * needn't update top_cpuset.mems_allowed explicitly because - * scan_cpusets_upon_hotplug() will update it. - */ - scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); - break; - default: - break; - } - cgroup_unlock(); - + cpuset_handle_hotplug(); return NOTIFY_OK; } #endif From 3a5a6d0c2b0391e159fa5bf1dddb9bf1f35178a0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: [PATCH 08/18] cpuset: don't nest cgroup_mutex inside get_online_cpus() CPU / memory hotplug path currently grabs cgroup_mutex from hotplug event notifications. We want to separate cpuset locking from cgroup core and make cgroup_mutex outer to hotplug synchronization so that, among other things, mechanisms which depend on get_online_cpus() can be used from cgroup callbacks. In general, we want to keep cgroup_mutex the outermost lock to minimize locking interactions among different controllers. Convert cpuset_handle_hotplug() to cpuset_hotplug_workfn() and schedule it from the hotplug notifications. As the function can already handle multiple mixed events without any input, converting it to a work function is mostly trivial; however, one complication is that cpuset_update_active_cpus() needs to update sched domains synchronously to reflect an offlined cpu to avoid confusing the scheduler. This is worked around by falling back to the default single sched domain synchronously before scheduling the actual hotplug work. This makes the sched domains get rebuilt twice per CPU hotplug event, but the operation isn't that heavy and a lot of the second operation would be a noop for systems w/ single sched domain, which is the common case. This decouples cpuset hotplug handling from the notification callbacks and there can be an arbitrary delay between the actual event and updates to cpusets. Scheduler and mm can handle it fine but moving tasks out of an empty cpuset may race against writes to the cpuset restoring execution resources which can lead to confusing behavior. Flush the hotplug work item from cpuset_write_resmask() to avoid such confusions. v2: Synchronous sched domain rebuilding using the fallback sched domain added. This fixes various issues caused by a confused scheduler putting tasks on a dead CPU, including the one reported by Li Zefan.
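The bounce-and-flush pattern this patch introduces is the standard workqueue one. A generic sketch, with hypothetical my_* names standing in for the cpuset-specific functions in the diff below:

#include <linux/workqueue.h>

static void my_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(my_hotplug_work, my_hotplug_workfn);

/* runs in plain process context, outside the hotplug notifier, so it is
 * free to take cgroup_mutex and other locks that must stay outermost */
static void my_hotplug_workfn(struct work_struct *work)
{
}

/* hotplug notifier: do only what must be synchronous, then punt */
static void my_cpu_notifier(void)
{
	/* e.g. the fallback partition_sched_domains(1, NULL, NULL) */
	schedule_work(&my_hotplug_work);
}

/* a config write that depends on the handler's result waits for it */
static int my_config_write(void)
{
	flush_work(&my_hotplug_work);
	/* the async update, if any, has completed by this point */
	return 0;
}

The notifier itself never takes cgroup_mutex; the work item, running later in process context, can.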
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3d448e646a4a..658eb1a32084 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -259,6 +259,13 @@ static char cpuset_name[CPUSET_NAME_LEN]; static char cpuset_nodelist[CPUSET_NODELIST_LEN]; static DEFINE_SPINLOCK(cpuset_buffer_lock); +/* + * CPU / memory hotplug is handled asynchronously. + */ +static void cpuset_hotplug_workfn(struct work_struct *work); + +static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); + /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we @@ -1565,6 +1572,19 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *trialcs; + /* + * CPU or memory hotunplug may leave @cs w/o any execution + * resources, in which case the hotplug code asynchronously updates + * configuration and transfers all tasks to the nearest ancestor + * which can execute. + * + * As writes to "cpus" or "mems" may restore @cs's execution + * resources, wait for the previously scheduled operations before + * proceeding, so that we don't end up keep removing tasks added + * after execution capability is restored. + */ + flush_work(&cpuset_hotplug_work); + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; @@ -2095,7 +2115,7 @@ static void cpuset_propagate_hotplug(struct cpuset *cs) } /** - * cpuset_handle_hotplug - handle CPU/memory hot[un]plug + * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -2110,7 +2130,7 @@ static void cpuset_propagate_hotplug(struct cpuset *cs) * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. */ -static void cpuset_handle_hotplug(void) +static void cpuset_hotplug_workfn(struct work_struct *work) { static cpumask_t new_cpus, tmp_cpus; static nodemask_t new_mems, tmp_mems; @@ -2177,7 +2197,18 @@ static void cpuset_handle_hotplug(void) void cpuset_update_active_cpus(bool cpu_online) { - cpuset_handle_hotplug(); + /* + * We're inside cpu hotplug critical region which usually nests + * inside cgroup synchronization. Bounce actual hotplug processing + * to a work item to avoid reverse locking order. + * + * We still need to do partition_sched_domains() synchronously; + * otherwise, the scheduler will get confused and put tasks to the + * dead CPU. Fall back to the default single domain. + * cpuset_hotplug_workfn() will rebuild it as necessary. + */ + partition_sched_domains(1, NULL, NULL); + schedule_work(&cpuset_hotplug_work); } #ifdef CONFIG_MEMORY_HOTPLUG @@ -2189,7 +2220,7 @@ void cpuset_update_active_cpus(bool cpu_online) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - cpuset_handle_hotplug(); + schedule_work(&cpuset_hotplug_work); return NOTIFY_OK; } #endif From 699140ba838dd3fa2c5cce474e14f194b09f91aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: [PATCH 09/18] cpuset: drop async_rebuild_sched_domains() In general, we want to make cgroup_mutex one of the outermost locks and be able to use get_online_cpus() and friends from cgroup methods. 
With cpuset hotplug made async, get_online_cpus() can now be nested inside cgroup_mutex. Currently, cpuset avoids nesting get_online_cpus() inside cgroup_mutex by bouncing sched_domain rebuilding to a work item. As such nesting is allowed now, remove the workqueue bouncing code and always rebuild sched_domains synchronously. This also nests sched_domains_mutex inside cgroup_mutex, which is intended and should be okay. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 76 +++++++++++-------------------------------------- 1 file changed, 16 insertions(+), 60 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 658eb1a32084..74e412f908db 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -60,14 +60,6 @@ #include #include -/* - * Workqueue for cpuset related tasks. - * - * Using kevent workqueue may cause deadlock when memory_migrate - * is set. So we create a separate workqueue thread for cpuset. - */ -static struct workqueue_struct *cpuset_wq; - /* * Tracks how many cpusets are currently defined in system. * When there is only one cpuset (the root cpuset) we can @@ -753,25 +745,25 @@ done: /* * Rebuild scheduler domains. * - * Call with neither cgroup_mutex held nor within get_online_cpus(). - * Takes both cgroup_mutex and get_online_cpus(). + * If the flag 'sched_load_balance' of any cpuset with non-empty + * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset + * which has that flag enabled, or if any cpuset with a non-empty + * 'cpus' is removed, then call this routine to rebuild the + * scheduler's dynamic sched domains. * - * Cannot be directly called from cpuset code handling changes - * to the cpuset pseudo-filesystem, because it cannot be called - * from code that already holds cgroup_mutex. + * Call with cgroup_mutex held. Takes get_online_cpus(). */ -static void do_rebuild_sched_domains(struct work_struct *unused) +static void rebuild_sched_domains_locked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; + WARN_ON_ONCE(!cgroup_lock_is_held()); get_online_cpus(); /* Generate domain masks and attrs */ - cgroup_lock(); ndoms = generate_sched_domains(&doms, &attr); - cgroup_unlock(); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); @@ -779,7 +771,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused) put_online_cpus(); } #else /* !CONFIG_SMP */ -static void do_rebuild_sched_domains(struct work_struct *unused) +static void rebuild_sched_domains_locked(void) { } @@ -791,44 +783,11 @@ static int generate_sched_domains(cpumask_var_t **domains, } #endif /* CONFIG_SMP */ -static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); - -/* - * Rebuild scheduler domains, asynchronously via workqueue. - * - * If the flag 'sched_load_balance' of any cpuset with non-empty - * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset - * which has that flag enabled, or if any cpuset with a non-empty - * 'cpus' is removed, then call this routine to rebuild the - * scheduler's dynamic sched domains. - * - * The rebuild_sched_domains() and partition_sched_domains() - * routines must nest cgroup_lock() inside get_online_cpus(), - * but such cpuset changes as these must nest that locking the - * other way, holding cgroup_lock() for much of the code. 
- * - * So in order to avoid an ABBA deadlock, the cpuset code handling - * these user changes delegates the actual sched domain rebuilding - * to a separate workqueue thread, which ends up processing the - * above do_rebuild_sched_domains() function. - */ -static void async_rebuild_sched_domains(void) -{ - queue_work(cpuset_wq, &rebuild_sched_domains_work); -} - -/* - * Accomplishes the same scheduler domain rebuild as the above - * async_rebuild_sched_domains(), however it directly calls the - * rebuild routine synchronously rather than calling it via an - * asynchronous work thread. - * - * This can only be called from code that is not holding - * cgroup_mutex (not nested in a cgroup_lock() call.) - */ void rebuild_sched_domains(void) { - do_rebuild_sched_domains(NULL); + cgroup_lock(); + rebuild_sched_domains_locked(); + cgroup_unlock(); } /** @@ -948,7 +907,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, heap_free(&heap); if (is_load_balanced) - async_rebuild_sched_domains(); + rebuild_sched_domains_locked(); return 0; } @@ -1196,7 +1155,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - async_rebuild_sched_domains(); + rebuild_sched_domains_locked(); } return 0; @@ -1288,7 +1247,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, mutex_unlock(&callback_mutex); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - async_rebuild_sched_domains(); + rebuild_sched_domains_locked(); if (spread_flag_changed) update_tasks_flags(cs, &heap); @@ -1925,7 +1884,7 @@ static void cpuset_css_offline(struct cgroup *cgrp) /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call async_rebuild_sched_domains(). + * will call rebuild_sched_domains_locked(). */ static void cpuset_css_free(struct cgroup *cont) @@ -2237,9 +2196,6 @@ void __init cpuset_init_smp(void) top_cpuset.mems_allowed = node_states[N_MEMORY]; hotplug_memory_notifier(cpuset_track_online_nodes, 10); - - cpuset_wq = create_singlethread_workqueue("cpuset"); - BUG_ON(!cpuset_wq); } /** From 8d03394877ecdf87e1d694664c460747b8e05aa1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: [PATCH 10/18] cpuset: make CPU / memory hotplug propagation asynchronous cpuset_hotplug_workfn() has been invoking cpuset_propagate_hotplug() directly to propagate hotplug updates to !root cpusets; however, this has the following problems. * cpuset locking is scheduled to be decoupled from cgroup_mutex, cgroup_mutex will be unexported, and cgroup_attach_task() will do cgroup locking internally, so propagation can't synchronously move tasks to a parent cgroup while walking the hierarchy. * We can't use cgroup generic tree iterator because propagation to each cpuset may sleep. With propagation done asynchronously, we can lose the rather ugly cpuset specific iteration. Convert cpuset_propagate_hotplug() to cpuset_propagate_hotplug_workfn() and execute it from newly added cpuset->hotplug_work. The work items are run on an ordered workqueue, so the propagation order is preserved. cpuset_hotplug_workfn() schedules all propagations while holding cgroup_mutex and waits for completion without cgroup_mutex. Each in-flight propagation holds a reference to the cpuset->css. This patch doesn't cause any functional difference. 
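The pin-while-queued pattern is the usual css refcounting one. A generic sketch with placeholder my_* names; the actual INIT_WORK(), alloc_ordered_workqueue() and flush_workqueue() calls are in the diff that follows:

#include <linux/cgroup.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct my_obj {
	struct cgroup_subsys_state css;
	struct work_struct work;
};

static struct workqueue_struct *my_propagate_wq;	/* alloc_ordered_workqueue() */

static void my_propagate_workfn(struct work_struct *work)
{
	struct my_obj *obj = container_of(work, struct my_obj, work);

	/* ... propagate the hotplug event to @obj ... */

	/* drop the reference taken at queueing time; it may be the last
	 * one, so @obj must not be touched afterwards */
	css_put(&obj->css);
}

static void my_schedule_propagate(struct my_obj *obj)
{
	/* pin @obj for as long as the work item is in flight */
	if (!css_tryget(&obj->css))
		return;			/* css already going away */

	/* ordered workqueue: items run one at a time, in queueing order */
	if (!queue_work(my_propagate_wq, &obj->work))
		css_put(&obj->css);	/* already pending, drop the extra ref */
}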
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 54 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 74e412f908db..a7bb547786d7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -99,6 +99,8 @@ struct cpuset { /* used for walking a cpuset hierarchy */ struct list_head stack_list; + + struct work_struct hotplug_work; }; /* Retrieve the cpuset for a cgroup */ @@ -254,7 +256,10 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock); /* * CPU / memory hotplug is handled asynchronously. */ +static struct workqueue_struct *cpuset_propagate_hotplug_wq; + static void cpuset_hotplug_workfn(struct work_struct *work); +static void cpuset_propagate_hotplug_workfn(struct work_struct *work); static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); @@ -1808,6 +1813,7 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); fmeter_init(&cs->fmeter); + INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn); cs->relax_domain_level = -1; cs->parent = cgroup_cs(cont->parent); @@ -2033,21 +2039,20 @@ static struct cpuset *cpuset_next(struct list_head *queue) } /** - * cpuset_propagate_hotplug - propagate CPU/memory hotplug to a cpuset + * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset * @cs: cpuset in interest * * Compare @cs's cpu and mem masks against top_cpuset and if some have gone * offline, update @cs accordingly. If @cs ends up with no CPU or memory, * all its tasks are moved to the nearest ancestor with both resources. - * - * Should be called with cgroup_mutex held. */ -static void cpuset_propagate_hotplug(struct cpuset *cs) +static void cpuset_propagate_hotplug_workfn(struct work_struct *work) { static cpumask_t off_cpus; static nodemask_t off_mems, tmp_mems; + struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); - WARN_ON_ONCE(!cgroup_lock_is_held()); + cgroup_lock(); cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); @@ -2071,6 +2076,36 @@ static void cpuset_propagate_hotplug(struct cpuset *cs) if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) remove_tasks_in_empty_cpuset(cs); + + cgroup_unlock(); + + /* the following may free @cs, should be the last operation */ + css_put(&cs->css); +} + +/** + * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset + * @cs: cpuset of interest + * + * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and + * memory masks according to top_cpuset. + */ +static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) +{ + /* + * Pin @cs. The refcnt will be released when the work item + * finishes executing. + */ + if (!css_tryget(&cs->css)) + return; + + /* + * Queue @cs->hotplug_work. If already pending, lose the css ref. + * cpuset_propagate_hotplug_wq is ordered and propagation will + * happen in the order this function is called. 
+ */ + if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work)) + css_put(&cs->css); } /** @@ -2135,11 +2170,14 @@ static void cpuset_hotplug_workfn(struct work_struct *work) list_add_tail(&top_cpuset.stack_list, &queue); while ((cs = cpuset_next(&queue))) if (cs != &top_cpuset) - cpuset_propagate_hotplug(cs); + schedule_cpuset_propagate_hotplug(cs); } cgroup_unlock(); + /* wait for propagations to finish */ + flush_workqueue(cpuset_propagate_hotplug_wq); + /* rebuild sched domains if cpus_allowed has changed */ if (cpus_updated) { struct sched_domain_attr *attr; @@ -2196,6 +2234,10 @@ void __init cpuset_init_smp(void) top_cpuset.mems_allowed = node_states[N_MEMORY]; hotplug_memory_notifier(cpuset_track_online_nodes, 10); + + cpuset_propagate_hotplug_wq = + alloc_ordered_workqueue("cpuset_hotplug", 0); + BUG_ON(!cpuset_propagate_hotplug_wq); } /** From 452477fa68c6d8ef80adebd05194c1c157ad9a53 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: [PATCH 11/18] cpuset: pin down cpus and mems while a task is being attached cpuset is scheduled to be decoupled from cgroup_lock which will make configuration updates race with task migration. Any config update will be allowed to happen between ->can_attach() and ->attach(). If such config update removes either all cpus or mems, by the time ->attach() is called, the condition verified by ->can_attach(), that the cpuset is capable of hosting the tasks, is no longer true. This patch adds cpuset->attach_in_progress which is incremented from ->can_attach() and decremented when the attach operation finishes either successfully or not. validate_change() treats cpusets w/ non-zero ->attach_in_progress like cpusets w/ tasks and refuses to remove all cpus or mems from it. This currently doesn't make any functional difference as everything is protected by cgroup_mutex but enables decoupling the locking. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a7bb547786d7..4334576f5d6a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -91,6 +91,12 @@ struct cpuset { struct fmeter fmeter; /* memory_pressure filter */ + /* + * Tasks are being attached to this cpuset. Used to prevent + * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). + */ + int attach_in_progress; + /* partition number for rebuild_sched_domains() */ int pn; @@ -468,9 +474,12 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) goto out; } - /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ + /* + * Cpusets with tasks - existing or newly being attached - can't + * have empty cpus_allowed or mems_allowed. + */ ret = -ENOSPC; - if (cgroup_task_count(cur->css.cgroup) && + if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && (cpumask_empty(trial->cpus_allowed) || nodes_empty(trial->mems_allowed))) goto out; @@ -1386,9 +1395,21 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) return ret; } + /* + * Mark attach is in progress. This makes validate_change() fail + * changes which zero cpus/mems_allowed. + */ + cs->attach_in_progress++; + return 0; } +static void cpuset_cancel_attach(struct cgroup *cgrp, + struct cgroup_taskset *tset) +{ + cgroup_cs(cgrp)->attach_in_progress--; +} + /* * Protected by cgroup_mutex. 
cpus_attach is used only by cpuset_attach() * but we can't allocate it dynamically there. Define it global and @@ -1441,6 +1462,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) &cpuset_attach_nodemask_to); mmput(mm); } + + cs->attach_in_progress--; } /* The various types of files and directories in a cpuset file system */ @@ -1908,6 +1931,7 @@ struct cgroup_subsys cpuset_subsys = { .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, + .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .subsys_id = cpuset_subsys_id, .base_cftypes = files, From 02bb586372a71595203b3ff19a9be48eaa076f6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:08 -0800 Subject: [PATCH 12/18] cpuset: schedule hotplug propagation from cpuset_attach() if the cpuset is empty cpuset is scheduled to be decoupled from cgroup_lock which will make hotplug handling race with task migration. cpus or mems will be allowed to go offline between ->can_attach() and ->attach(). If hotplug takes down all cpus or mems of a cpuset while attach is in progress, ->attach() may end up putting tasks into an empty cpuset. This patchset makes ->attach() schedule hotplug propagation if the cpuset is empty after attaching is complete. This will move the tasks to the nearest ancestor which can execute and the end result would be as if hotplug handling happened after the tasks finished attaching. cpuset_write_resmask() now also flushes cpuset_propagate_hotplug_wq to wait for propagations scheduled directly by cpuset_attach(). This currently doesn't make any functional difference as everything is protected by cgroup_mutex but enables decoupling the locking. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4334576f5d6a..644281003f5d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -266,6 +266,7 @@ static struct workqueue_struct *cpuset_propagate_hotplug_wq; static void cpuset_hotplug_workfn(struct work_struct *work); static void cpuset_propagate_hotplug_workfn(struct work_struct *work); +static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); @@ -1464,6 +1465,14 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) } cs->attach_in_progress--; + + /* + * We may have raced with CPU/memory hotunplug. Trigger hotplug + * propagation if @cs doesn't have any CPU or memory. It will move + * the newly added tasks to the nearest parent which can execute. + */ + if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + schedule_cpuset_propagate_hotplug(cs); } /* The various types of files and directories in a cpuset file system */ @@ -1569,8 +1578,13 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, * resources, wait for the previously scheduled operations before * proceeding, so that we don't end up keep removing tasks added * after execution capability is restored. + * + * Flushing cpuset_hotplug_work is enough to synchronize against + * hotplug hanlding; however, cpuset_attach() may schedule + * propagation work directly. Flush the workqueue too. 
*/ flush_work(&cpuset_hotplug_work); + flush_workqueue(cpuset_propagate_hotplug_wq); if (!cgroup_lock_live_group(cgrp)) return -ENODEV; From 5d21cc2db040d01f8c19b8602f6987813e1176b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:08 -0800 Subject: [PATCH 13/18] cpuset: replace cgroup_mutex locking with cpuset internal locking Supposedly for historical reasons, cpuset depends on cgroup core for locking. It depends on cgroup_mutex in cgroup callbacks and grabs cgroup_mutex from other places where it wants to be synchronized. This is majorly messy and highly prone to introducing circular locking dependency especially because cgroup_mutex is supposed to be one of the outermost locks. As previous patches already plugged possible races which may happen by decoupling from cgroup_mutex, replacing cgroup_mutex with cpuset specific cpuset_mutex is mostly straight-forward. Introduce cpuset_mutex, replace all occurrences of cgroup_mutex with it, and add cpuset_mutex locking to places which inherited cgroup_mutex from cgroup core. The only complication is from cpuset wanting to initiate task migration when a cpuset loses all cpus or memory nodes. Task migration may go through full cgroup and all subsystem locking and should be initiated without holding any cpuset specific lock; however, a previous patch already made hotplug handled asynchronously and moving the task migration part outside other locks is easy. cpuset_propagate_hotplug_workfn() now invokes remove_tasks_in_empty_cpuset() without holding any lock. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 186 +++++++++++++++++++++++++++--------------------- 1 file changed, 106 insertions(+), 80 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 644281003f5d..5e348ae37ce9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -208,23 +208,20 @@ static struct cpuset top_cpuset = { if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) /* - * There are two global mutexes guarding cpuset structures. The first - * is the main control groups cgroup_mutex, accessed via - * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific - * callback_mutex, below. They can nest. It is ok to first take - * cgroup_mutex, then nest callback_mutex. We also require taking - * task_lock() when dereferencing a task's cpuset pointer. See "The - * task_lock() exception", at the end of this comment. + * There are two global mutexes guarding cpuset structures - cpuset_mutex + * and callback_mutex. The latter may nest inside the former. We also + * require taking task_lock() when dereferencing a task's cpuset pointer. + * See "The task_lock() exception", at the end of this comment. * - * A task must hold both mutexes to modify cpusets. If a task - * holds cgroup_mutex, then it blocks others wanting that mutex, - * ensuring that it is the only task able to also acquire callback_mutex - * and be able to modify cpusets. It can perform various checks on - * the cpuset structure first, knowing nothing will change. It can - * also allocate memory while just holding cgroup_mutex. While it is - * performing these checks, various callback routines can briefly - * acquire callback_mutex to query cpusets. Once it is ready to make - * the changes, it takes callback_mutex, blocking everyone else. + * A task must hold both mutexes to modify cpusets. 
If a task holds + * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it + * is the only task able to also acquire callback_mutex and be able to + * modify cpusets. It can perform various checks on the cpuset structure + * first, knowing nothing will change. It can also allocate memory while + * just holding cpuset_mutex. While it is performing these checks, various + * callback routines can briefly acquire callback_mutex to query cpusets. + * Once it is ready to make the changes, it takes callback_mutex, blocking + * everyone else. * * Calls to the kernel memory allocator can not be made while holding * callback_mutex, as that would risk double tripping on callback_mutex @@ -246,6 +243,7 @@ static struct cpuset top_cpuset = { * guidelines for accessing subsystem state in kernel/cgroup.c */ +static DEFINE_MUTEX(cpuset_mutex); static DEFINE_MUTEX(callback_mutex); /* @@ -351,7 +349,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Called with callback_mutex/cgroup_mutex held + * Called with callback_mutex/cpuset_mutex held */ static void cpuset_update_task_spread_flag(struct cpuset *cs, struct task_struct *tsk) @@ -371,7 +369,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cgroup_mutex. + * are only set if the other's are set. Call holding cpuset_mutex. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -420,7 +418,7 @@ static void free_trial_cpuset(struct cpuset *trial) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cgroup_mutex held. + * cpuset_mutex held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -555,7 +553,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Must be called with cgroup_lock held. + * Must be called with cpuset_mutex held. * * The three key local variables below are: * q - a linked-list queue of cpuset pointers, used to implement a @@ -766,7 +764,7 @@ done: * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cgroup_mutex held. Takes get_online_cpus(). + * Call with cpuset_mutex held. Takes get_online_cpus(). */ static void rebuild_sched_domains_locked(void) { @@ -774,7 +772,7 @@ static void rebuild_sched_domains_locked(void) cpumask_var_t *doms; int ndoms; - WARN_ON_ONCE(!cgroup_lock_is_held()); + lockdep_assert_held(&cpuset_mutex); get_online_cpus(); /* Generate domain masks and attrs */ @@ -800,9 +798,9 @@ static int generate_sched_domains(cpumask_var_t **domains, void rebuild_sched_domains(void) { - cgroup_lock(); + mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); - cgroup_unlock(); + mutex_unlock(&cpuset_mutex); } /** @@ -810,7 +808,7 @@ void rebuild_sched_domains(void) * @tsk: task to test * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner * - * Call with cgroup_mutex held. May take callback_mutex during call. 
+ * Call with cpuset_mutex held. May take callback_mutex during call. * Called for each task in a cgroup by cgroup_scan_tasks(). * Return nonzero if this tasks's cpus_allowed mask should be changed (in other * words, if its mask is not equal to its cpuset's mask). @@ -831,7 +829,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk, * cpus_allowed mask needs to be changed. * * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cgroup_lock() at this point. + * holding cpuset_mutex at this point. */ static void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) @@ -844,7 +842,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk, * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. @@ -934,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Temporarilly set tasks mems_allowed to target nodes of migration, * so that the migration code can allocate pages on these nodes. * - * Call holding cgroup_mutex, so current's cpuset won't change + * Call holding cpuset_mutex, so current's cpuset won't change * during this call, as manage_mutex holds off any cpuset_attach() * calls. Therefore we don't need to take task_lock around the * call to guarantee_online_mems(), as we know no one is changing @@ -1009,7 +1007,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, /* * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy * of it to cpuset's new mems_allowed, and migrate pages to new nodes if - * memory_migrate flag is set. Called with cgroup_mutex held. + * memory_migrate flag is set. Called with cpuset_mutex held. */ static void cpuset_change_nodemask(struct task_struct *p, struct cgroup_scanner *scan) @@ -1018,7 +1016,7 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; - static nodemask_t newmems; /* protected by cgroup_mutex */ + static nodemask_t newmems; /* protected by cpuset_mutex */ cs = cgroup_cs(scan->cg); guarantee_online_mems(cs, &newmems); @@ -1045,7 +1043,7 @@ static void *cpuset_being_rebound; * @oldmem: old mems_allowed of cpuset cs * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 * if @heap != NULL. */ @@ -1067,7 +1065,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold - * the global cgroup_mutex, we know that no other rebind effort + * the global cpuset_mutex, we know that no other rebind effort * will be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. @@ -1086,7 +1084,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * - * Call with cgroup_mutex held. 
May take callback_mutex during call. + * Call with cpuset_mutex held. May take callback_mutex during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -1184,7 +1182,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) * Called by cgroup_scan_tasks() for each task in a cgroup. * * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cgroup_lock() at this point. + * holding cpuset_mutex at this point. */ static void cpuset_change_flag(struct task_struct *tsk, struct cgroup_scanner *scan) @@ -1197,7 +1195,7 @@ static void cpuset_change_flag(struct task_struct *tsk, * @cs: the cpuset in which each task's spread flags needs to be changed * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. @@ -1222,7 +1220,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * - * Call with cgroup_mutex held. + * Call with cpuset_mutex held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -1370,15 +1368,18 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } -/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ +/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct cpuset *cs = cgroup_cs(cgrp); struct task_struct *task; int ret; + mutex_lock(&cpuset_mutex); + + ret = -ENOSPC; if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) - return -ENOSPC; + goto out_unlock; cgroup_taskset_for_each(task, cgrp, tset) { /* @@ -1390,10 +1391,12 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) * set_cpus_allowed_ptr() on all attached tasks before * cpus_allowed may be changed. */ + ret = -EINVAL; if (task->flags & PF_THREAD_BOUND) - return -EINVAL; - if ((ret = security_task_setscheduler(task))) - return ret; + goto out_unlock; + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; } /* @@ -1401,18 +1404,22 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; - - return 0; + ret = 0; +out_unlock: + mutex_unlock(&cpuset_mutex); + return ret; } static void cpuset_cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { + mutex_lock(&cpuset_mutex); cgroup_cs(cgrp)->attach_in_progress--; + mutex_unlock(&cpuset_mutex); } /* - * Protected by cgroup_mutex. cpus_attach is used only by cpuset_attach() + * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() * but we can't allocate it dynamically there. Define it global and * allocate from cpuset_init(). 
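The cpuset_can_attach() conversion above uses an error-path idiom that recurs throughout this series: preset the error code, jump to a single label that drops the mutex, and let the success path fall through with ret = 0. A sketch of that shape, with hypothetical names and a pthread mutex in place of cpuset_mutex:

#include <pthread.h>
#include <errno.h>
#include <stdbool.h>

static pthread_mutex_t subsys_mutex = PTHREAD_MUTEX_INITIALIZER;

struct group {
        bool has_resources;
        int attach_in_progress;
};

/* One lock/unlock pair, one exit label, error code set before each bail-out. */
int group_can_attach(struct group *grp, bool task_is_bound)
{
        int ret;

        pthread_mutex_lock(&subsys_mutex);

        ret = -ENOSPC;
        if (!grp->has_resources)
                goto out_unlock;

        ret = -EINVAL;
        if (task_is_bound)
                goto out_unlock;

        /* success: record the pending attach while still holding the lock */
        grp->attach_in_progress++;
        ret = 0;
out_unlock:
        pthread_mutex_unlock(&subsys_mutex);
        return ret;
}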
*/ @@ -1420,7 +1427,7 @@ static cpumask_var_t cpus_attach; static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { - /* static bufs protected by cgroup_mutex */ + /* static bufs protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_from; static nodemask_t cpuset_attach_nodemask_to; struct mm_struct *mm; @@ -1430,6 +1437,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *oldcs = cgroup_cs(oldcgrp); + mutex_lock(&cpuset_mutex); + /* prepare for attach */ if (cs == &top_cpuset) cpumask_copy(cpus_attach, cpu_possible_mask); @@ -1473,6 +1482,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) */ if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) schedule_cpuset_propagate_hotplug(cs); + + mutex_unlock(&cpuset_mutex); } /* The various types of files and directories in a cpuset file system */ @@ -1494,12 +1505,13 @@ typedef enum { static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) { - int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; + int retval = -ENODEV; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; switch (type) { case FILE_CPU_EXCLUSIVE: @@ -1533,18 +1545,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) retval = -EINVAL; break; } - cgroup_unlock(); +out_unlock: + mutex_unlock(&cpuset_mutex); return retval; } static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) { - int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; + int retval = -ENODEV; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: @@ -1554,7 +1568,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) retval = -EINVAL; break; } - cgroup_unlock(); +out_unlock: + mutex_unlock(&cpuset_mutex); return retval; } @@ -1564,9 +1579,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, const char *buf) { - int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *trialcs; + int retval = -ENODEV; /* * CPU or memory hotunplug may leave @cs w/o any execution @@ -1586,13 +1601,14 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, flush_work(&cpuset_hotplug_work); flush_workqueue(cpuset_propagate_hotplug_wq); - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; trialcs = alloc_trial_cpuset(cs); if (!trialcs) { retval = -ENOMEM; - goto out; + goto out_unlock; } switch (cft->private) { @@ -1608,8 +1624,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, } free_trial_cpuset(trialcs); -out: - cgroup_unlock(); +out_unlock: + mutex_unlock(&cpuset_mutex); return retval; } @@ -1867,6 +1883,8 @@ static int cpuset_css_online(struct cgroup *cgrp) if (!parent) return 0; + mutex_lock(&cpuset_mutex); + set_bit(CS_ONLINE, &cs->flags); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); @@ -1876,7 +1894,7 @@ static int cpuset_css_online(struct cgroup *cgrp) number_of_cpusets++; if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) - return 
0; + goto out_unlock; /* * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is @@ -1895,7 +1913,7 @@ static int cpuset_css_online(struct cgroup *cgrp) cpuset_for_each_child(tmp_cs, pos_cg, parent) { if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { rcu_read_unlock(); - return 0; + goto out_unlock; } } rcu_read_unlock(); @@ -1904,7 +1922,8 @@ static int cpuset_css_online(struct cgroup *cgrp) cs->mems_allowed = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); mutex_unlock(&callback_mutex); - +out_unlock: + mutex_unlock(&cpuset_mutex); return 0; } @@ -1912,8 +1931,7 @@ static void cpuset_css_offline(struct cgroup *cgrp) { struct cpuset *cs = cgroup_cs(cgrp); - /* css_offline is called w/o cgroup_mutex, grab it */ - cgroup_lock(); + mutex_lock(&cpuset_mutex); if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -1921,7 +1939,7 @@ static void cpuset_css_offline(struct cgroup *cgrp) number_of_cpusets--; clear_bit(CS_ONLINE, &cs->flags); - cgroup_unlock(); + mutex_unlock(&cpuset_mutex); } /* @@ -1996,7 +2014,9 @@ static void cpuset_do_move_task(struct task_struct *tsk, { struct cgroup *new_cgroup = scan->data; + cgroup_lock(); cgroup_attach_task(new_cgroup, tsk); + cgroup_unlock(); } /** @@ -2004,7 +2024,7 @@ static void cpuset_do_move_task(struct task_struct *tsk, * @from: cpuset in which the tasks currently reside * @to: cpuset to which the tasks will be moved * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * callback_mutex must not be held, as cpuset_attach() will take it. * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, @@ -2031,9 +2051,6 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) * removing that CPU or node from all cpusets. If this removes the * last CPU or node from a cpuset, then move the tasks in the empty * cpuset to its next-highest non-empty parent. - * - * Called with cgroup_mutex held - * callback_mutex must not be held, as cpuset_attach() will take it. */ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) { @@ -2089,8 +2106,9 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work) static cpumask_t off_cpus; static nodemask_t off_mems, tmp_mems; struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); + bool is_empty; - cgroup_lock(); + mutex_lock(&cpuset_mutex); cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); @@ -2112,10 +2130,18 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work) update_tasks_nodemask(cs, &tmp_mems, NULL); } - if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) - remove_tasks_in_empty_cpuset(cs); + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); - cgroup_unlock(); + mutex_unlock(&cpuset_mutex); + + /* + * If @cs became empty, move tasks to the nearest ancestor with + * execution resources. This is full cgroup operation which will + * also call back into cpuset. Should be done outside any lock. 
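The hunk above is the "decide under the lock, act after dropping it" pattern: emptiness is evaluated while cpuset_mutex is held, but the task migration runs only after the unlock because it re-enters cgroup code. A small userspace sketch of that shape, with hypothetical names and a pthread mutex standing in for cpuset_mutex:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t subsys_mutex = PTHREAD_MUTEX_INITIALIZER;

struct group {
        int nr_cpus;
        int nr_mems;
};

/* Stand-in for the expensive, re-entrant step (moving tasks to an
 * ancestor); assumed to take other locks, so it must run unlocked. */
static void evacuate_tasks(struct group *grp)
{
        (void)grp;
        printf("evacuating tasks\n");
}

void hotplug_update(struct group *grp, int lost_cpus, int lost_mems)
{
        bool is_empty;

        pthread_mutex_lock(&subsys_mutex);

        grp->nr_cpus -= lost_cpus;
        grp->nr_mems -= lost_mems;

        /* record the decision while the state is stable under the lock... */
        is_empty = grp->nr_cpus <= 0 || grp->nr_mems <= 0;

        pthread_mutex_unlock(&subsys_mutex);

        /* ...but do the heavyweight, re-entrant work only after dropping it */
        if (is_empty)
                evacuate_tasks(grp);
}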
+ */ + if (is_empty) + remove_tasks_in_empty_cpuset(cs); /* the following may free @cs, should be the last operation */ css_put(&cs->css); @@ -2169,7 +2195,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) bool cpus_updated, mems_updated; bool cpus_offlined, mems_offlined; - cgroup_lock(); + mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ cpumask_copy(&new_cpus, cpu_active_mask); @@ -2211,7 +2237,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) schedule_cpuset_propagate_hotplug(cs); } - cgroup_unlock(); + mutex_unlock(&cpuset_mutex); /* wait for propagations to finish */ flush_workqueue(cpuset_propagate_hotplug_wq); @@ -2222,9 +2248,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work) cpumask_var_t *doms; int ndoms; - cgroup_lock(); + mutex_lock(&cpuset_mutex); ndoms = generate_sched_domains(&doms, &attr); - cgroup_unlock(); + mutex_unlock(&cpuset_mutex); partition_sched_domains(ndoms, doms, attr); } @@ -2650,7 +2676,7 @@ void __cpuset_memory_pressure_bump(void) * - Used for /proc//cpuset. * - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cgroup_mutex, keeping cpuset_attach() from changing it + * and we take cpuset_mutex, keeping cpuset_attach() from changing it * anyway. */ static int proc_cpuset_show(struct seq_file *m, void *unused_v) @@ -2673,7 +2699,7 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v) goto out_free; retval = -EINVAL; - cgroup_lock(); + mutex_lock(&cpuset_mutex); css = task_subsys_state(tsk, cpuset_subsys_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); if (retval < 0) @@ -2681,7 +2707,7 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v) seq_puts(m, buf); seq_putc(m, '\n'); out_unlock: - cgroup_unlock(); + mutex_unlock(&cpuset_mutex); put_task_struct(tsk); out_free: kfree(buf); From fc560a26accea9567b7f8f41eefa01e1bb127d74 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:08 -0800 Subject: [PATCH 14/18] cpuset: replace cpuset->stack_list with cpuset_for_each_descendant_pre() Implement cpuset_for_each_descendant_pre() and replace the cpuset-specific tree walking using cpuset->stack_list with it. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Acked-by: Li Zefan --- kernel/cpuset.c | 123 +++++++++++++++++++----------------------------- 1 file changed, 48 insertions(+), 75 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 5e348ae37ce9..0ddb48d40e4d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -103,9 +103,6 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - /* used for walking a cpuset hierarchy */ - struct list_head stack_list; - struct work_struct hotplug_work; }; @@ -207,6 +204,20 @@ static struct cpuset top_cpuset = { cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) +/** + * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants + * @des_cs: loop cursor pointing to the current descendant + * @pos_cgrp: used for iteration + * @root_cs: target cpuset to walk ancestor of + * + * Walk @des_cs through the online descendants of @root_cs. Must be used + * with RCU read locked. The caller may modify @pos_cgrp by calling + * cgroup_rightmost_descendant() to skip subtree. 
+ */ +#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ + cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ + if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) + /* * There are two global mutexes guarding cpuset structures - cpuset_mutex * and callback_mutex. The latter may nest inside the former. We also @@ -507,31 +518,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) return; } -static void -update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) +static void update_domain_attr_tree(struct sched_domain_attr *dattr, + struct cpuset *root_cs) { - LIST_HEAD(q); + struct cpuset *cp; + struct cgroup *pos_cgrp; - list_add(&c->stack_list, &q); - while (!list_empty(&q)) { - struct cpuset *cp; - struct cgroup *cont; - struct cpuset *child; - - cp = list_first_entry(&q, struct cpuset, stack_list); - list_del(q.next); - - if (cpumask_empty(cp->cpus_allowed)) + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + /* skip the whole subtree if @cp doesn't have any CPU */ + if (cpumask_empty(cp->cpus_allowed)) { + pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); continue; + } if (is_sched_load_balance(cp)) update_domain_attr(dattr, cp); - - rcu_read_lock(); - cpuset_for_each_child(child, cont, cp) - list_add_tail(&child->stack_list, &q); - rcu_read_unlock(); } + rcu_read_unlock(); } /* @@ -591,7 +595,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { - LIST_HEAD(q); /* queue of cpusets to be scanned */ struct cpuset *cp; /* scans q */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ @@ -600,6 +603,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ + struct cgroup *pos_cgrp; doms = NULL; dattr = NULL; @@ -627,33 +631,27 @@ static int generate_sched_domains(cpumask_var_t **domains, goto done; csn = 0; - list_add(&top_cpuset.stack_list, &q); - while (!list_empty(&q)) { - struct cgroup *cont; - struct cpuset *child; /* scans child cpusets of cp */ - - cp = list_first_entry(&q, struct cpuset, stack_list); - list_del(q.next); - - if (cpumask_empty(cp->cpus_allowed)) - continue; - + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { /* - * All child cpusets contain a subset of the parent's cpus, so - * just skip them, and then we call update_domain_attr_tree() - * to calc relax_domain_level of the corresponding sched - * domain. + * Continue traversing beyond @cp iff @cp has some CPUs and + * isn't load balancing. The former is obvious. The + * latter: All child cpusets contain a subset of the + * parent's cpus, so just skip them, and then we call + * update_domain_attr_tree() to calc relax_domain_level of + * the corresponding sched domain. 
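The cpuset_for_each_descendant_pre() users above share one shape: a pre-order walk in which a whole subtree is pruned by bumping the cursor to its rightmost descendant. The following self-contained sketch shows the same pruned pre-order idea on a plain child/sibling tree; it is not the cgroup iterator itself, which depends on RCU-protected cgroup internals, and all names are hypothetical.

#include <stdbool.h>
#include <stdio.h>

struct node {
        const char *name;
        bool has_cpus;                  /* analogue of !cpumask_empty() */
        struct node *first_child;
        struct node *next_sibling;
};

/* Pre-order walk with pruning: returning false from visit() skips the
 * node's entire subtree, which is the effect cgroup_rightmost_descendant()
 * gives the flat iterator macro above. */
static void walk_preorder(struct node *n, bool (*visit)(struct node *))
{
        struct node *c;

        if (!visit(n))
                return;
        for (c = n->first_child; c; c = c->next_sibling)
                walk_preorder(c, visit);
}

/* Mirrors update_domain_attr_tree(): ignore (and prune) empty nodes. */
static bool visit_node(struct node *n)
{
        if (!n->has_cpus)
                return false;           /* skip the whole subtree */
        printf("considering %s\n", n->name);
        return true;
}

int main(void)
{
        struct node leaf = { "leaf", true, NULL, NULL };
        struct node mid  = { "mid", false, &leaf, NULL };   /* pruned with its child */
        struct node root = { "root", true, &mid, NULL };

        walk_preorder(&root, visit_node);
        return 0;
}

Running it prints only "considering root": the empty middle node and everything below it are pruned, much as update_domain_attr_tree() skips cpusets with no CPUs.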
*/ - if (is_sched_load_balance(cp)) { - csa[csn++] = cp; + if (!cpumask_empty(cp->cpus_allowed) && + !is_sched_load_balance(cp)) continue; - } - rcu_read_lock(); - cpuset_for_each_child(child, cont, cp) - list_add_tail(&child->stack_list, &q); - rcu_read_unlock(); - } + if (is_sched_load_balance(cp)) + csa[csn++] = cp; + + /* skip @cp's subtree */ + pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + } + rcu_read_unlock(); for (i = 0; i < csn; i++) csa[i]->pn = i; @@ -2068,31 +2066,6 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) move_member_tasks_to_cpuset(cs, parent); } -/* - * Helper function to traverse cpusets. - * It can be used to walk the cpuset tree from top to bottom, completing - * one layer before dropping down to the next (thus always processing a - * node before any of its children). - */ -static struct cpuset *cpuset_next(struct list_head *queue) -{ - struct cpuset *cp; - struct cpuset *child; /* scans child cpusets of cp */ - struct cgroup *cont; - - if (list_empty(queue)) - return NULL; - - cp = list_first_entry(queue, struct cpuset, stack_list); - list_del(queue->next); - rcu_read_lock(); - cpuset_for_each_child(child, cont, cp) - list_add_tail(&child->stack_list, queue); - rcu_read_unlock(); - - return cp; -} - /** * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset * @cs: cpuset in interest @@ -2229,12 +2202,12 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* if cpus or mems went down, we need to propagate to descendants */ if (cpus_offlined || mems_offlined) { struct cpuset *cs; - LIST_HEAD(queue); + struct cgroup *pos_cgrp; - list_add_tail(&top_cpuset.stack_list, &queue); - while ((cs = cpuset_next(&queue))) - if (cs != &top_cpuset) - schedule_cpuset_propagate_hotplug(cs); + rcu_read_lock(); + cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) + schedule_cpuset_propagate_hotplug(cs); + rcu_read_unlock(); } mutex_unlock(&cpuset_mutex); From c431069fe4bacc0cd3ca94a8453987140a5b3517 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:08 -0800 Subject: [PATCH 15/18] cpuset: remove cpuset->parent cgroup already tracks the hierarchy. Follow cgroup->parent to find the parent and drop cpuset->parent. 
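For illustration, a sketch of the data-layout change this patch makes: rather than caching a parent pointer in the controller-private structure, derive it on demand from the generic hierarchy object that already tracks it. The struct and field names below are hypothetical stand-ins, not the cgroup API.

#include <stddef.h>

/* The generic hierarchy object already knows its parent... */
struct hnode {
        struct hnode *parent;
        void *private;                  /* back-pointer to the controller state */
};

/* ...so the controller state only needs a link to its hnode. */
struct state {
        struct hnode *node;
        int value;
};

/* Derived accessor: no duplicated parent pointer to keep in sync. */
struct state *parent_state(const struct state *s)
{
        struct hnode *pn = s->node->parent;

        return pn ? pn->private : NULL;
}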
Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Acked-by: Li Zefan --- kernel/cpuset.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0ddb48d40e4d..6aa5bbb5f33b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -87,8 +87,6 @@ struct cpuset { cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ - struct cpuset *parent; /* my parent */ - struct fmeter fmeter; /* memory_pressure filter */ /* @@ -120,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task) struct cpuset, css); } +static inline struct cpuset *parent_cs(const struct cpuset *cs) +{ + struct cgroup *pcgrp = cs->css.cgroup->parent; + + if (pcgrp) + return cgroup_cs(pcgrp); + return NULL; +} + #ifdef CONFIG_NUMA static inline bool task_has_mempolicy(struct task_struct *task) { @@ -323,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, struct cpumask *pmask) { while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) - cs = cs->parent; + cs = parent_cs(cs); if (cs) cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); else @@ -348,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) { while (cs && !nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) - cs = cs->parent; + cs = parent_cs(cs); if (cs) nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); @@ -461,7 +468,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) if (cur == &top_cpuset) goto out; - par = cur->parent; + par = parent_cs(cur); /* We must be a subset of our parent cpuset */ ret = -EACCES; @@ -1866,7 +1873,6 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) fmeter_init(&cs->fmeter); INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn); cs->relax_domain_level = -1; - cs->parent = cgroup_cs(cont->parent); return &cs->css; } @@ -1874,7 +1880,7 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) static int cpuset_css_online(struct cgroup *cgrp) { struct cpuset *cs = cgroup_cs(cgrp); - struct cpuset *parent = cs->parent; + struct cpuset *parent = parent_cs(cs); struct cpuset *tmp_cs; struct cgroup *pos_cg; @@ -2058,10 +2064,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * Find its next-highest non-empty parent, (top cpuset * has online cpus, so can't be empty). */ - parent = cs->parent; + parent = parent_cs(cs); while (cpumask_empty(parent->cpus_allowed) || nodes_empty(parent->mems_allowed)) - parent = parent->parent; + parent = parent_cs(parent); move_member_tasks_to_cpuset(cs, parent); } @@ -2373,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) */ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) { - while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) - cs = cs->parent; + while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) + cs = parent_cs(cs); return cs; } From f47b89c73ce7535339525c141a6a114db333f0bd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 11 Jan 2013 17:29:17 +0800 Subject: [PATCH 16/18] cpuset: update MAINTAINERS As Paul doesn't maintain cpusets anymore, I'll take over the maintainership. 
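Returning to the parent_cs() conversion in the previous patch: guarantee_online_cpus(), guarantee_online_mems() and nearest_hardwall_ancestor() all reduce to the same loop, climbing derived parents until a predicate holds or the root is reached. A hypothetical, self-contained sketch of that loop:

#include <stdbool.h>
#include <stddef.h>

struct state {
        struct state *parent;           /* NULL at the root */
        bool online;                    /* analogue of "has online cpus/mems" */
};

/* Walk up until an ancestor satisfies the predicate, falling back to the
 * root if nothing below it qualifies (the top cpuset is assumed to always
 * have online resources). */
struct state *nearest_usable_ancestor(struct state *s)
{
        while (s->parent && !s->online)
                s = s->parent;
        return s;
}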
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index fa309ab7ccbf..d58bc1222b57 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2128,10 +2128,10 @@ S: Maintained F: tools/power/cpupower CPUSETS -M: Paul Menage +M: Li Zefan W: http://www.bullopensource.org/cpuset/ W: http://oss.sgi.com/projects/cpusets/ -S: Supported +S: Maintained F: Documentation/cgroups/cpusets.txt F: include/linux/cpuset.h F: kernel/cpuset.c From 27e89ae5d6e94e30231ee89a7736f62d84ba4c6f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 15 Jan 2013 14:10:57 +0800 Subject: [PATCH 17/18] cpuset: fix RCU lockdep splat 5d21cc2db040d01f8c19b8602f6987813e1176b4 ("cpuset: replace cgroup_mutex locking with cpuset internal locking") incorrectly converted proc_cpuset_show() from cgroup_lock() to cpuset_mutex. proc_cpuset_show() is accessing cgroup hierarchy proper to determine cgroup path which can't be protected by cpuset_mutex. This triggered the following RCU warning. =============================== [ INFO: suspicious RCU usage. ] 3.8.0-rc3-next-20130114-sasha-00016-ga107525-dirty #262 Tainted: G W ------------------------------- include/linux/cgroup.h:534 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 1 2 locks held by trinity/7514: #0: (&p->lock){+.+.+.}, at: [] seq_read+0x3a/0x3e0 #1: (cpuset_mutex){+.+...}, at: [] proc_cpuset_show+0x84/0x190 stack backtrace: Pid: 7514, comm: trinity Tainted: G W +3.8.0-rc3-next-20130114-sasha-00016-ga107525-dirty #262 Call Trace: [] lockdep_rcu_suspicious+0x10b/0x120 [] proc_cpuset_show+0x111/0x190 [] seq_read+0x1b7/0x3e0 [] ? seq_lseek+0x110/0x110 [] do_loop_readv_writev+0x4b/0x90 [] do_readv_writev+0xf6/0x1d0 [] vfs_readv+0x3e/0x60 [] sys_readv+0x50/0xd0 [] tracesys+0xe1/0xe6 The operation can be performed under RCU read lock. Replace cpuset_mutex locking with RCU read locking. tj: Rewrote patch description. Reported-by: Sasha Levin Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6aa5bbb5f33b..1a675e446ef2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2678,15 +2678,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v) goto out_free; retval = -EINVAL; - mutex_lock(&cpuset_mutex); + rcu_read_lock(); css = task_subsys_state(tsk, cpuset_subsys_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); + rcu_read_unlock(); if (retval < 0) - goto out_unlock; + goto out_put_task; seq_puts(m, buf); seq_putc(m, '\n'); -out_unlock: - mutex_unlock(&cpuset_mutex); +out_put_task: put_task_struct(tsk); out_free: kfree(buf); From d127027baf98dce3ca31bec18c2c0e048ceda7c4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 15 Jan 2013 14:11:32 +0800 Subject: [PATCH 18/18] cpuset: drop spurious retval assignment in proc_cpuset_show() proc_cpuset_show() has a spurious -EINVAL assignment which does nothing. Remove it. This patch doesn't make any functional difference. tj: Rewrote patch description. 
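As an aside on the RCU fix in patch 17 above: the path lookup is a pure read of the hierarchy, so the whole parent walk can sit inside an RCU read-side critical section, with the lock dropped before the result is emitted. Below is a userspace sketch of that shape, assuming the liburcu library and its <urcu.h> primitives (rcu_register_thread(), rcu_read_lock()/rcu_read_unlock(), rcu_dereference()); the structs and helpers are hypothetical, not the kernel or cgroup API.

#include <urcu.h>               /* userspace RCU (liburcu); link with -lurcu */
#include <stdio.h>
#include <string.h>

struct node {
        const char *name;
        struct node *parent;
};

static struct node *task_node;  /* would be published with rcu_assign_pointer() */

/* Build a "/a/b"-style path.  The parent walk happens entirely under the
 * read-side lock, so the nodes cannot be freed underneath us; the lock is
 * dropped before the result is printed, mirroring how patch 17 drops
 * rcu_read_lock() before seq_puts(). */
static void show_path(void)
{
        char path[256] = "";
        const struct node *n;

        rcu_read_lock();
        for (n = rcu_dereference(task_node); n && n->parent; n = n->parent) {
                char tmp[256];

                snprintf(tmp, sizeof(tmp), "/%s%s", n->name, path);
                strncpy(path, tmp, sizeof(path) - 1);
                path[sizeof(path) - 1] = '\0';
        }
        rcu_read_unlock();

        printf("%s\n", path[0] ? path : "/");
}

int main(void)
{
        static struct node root  = { "", NULL };
        static struct node child = { "mygroup", &root };

        rcu_register_thread();  /* liburcu requires reader registration */
        task_node = &child;
        show_path();
        rcu_unregister_thread();
        return 0;
}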
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1a675e446ef2..16be7c9edfd7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2677,7 +2677,6 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v) if (!tsk) goto out_free; - retval = -EINVAL; rcu_read_lock(); css = task_subsys_state(tsk, cpuset_subsys_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);