From 51fae6da640edf9d266c94f36bc806c63c301991 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 21 Oct 2014 09:27:12 +0200 Subject: [PATCH 1/5] freezer: Do not freeze tasks killed by OOM killer Since f660daac474c6f (oom: thaw threads if oom killed thread is frozen before deferring) OOM killer relies on being able to thaw a frozen task to handle OOM situation but a3201227f803 (freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE) has reorganized the code and stopped clearing freeze flag in __thaw_task. This means that the target task only wakes up and goes into the fridge again because the freezing condition hasn't changed for it. This reintroduces the bug fixed by f660daac474c6f. Fix the issue by checking for TIF_MEMDIE thread flag in freezing_slow_path and exclude the task from freezing completely. If a task was already frozen it would get woken by __thaw_task from OOM killer and get out of freezer after rechecking freezing(). Changes since v1 - put TIF_MEMDIE check into freezing_slowpath rather than in __refrigerator as per Oleg - return __thaw_task into oom_scan_process_thread because oom_kill_process will not wake task in the fridge because it is sleeping uninterruptible [mhocko@suse.cz: rewrote the changelog] Fixes: a3201227f803 (freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE) Cc: 3.3+ # 3.3+ Signed-off-by: Cong Wang Signed-off-by: Michal Hocko Acked-by: Oleg Nesterov Signed-off-by: Rafael J. Wysocki --- kernel/freezer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/freezer.c b/kernel/freezer.c index aa6a8aadb911..8f9279b9c6d7 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -42,6 +42,9 @@ bool freezing_slow_path(struct task_struct *p) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; + if (test_thread_flag(TIF_MEMDIE)) + return false; + if (pm_nosig_freezing || cgroup_freezing(p)) return true; From c05eb32f472fb9f7f474c20ff6fa5bfe0cbedc05 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 21 Oct 2014 09:27:13 +0200 Subject: [PATCH 2/5] freezer: remove obsolete comments in __thaw_task() __thaw_task() no longer clears frozen flag since commit a3201227f803 (freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE). Reviewed-by: Michal Hocko Signed-off-by: Cong Wang Signed-off-by: Rafael J. Wysocki --- kernel/freezer.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/freezer.c b/kernel/freezer.c index 8f9279b9c6d7..a8900a3bc27a 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -150,12 +150,6 @@ void __thaw_task(struct task_struct *p) { unsigned long flags; - /* - * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to - * be visible to @p as waking up implies wmb. Waking up inside - * freezer_lock also prevents wakeups from leaking outside - * refrigerator. - */ spin_lock_irqsave(&freezer_lock, flags); if (frozen(p)) wake_up_process(p); From 5695be142e203167e3cb515ef86a88424f3524eb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 20 Oct 2014 18:12:32 +0200 Subject: [PATCH 3/5] OOM, PM: OOM killed task shouldn't escape PM suspend PM freezer relies on having all tasks frozen by the time devices are getting frozen so that no task will touch them while they are getting frozen. But OOM killer is allowed to kill an already frozen task in order to handle OOM situtation. In order to protect from late wake ups OOM killer is disabled after all tasks are frozen. This, however, still keeps a window open when a killed task didn't manage to die by the time freeze_processes finishes. Reduce the race window by checking all tasks after OOM killer has been disabled. This is still not race free completely unfortunately because oom_killer_disable cannot stop an already ongoing OOM killer so a task might still wake up from the fridge and get killed without freeze_processes noticing. Full synchronization of OOM and freezer is, however, too heavy weight for this highly unlikely case. Introduce and check oom_kills counter which gets incremented early when the allocator enters __alloc_pages_may_oom path and only check all the tasks if the counter changes during the freezing attempt. The counter is updated so early to reduce the race window since allocator checked oom_killer_disabled which is set by PM-freezing code. A false positive will push the PM-freezer into a slow path but that is not a big deal. Changes since v1 - push the re-check loop out of freeze_processes into check_frozen_processes and invert the condition to make the code more readable as per Rafael Fixes: f660daac474c6f (oom: thaw threads if oom killed thread is frozen before deferring) Cc: 3.2+ # 3.2+ Signed-off-by: Michal Hocko Signed-off-by: Rafael J. Wysocki --- include/linux/oom.h | 3 +++ kernel/power/process.c | 40 +++++++++++++++++++++++++++++++++++++++- mm/oom_kill.c | 17 +++++++++++++++++ mm/page_alloc.c | 8 ++++++++ 4 files changed, 67 insertions(+), 1 deletion(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 647395a1a550..e8d6e1058723 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -50,6 +50,9 @@ static inline bool oom_task_origin(const struct task_struct *p) extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); + +extern int oom_kills_count(void); +extern void note_oom_kill(void); extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int points, unsigned long totalpages, struct mem_cgroup *memcg, nodemask_t *nodemask, diff --git a/kernel/power/process.c b/kernel/power/process.c index 7b323221b9ee..5cc588c1abab 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,6 +108,28 @@ static int try_to_freeze_tasks(bool user_only) return todo ? -EBUSY : 0; } +/* + * Returns true if all freezable tasks (except for current) are frozen already + */ +static bool check_frozen_processes(void) +{ + struct task_struct *g, *p; + bool ret = true; + + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { + if (p != current && !freezer_should_skip(p) && + !frozen(p)) { + ret = false; + goto done; + } + } +done: + read_unlock(&tasklist_lock); + + return ret; +} + /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls @@ -118,6 +140,7 @@ static int try_to_freeze_tasks(bool user_only) int freeze_processes(void) { int error; + int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -132,12 +155,27 @@ int freeze_processes(void) pm_wakeup_clear(); printk("Freezing user space processes ... "); pm_freezing = true; + oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { - printk("done."); __usermodehelper_set_disable_depth(UMH_DISABLED); oom_killer_disable(); + + /* + * There might have been an OOM kill while we were + * freezing tasks and the killed task might be still + * on the way out so we have to double check for race. + */ + if (oom_kills_count() != oom_kills_saved && + !check_frozen_processes()) { + __usermodehelper_set_disable_depth(UMH_ENABLED); + printk("OOM in progress."); + error = -EBUSY; + goto done; + } + printk("done."); } +done: printk("\n"); BUG_ON(in_atomic()); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index bbf405a3a18f..5340f6b91312 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -404,6 +404,23 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, dump_tasks(memcg, nodemask); } +/* + * Number of OOM killer invocations (including memcg OOM killer). + * Primarily used by PM freezer to check for potential races with + * OOM killed frozen task. + */ +static atomic_t oom_kills = ATOMIC_INIT(0); + +int oom_kills_count(void) +{ + return atomic_read(&oom_kills); +} + +void note_oom_kill(void) +{ + atomic_inc(&oom_kills); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 736d8e1b6381..9cd36b822444 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2251,6 +2251,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, return NULL; } + /* + * PM-freezer should be notified that there might be an OOM killer on + * its way to kill and wake somebody up. This is too early and we might + * end up not killing anything but false positives are acceptable. + * See freeze_processes. + */ + note_oom_kill(); + /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if From a28e785a9f794ba32e603570ab52a262cf963489 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 21 Oct 2014 09:27:15 +0200 Subject: [PATCH 4/5] PM: convert do_each_thread to for_each_process_thread as per 0c740d0afc3b (introduce for_each_thread() to replace the buggy while_each_thread()) get rid of do_each_thread { } while_each_thread() construct and replace it by a more error prone for_each_thread. This patch doesn't introduce any user visible change. Suggested-by: Oleg Nesterov Signed-off-by: Michal Hocko Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/power/process.c b/kernel/power/process.c index 5cc588c1abab..7f0d4343af1b 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -46,13 +46,13 @@ static int try_to_freeze_tasks(bool user_only) while (true) { todo = 0; read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p == current || !freeze_task(p)) continue; if (!freezer_should_skip(p)) todo++; - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); if (!user_only) { @@ -93,11 +93,11 @@ static int try_to_freeze_tasks(bool user_only) if (!wakeup) { read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p != current && !freezer_should_skip(p) && freezing(p) && !frozen(p)) sched_show_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); } } else { @@ -229,11 +229,11 @@ void thaw_processes(void) thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); @@ -256,10 +256,10 @@ void thaw_kernel_threads(void) thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); schedule(); From 71be2114a5474a76edad95343d89b8731457fccd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 22 Oct 2014 22:47:32 +0200 Subject: [PATCH 5/5] PM / freezer: Clean up code after recent fixes Clean up the code in process.c after recent changes to get rid of unnecessary labels and goto statements. Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/kernel/power/process.c b/kernel/power/process.c index 7f0d4343af1b..5a6ec8678b9a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,25 +108,27 @@ static int try_to_freeze_tasks(bool user_only) return todo ? -EBUSY : 0; } +static bool __check_frozen_processes(void) +{ + struct task_struct *g, *p; + + for_each_process_thread(g, p) + if (p != current && !freezer_should_skip(p) && !frozen(p)) + return false; + + return true; +} + /* * Returns true if all freezable tasks (except for current) are frozen already */ static bool check_frozen_processes(void) { - struct task_struct *g, *p; - bool ret = true; + bool ret; read_lock(&tasklist_lock); - for_each_process_thread(g, p) { - if (p != current && !freezer_should_skip(p) && - !frozen(p)) { - ret = false; - goto done; - } - } -done: + ret = __check_frozen_processes(); read_unlock(&tasklist_lock); - return ret; } @@ -167,15 +169,14 @@ int freeze_processes(void) * on the way out so we have to double check for race. */ if (oom_kills_count() != oom_kills_saved && - !check_frozen_processes()) { + !check_frozen_processes()) { __usermodehelper_set_disable_depth(UMH_ENABLED); printk("OOM in progress."); error = -EBUSY; - goto done; + } else { + printk("done."); } - printk("done."); } -done: printk("\n"); BUG_ON(in_atomic());