diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 0071469ecbf1..259a4d5a4e8f 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -355,8 +355,9 @@ static struct sysrq_key_op sysrq_term_op = { static void moom_callback(struct work_struct *ignored) { - out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL, - 0, NULL, true); + if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), + GFP_KERNEL, 0, NULL, true)) + pr_info("OOM request ignored because killer is disabled\n"); } static DECLARE_WORK(moom_work, moom_callback); diff --git a/include/linux/oom.h b/include/linux/oom.h index b42b80f88c3a..d5771bed59c9 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -72,22 +72,14 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, bool force_kill); -extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, +extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *mask, bool force_kill); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); extern bool oom_killer_disabled; - -static inline void oom_killer_disable(void) -{ - oom_killer_disabled = true; -} - -static inline void oom_killer_enable(void) -{ - oom_killer_disabled = false; -} +extern bool oom_killer_disable(void); +extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); diff --git a/kernel/exit.c b/kernel/exit.c index 02b3d1ab2ec0..feff10bbb307 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); - unmark_oom_victim(); + if (test_thread_flag(TIF_MEMDIE)) + unmark_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) diff --git a/kernel/power/process.c b/kernel/power/process.c index 3ac45f192e9f..564f786df470 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,30 +108,6 @@ static int try_to_freeze_tasks(bool user_only) return todo ? -EBUSY : 0; } -static bool __check_frozen_processes(void) -{ - struct task_struct *g, *p; - - for_each_process_thread(g, p) - if (p != current && !freezer_should_skip(p) && !frozen(p)) - return false; - - return true; -} - -/* - * Returns true if all freezable tasks (except for current) are frozen already - */ -static bool check_frozen_processes(void) -{ - bool ret; - - read_lock(&tasklist_lock); - ret = __check_frozen_processes(); - read_unlock(&tasklist_lock); - return ret; -} - /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls @@ -142,7 +118,6 @@ static bool check_frozen_processes(void) int freeze_processes(void) { int error; - int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -157,29 +132,22 @@ int freeze_processes(void) pm_wakeup_clear(); pr_info("Freezing user space processes ... "); pm_freezing = true; - oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { __usermodehelper_set_disable_depth(UMH_DISABLED); - oom_killer_disable(); - - /* - * There might have been an OOM kill while we were - * freezing tasks and the killed task might be still - * on the way out so we have to double check for race. - */ - if (oom_kills_count() != oom_kills_saved && - !check_frozen_processes()) { - __usermodehelper_set_disable_depth(UMH_ENABLED); - pr_cont("OOM in progress."); - error = -EBUSY; - } else { - pr_cont("done."); - } + pr_cont("done."); } pr_cont("\n"); BUG_ON(in_atomic()); + /* + * Now that the whole userspace is frozen we need to disbale + * the OOM killer to disallow any further interference with + * killable tasks. + */ + if (!error && !oom_killer_disable()) + error = -EBUSY; + if (error) thaw_processes(); return error; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fe4d258ef32b..fbf64e6f64e4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1930,7 +1930,7 @@ bool mem_cgroup_oom_synchronize(bool handle) if (!memcg) return false; - if (!handle) + if (!handle || oom_killer_disabled) goto cleanup; owait.memcg = memcg; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3cbd76b8c13b..b8df76ee2be3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -398,30 +398,27 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, } /* - * Number of OOM killer invocations (including memcg OOM killer). - * Primarily used by PM freezer to check for potential races with - * OOM killed frozen task. + * Number of OOM victims in flight */ -static atomic_t oom_kills = ATOMIC_INIT(0); +static atomic_t oom_victims = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); -int oom_kills_count(void) -{ - return atomic_read(&oom_kills); -} - -void note_oom_kill(void) -{ - atomic_inc(&oom_kills); -} +bool oom_killer_disabled __read_mostly; +static DECLARE_RWSEM(oom_sem); /** * mark_tsk_oom_victim - marks the given taks as OOM victim. * @tsk: task to mark + * + * Has to be called with oom_sem taken for read and never after + * oom has been disabled already. */ void mark_tsk_oom_victim(struct task_struct *tsk) { - set_tsk_thread_flag(tsk, TIF_MEMDIE); - + WARN_ON(oom_killer_disabled); + /* OOM killer might race with memcg OOM */ + if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) + return; /* * Make sure that the task is woken up from uninterruptible sleep * if it is frozen because OOM killer wouldn't be able to free @@ -429,14 +426,70 @@ void mark_tsk_oom_victim(struct task_struct *tsk) * that TIF_MEMDIE tasks should be ignored. */ __thaw_task(tsk); + atomic_inc(&oom_victims); } /** * unmark_oom_victim - unmarks the current task as OOM victim. + * + * Wakes up all waiters in oom_killer_disable() */ void unmark_oom_victim(void) { - clear_thread_flag(TIF_MEMDIE); + if (!test_and_clear_thread_flag(TIF_MEMDIE)) + return; + + down_read(&oom_sem); + /* + * There is no need to signal the lasst oom_victim if there + * is nobody who cares. + */ + if (!atomic_dec_return(&oom_victims) && oom_killer_disabled) + wake_up_all(&oom_victims_wait); + up_read(&oom_sem); +} + +/** + * oom_killer_disable - disable OOM killer + * + * Forces all page allocations to fail rather than trigger OOM killer. + * Will block and wait until all OOM victims are killed. + * + * The function cannot be called when there are runnable user tasks because + * the userspace would see unexpected allocation failures as a result. Any + * new usage of this function should be consulted with MM people. + * + * Returns true if successful and false if the OOM killer cannot be + * disabled. + */ +bool oom_killer_disable(void) +{ + /* + * Make sure to not race with an ongoing OOM killer + * and that the current is not the victim. + */ + down_write(&oom_sem); + if (test_thread_flag(TIF_MEMDIE)) { + up_write(&oom_sem); + return false; + } + + oom_killer_disabled = true; + up_write(&oom_sem); + + wait_event(oom_victims_wait, !atomic_read(&oom_victims)); + + return true; +} + +/** + * oom_killer_enable - enable OOM killer + */ +void oom_killer_enable(void) +{ + down_write(&oom_sem); + oom_killer_disabled = false; + up_write(&oom_sem); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -637,7 +690,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) } /** - * out_of_memory - kill the "best" process when we run out of memory + * __out_of_memory - kill the "best" process when we run out of memory * @zonelist: zonelist pointer * @gfp_mask: memory allocation flags * @order: amount of memory being requested as a power of 2 @@ -649,7 +702,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ -void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, +static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *nodemask, bool force_kill) { const nodemask_t *mpol_mask; @@ -718,6 +771,32 @@ out: schedule_timeout_killable(1); } +/** + * out_of_memory - tries to invoke OOM killer. + * @zonelist: zonelist pointer + * @gfp_mask: memory allocation flags + * @order: amount of memory being requested as a power of 2 + * @nodemask: nodemask passed to page allocator + * @force_kill: true if a task must be killed, even if others are exiting + * + * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable() + * when it returns false. Otherwise returns true. + */ +bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, + int order, nodemask_t *nodemask, bool force_kill) +{ + bool ret = false; + + down_read(&oom_sem); + if (!oom_killer_disabled) { + __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill); + ret = true; + } + up_read(&oom_sem); + + return ret; +} + /* * The pagefault handler calls here because it is out of memory, so kill a * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a @@ -727,12 +806,25 @@ void pagefault_out_of_memory(void) { struct zonelist *zonelist; + down_read(&oom_sem); if (mem_cgroup_oom_synchronize(true)) - return; + goto unlock; zonelist = node_zonelist(first_memory_node, GFP_KERNEL); if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { - out_of_memory(NULL, 0, 0, NULL, false); + if (!oom_killer_disabled) + __out_of_memory(NULL, 0, 0, NULL, false); + else + /* + * There shouldn't be any user tasks runable while the + * OOM killer is disabled so the current task has to + * be a racing OOM victim for which oom_killer_disable() + * is waiting for. + */ + WARN_ON(test_thread_flag(TIF_MEMDIE)); + oom_zonelist_unlock(zonelist, GFP_KERNEL); } +unlock: + up_read(&oom_sem); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 641d5a9a8617..134e25525044 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -244,8 +244,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype) PB_migrate, PB_migrate_end); } -bool oom_killer_disabled __read_mostly; - #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -2317,9 +2315,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, *did_some_progress = 0; - if (oom_killer_disabled) - return NULL; - /* * Acquire the per-zone oom lock for each zone. If that * fails, somebody else is making progress for us. @@ -2330,14 +2325,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, return NULL; } - /* - * PM-freezer should be notified that there might be an OOM killer on - * its way to kill and wake somebody up. This is too early and we might - * end up not killing anything but false positives are acceptable. - * See freeze_processes. - */ - note_oom_kill(); - /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if @@ -2372,8 +2359,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false); - *did_some_progress = 1; + if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) + *did_some_progress = 1; out: oom_zonelist_unlock(ac->zonelist, gfp_mask); return page;