summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- include/linux/memcontrol.h 21
-rw-r--r-- include/linux/sched.h 4
-rw-r--r-- mm/memcontrol.c 154
-rw-r--r-- mm/memory.c 3
-rw-r--r-- mm/oom_kill.c 7
5 files changed, 140 insertions, 49 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 2c911c95b1ac..64591ffc2e2c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -130,6 +130,10 @@ extern void mem_cgroup_replace_page_cache(struct page *oldpage,
*
* Toggle whether a failed memcg charge should invoke the OOM killer
* or just return -ENOMEM. Returns the previous toggle state.
+ *
+ * NOTE: Any path that enables the OOM killer before charging must
+ * call mem_cgroup_oom_synchronize() afterward to finalize the
+ * OOM handling and clean up.
*/
static inline bool mem_cgroup_toggle_oom(bool new)
{
@@ -155,6 +159,13 @@ static inline void mem_cgroup_disable_oom(void)
WARN_ON(old == false);
}
+static inline bool task_in_memcg_oom(struct task_struct *p)
+{
+ return p->memcg_oom.in_memcg_oom;
+}
+
+bool mem_cgroup_oom_synchronize(void);
+
#ifdef CONFIG_MEMCG_SWAP
extern int do_swap_account;
#endif
@@ -391,6 +402,16 @@ static inline void mem_cgroup_disable_oom(void)
{
}
+static inline bool task_in_memcg_oom(struct task_struct *p)
+{
+ return false;
+}
+
+static inline bool mem_cgroup_oom_synchronize(void)
+{
+ return false;
+}
+
static inline void mem_cgroup_inc_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx)
{
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 42a58ce480bc..a1b7e6ee453b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1413,6 +1413,10 @@ struct task_struct {
unsigned int memcg_kmem_skip_account;
struct memcg_oom_info {
unsigned int may_oom:1;
+ unsigned int in_memcg_oom:1;
+ unsigned int oom_locked:1;
+ int wakeups;
+ struct mem_cgroup *wait_on_memcg;
} memcg_oom;
#endif
#ifdef CONFIG_HAVE_HW_BREAKPOINT
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 333bb91ee3f2..7849660665d7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -302,6 +302,7 @@ struct mem_cgroup {
bool oom_lock;
atomic_t under_oom;
+ atomic_t oom_wakeups;
atomic_t refcnt;
@@ -2179,6 +2180,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
+ atomic_inc(&memcg->oom_wakeups);
/* for filtering, pass "memcg" as argument. */
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
@@ -2190,19 +2192,17 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
}
/*
- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ * try to call OOM killer
*/
-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
- int order)
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- struct oom_wait_info owait;
bool locked;
+ int wakeups;
- owait.memcg = memcg;
- owait.wait.flags = 0;
- owait.wait.func = memcg_oom_wake_function;
- owait.wait.private = current;
- INIT_LIST_HEAD(&owait.wait.task_list);
+ if (!current->memcg_oom.may_oom)
+ return;
+
+ current->memcg_oom.in_memcg_oom = 1;
/*
* As with any blocking lock, a contender needs to start
@@ -2210,12 +2210,8 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
* otherwise it can miss the wakeup from the unlock and sleep
* indefinitely. This is just open-coded because our locking
* is so particular to memcg hierarchies.
- *
- * Even if signal_pending(), we can't quit charge() loop without
- * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
- * under OOM is always welcomed, use TASK_KILLABLE here.
*/
- prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+ wakeups = atomic_read(&memcg->oom_wakeups);
mem_cgroup_mark_under_oom(memcg);
locked = mem_cgroup_oom_trylock(memcg);
@@ -2225,15 +2221,95 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
if (locked && !memcg->oom_kill_disable) {
mem_cgroup_unmark_under_oom(memcg);
- finish_wait(&memcg_oom_waitq, &owait.wait);
mem_cgroup_out_of_memory(memcg, mask, order);
+ mem_cgroup_oom_unlock(memcg);
+ /*
+ * There is no guarantee that an OOM-lock contender
+ * sees the wakeups triggered by the OOM kill
+ * uncharges. Wake any sleepers explicitly.
+ */
+ memcg_oom_recover(memcg);
} else {
- schedule();
- mem_cgroup_unmark_under_oom(memcg);
- finish_wait(&memcg_oom_waitq, &owait.wait);
+ /*
+ * A system call can just return -ENOMEM, but if this
+ * is a page fault and somebody else is handling the
+ * OOM already, we need to sleep on the OOM waitqueue
+ * for this memcg until the situation is resolved.
+ * Which can take some time because it might be
+ * handled by a userspace task.
+ *
+ * However, this is the charge context, which means
+ * that we may sit on a large call stack and hold
+ * various filesystem locks, the mmap_sem etc. and we
+ * don't want the OOM handler to deadlock on them
+ * while we sit here and wait. Store the current OOM
+ * context in the task_struct, then return -ENOMEM.
+ * At the end of the page fault handler, with the
+ * stack unwound, pagefault_out_of_memory() will check
+ * back with us by calling
+ * mem_cgroup_oom_synchronize(), possibly putting the
+ * task to sleep.
+ */
+ current->memcg_oom.oom_locked = locked;
+ current->memcg_oom.wakeups = wakeups;
+ css_get(&memcg->css);
+ current->memcg_oom.wait_on_memcg = memcg;
}
+}
+
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ *
+ * This has to be called at the end of a page fault if the memcg
+ * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ *
+ * Memcg supports userspace OOM handling, so failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation. Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to put the task to sleep and clean up the
+ * OOM state.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * finalized, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(void)
+{
+ struct oom_wait_info owait;
+ struct mem_cgroup *memcg;
+
+ /* OOM is global, do not handle */
+ if (!current->memcg_oom.in_memcg_oom)
+ return false;
+
+ /*
+ * We invoked the OOM killer but there is a chance that a kill
+ * did not free up any charges. Everybody else might already
+ * be sleeping, so restart the fault and keep the rampage
+ * going until some charges are released.
+ */
+ memcg = current->memcg_oom.wait_on_memcg;
+ if (!memcg)
+ goto out;
+
+ if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+ goto out_memcg;
+
+ owait.memcg = memcg;
+ owait.wait.flags = 0;
+ owait.wait.func = memcg_oom_wake_function;
+ owait.wait.private = current;
+ INIT_LIST_HEAD(&owait.wait.task_list);
- if (locked) {
+ prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+ /* Only sleep if we didn't miss any wakeups since OOM */
+ if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+ schedule();
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+out_memcg:
+ mem_cgroup_unmark_under_oom(memcg);
+ if (current->memcg_oom.oom_locked) {
mem_cgroup_oom_unlock(memcg);
/*
* There is no guarantee that an OOM-lock contender
@@ -2242,11 +2318,10 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
*/
memcg_oom_recover(memcg);
}
-
- if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
- return false;
- /* Give chance to dying process */
- schedule_timeout_uninterruptible(1);
+ css_put(&memcg->css);
+ current->memcg_oom.wait_on_memcg = NULL;
+out:
+ current->memcg_oom.in_memcg_oom = 0;
return true;
}
@@ -2559,12 +2634,11 @@ enum {
CHARGE_RETRY, /* need to retry but retry is not bad */
CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
- CHARGE_OOM_DIE, /* the current is killed because of OOM */
};
static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages, unsigned int min_pages,
- bool oom_check)
+ bool invoke_oom)
{
unsigned long csize = nr_pages * PAGE_SIZE;
struct mem_cgroup *mem_over_limit;
@@ -2621,14 +2695,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (mem_cgroup_wait_acct_move(mem_over_limit))
return CHARGE_RETRY;
- /* If we don't need to call oom-killer at el, return immediately */
- if (!oom_check || !current->memcg_oom.may_oom)
- return CHARGE_NOMEM;
- /* check OOM */
- if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
- return CHARGE_OOM_DIE;
+ if (invoke_oom)
+ mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
- return CHARGE_RETRY;
+ return CHARGE_NOMEM;
}
/*
@@ -2731,7 +2801,7 @@ again:
}
do {
- bool oom_check;
+ bool invoke_oom = oom && !nr_oom_retries;
/* If killed, bypass charge */
if (fatal_signal_pending(current)) {
@@ -2739,14 +2809,8 @@ again:
goto bypass;
}
- oom_check = false;
- if (oom && !nr_oom_retries) {
- oom_check = true;
- nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
- }
-
- ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
- oom_check);
+ ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
+ nr_pages, invoke_oom);
switch (ret) {
case CHARGE_OK:
break;
@@ -2759,16 +2823,12 @@ again:
css_put(&memcg->css);
goto nomem;
case CHARGE_NOMEM: /* OOM routine works */
- if (!oom) {
+ if (!oom || invoke_oom) {
css_put(&memcg->css);
goto nomem;
}
- /* If oom, we never return -ENOMEM */
nr_oom_retries--;
break;
- case CHARGE_OOM_DIE: /* Killed by OOM Killer */
- css_put(&memcg->css);
- goto bypass;
}
} while (ret != CHARGE_OK);
diff --git a/mm/memory.c b/mm/memory.c
index 59f450c5c0a3..c4ce987745e4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3867,6 +3867,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & FAULT_FLAG_USER)
mem_cgroup_disable_oom();
+ if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
+ mem_cgroup_oom_synchronize();
+
return ret;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f104c7e9f61e..1a582e3aee3e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -702,9 +702,12 @@ out:
*/
void pagefault_out_of_memory(void)
{
- struct zonelist *zonelist = node_zonelist(first_online_node,
- GFP_KERNEL);
+ struct zonelist *zonelist;
+ if (mem_cgroup_oom_synchronize())
+ return;
+
+ zonelist = node_zonelist(first_online_node, GFP_KERNEL);
if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
out_of_memory(NULL, 0, 0, NULL, false);
clear_zonelist_oom(zonelist, GFP_KERNEL);