summaryrefslogtreecommitdiff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c1451
1 files changed, 795 insertions, 656 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7f1a356153c0..29501f040568 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,16 +45,17 @@
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
+#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
-#include <linux/vmalloc.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
+#include <linux/file.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -65,8 +66,8 @@
#include <trace/events/vmscan.h>
-struct cgroup_subsys mem_cgroup_subsys __read_mostly;
-EXPORT_SYMBOL(mem_cgroup_subsys);
+struct cgroup_subsys memory_cgrp_subsys __read_mostly;
+EXPORT_SYMBOL(memory_cgrp_subsys);
#define MEM_CGROUP_RECLAIM_RETRIES 5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -148,7 +149,7 @@ struct mem_cgroup_reclaim_iter {
* matches memcg->dead_count of the hierarchy root group.
*/
struct mem_cgroup *last_visited;
- unsigned long last_dead_count;
+ int last_dead_count;
/* scan generation, increased every round-trip */
unsigned int generation;
@@ -227,6 +228,46 @@ struct mem_cgroup_eventfd_list {
struct eventfd_ctx *eventfd;
};
+/*
+ * cgroup_event represents events which userspace want to receive.
+ */
+struct mem_cgroup_event {
+ /*
+ * memcg which the event belongs to.
+ */
+ struct mem_cgroup *memcg;
+ /*
+ * eventfd to signal userspace about the event.
+ */
+ struct eventfd_ctx *eventfd;
+ /*
+ * Each of these stored in a list by the cgroup.
+ */
+ struct list_head list;
+ /*
+ * register_event() callback will be used to add new userspace
+ * waiter for changes related to this event. Use eventfd_signal()
+ * on eventfd to send notification to userspace.
+ */
+ int (*register_event)(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args);
+ /*
+ * unregister_event() callback will be called when userspace closes
+ * the eventfd or on cgroup removing. This callback must be set,
+ * if you want provide notification functionality.
+ */
+ void (*unregister_event)(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd);
+ /*
+ * All fields below needed to unregister event when
+ * userspace closes eventfd.
+ */
+ poll_table pt;
+ wait_queue_head_t *wqh;
+ wait_queue_t wait;
+ struct work_struct remove;
+};
+
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
@@ -331,27 +372,20 @@ struct mem_cgroup {
atomic_t numainfo_updating;
#endif
+ /* List of events which userspace want to receive */
+ struct list_head event_list;
+ spinlock_t event_list_lock;
+
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
};
-static size_t memcg_size(void)
-{
- return sizeof(struct mem_cgroup) +
- nr_node_ids * sizeof(struct mem_cgroup_per_node *);
-}
-
/* internal only representation about the status of kmem accounting. */
enum {
- KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
- KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
+ KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};
-/* We account when limit is on, but only after call sites are patched */
-#define KMEM_ACCOUNTED_MASK \
- ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
-
#ifdef CONFIG_MEMCG_KMEM
static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
{
@@ -363,16 +397,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}
-static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
-{
- set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
-static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
-{
- clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
/*
@@ -490,11 +514,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
-struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
-{
- return &mem_cgroup_from_css(css)->vmpressure;
-}
-
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
return (memcg == root_mem_cgroup);
@@ -519,7 +538,7 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
struct cgroup_subsys_state *css;
- css = css_from_id(id - 1, &mem_cgroup_subsys);
+ css = css_from_id(id - 1, &memory_cgrp_subsys);
return mem_cgroup_from_css(css);
}
@@ -902,8 +921,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
struct page *page,
bool anon, int nr_pages)
{
- preempt_disable();
-
/*
* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
* counted as CACHE even if it's on ANON LRU.
@@ -928,8 +945,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
}
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
-
- preempt_enable();
}
unsigned long
@@ -1053,25 +1068,18 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
if (unlikely(!p))
return NULL;
- return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id));
+ return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
-struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
struct mem_cgroup *memcg = NULL;
- if (!mm)
- return NULL;
- /*
- * Because we have no locks, mm->owner's may be being moved to other
- * cgroup. We use css_tryget() here even if this looks
- * pessimistic (rather than adding locks here).
- */
rcu_read_lock();
do {
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (unlikely(!memcg))
- break;
+ memcg = root_mem_cgroup;
} while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
@@ -1098,16 +1106,22 @@ skip_node:
* skipped and we should continue the tree walk.
* last_visited css is safe to use because it is
* protected by css_get and the tree walk is rcu safe.
+ *
+ * We do not take a reference on the root of the tree walk
+ * because we might race with the root removal when it would
+ * be the only node in the iterated hierarchy and mem_cgroup_iter
+ * would end up in an endless loop because it expects that at
+ * least one valid node will be returned. Root cannot disappear
+ * because caller of the iterator should hold it already so
+ * skipping css reference should be safe.
*/
if (next_css) {
- struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
+ if ((next_css == &root->css) ||
+ ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)))
+ return mem_cgroup_from_css(next_css);
- if (css_tryget(&mem->css))
- return mem;
- else {
- prev_css = next_css;
- goto skip_node;
- }
+ prev_css = next_css;
+ goto skip_node;
}
return NULL;
@@ -1141,7 +1155,15 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
if (iter->last_dead_count == *sequence) {
smp_rmb();
position = iter->last_visited;
- if (position && !css_tryget(&position->css))
+
+ /*
+ * We cannot take a reference to root because we might race
+ * with root removal and returning NULL would end up in
+ * an endless loop on the iterator user level when root
+ * would be returned all the time.
+ */
+ if (position && position != root &&
+ !css_tryget(&position->css))
position = NULL;
}
return position;
@@ -1150,9 +1172,11 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
struct mem_cgroup *last_visited,
struct mem_cgroup *new_position,
+ struct mem_cgroup *root,
int sequence)
{
- if (last_visited)
+ /* root reference counting symmetric to mem_cgroup_iter_load */
+ if (last_visited && last_visited != root)
css_put(&last_visited->css);
/*
* We store the sequence count from the time @last_visited was
@@ -1227,7 +1251,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
memcg = __mem_cgroup_iter_next(root, last_visited);
if (reclaim) {
- mem_cgroup_iter_update(iter, last_visited, memcg, seq);
+ mem_cgroup_iter_update(iter, last_visited, memcg, root,
+ seq);
if (!memcg)
iter->generation++;
@@ -1450,7 +1475,7 @@ bool task_in_mem_cgroup(struct task_struct *task,
p = find_lock_task_mm(task);
if (p) {
- curr = try_get_mem_cgroup_from_mm(p->mm);
+ curr = get_mem_cgroup_from_mm(p->mm);
task_unlock(p);
} else {
/*
@@ -1464,8 +1489,6 @@ bool task_in_mem_cgroup(struct task_struct *task,
css_get(&curr->css);
rcu_read_unlock();
}
- if (!curr)
- return false;
/*
* We should check use_hierarchy of "memcg" not "curr". Because checking
* use_hierarchy of "curr" here make this function true if hierarchy is
@@ -1647,53 +1670,25 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
- struct cgroup *task_cgrp;
- struct cgroup *mem_cgrp;
- /*
- * Need a buffer in BSS, can't rely on allocations. The code relies
- * on the assumption that OOM is serialized for memory controller.
- * If this assumption is broken, revisit this code.
- */
- static char memcg_name[PATH_MAX];
- int ret;
+ /* oom_info_lock ensures that parallel ooms do not interleave */
+ static DEFINE_MUTEX(oom_info_lock);
struct mem_cgroup *iter;
unsigned int i;
if (!p)
return;
+ mutex_lock(&oom_info_lock);
rcu_read_lock();
- mem_cgrp = memcg->css.cgroup;
- task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
-
- ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
- if (ret < 0) {
- /*
- * Unfortunately, we are unable to convert to a useful name
- * But we'll still print out the usage information
- */
- rcu_read_unlock();
- goto done;
- }
- rcu_read_unlock();
-
- pr_info("Task in %s killed", memcg_name);
+ pr_info("Task in ");
+ pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
+ pr_info(" killed as a result of limit of ");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ pr_info("\n");
- rcu_read_lock();
- ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
- if (ret < 0) {
- rcu_read_unlock();
- goto done;
- }
rcu_read_unlock();
- /*
- * Continues from above, so we don't need an KERN_ level
- */
- pr_cont(" as a result of limit of %s\n", memcg_name);
-done:
-
pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
@@ -1708,13 +1703,8 @@ done:
res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
for_each_mem_cgroup_tree(iter, memcg) {
- pr_info("Memory cgroup stats");
-
- rcu_read_lock();
- ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
- if (!ret)
- pr_cont(" for %s", memcg_name);
- rcu_read_unlock();
+ pr_info("Memory cgroup stats for ");
+ pr_cont_cgroup_path(iter->css.cgroup);
pr_cont(":");
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
@@ -1730,6 +1720,7 @@ done:
pr_cont("\n");
}
+ mutex_unlock(&oom_info_lock);
}
/*
@@ -1822,13 +1813,18 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
break;
};
points = oom_badness(task, memcg, NULL, totalpages);
- if (points > chosen_points) {
- if (chosen)
- put_task_struct(chosen);
- chosen = task;
- chosen_points = points;
- get_task_struct(chosen);
- }
+ if (!points || points < chosen_points)
+ continue;
+ /* Prefer thread group leaders for display purposes */
+ if (points == chosen_points &&
+ thread_group_leader(chosen))
+ continue;
+
+ if (chosen)
+ put_task_struct(chosen);
+ chosen = task;
+ chosen_points = points;
+ get_task_struct(chosen);
}
css_task_iter_end(&it);
}
@@ -2579,7 +2575,7 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
}
-/* See __mem_cgroup_try_charge() for details */
+/* See mem_cgroup_try_charge() for details */
enum {
CHARGE_OK, /* success */
CHARGE_RETRY, /* need to retry but retry is not bad */
@@ -2652,45 +2648,34 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
return CHARGE_NOMEM;
}
-/*
- * __mem_cgroup_try_charge() does
- * 1. detect memcg to be charged against from passed *mm and *ptr,
- * 2. update res_counter
- * 3. call memory reclaim if necessary.
- *
- * In some special case, if the task is fatal, fatal_signal_pending() or
- * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
- * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
- * as possible without any hazards. 2: all pages should have a valid
- * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
- * pointer, that is treated as a charge to root_mem_cgroup.
- *
- * So __mem_cgroup_try_charge() will return
- * 0 ... on success, filling *ptr with a valid memcg pointer.
- * -ENOMEM ... charge failure because of resource limits.
- * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup.
+/**
+ * mem_cgroup_try_charge - try charging a memcg
+ * @memcg: memcg to charge
+ * @nr_pages: number of pages to charge
+ * @oom: trigger OOM if reclaim fails
*
- * Unlike the exported interface, an "oom" parameter is added. if oom==true,
- * the oom-killer can be invoked.
+ * Returns 0 if @memcg was charged successfully, -EINTR if the charge
+ * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
*/
-static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask,
- unsigned int nr_pages,
- struct mem_cgroup **ptr,
- bool oom)
+static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
+ gfp_t gfp_mask,
+ unsigned int nr_pages,
+ bool oom)
{
unsigned int batch = max(CHARGE_BATCH, nr_pages);
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct mem_cgroup *memcg = NULL;
int ret;
+ if (mem_cgroup_is_root(memcg))
+ goto done;
/*
- * Unlike gloval-vm's OOM-kill, we're not in memory shortage
- * in system level. So, allow to go ahead dying process in addition to
- * MEMDIE process.
+ * Unlike in global OOM situations, memcg is not in a physical
+ * memory shortage. Allow dying and OOM-killed tasks to
+ * bypass the last charges so that they can exit quickly and
+ * free their memory.
*/
- if (unlikely(test_thread_flag(TIF_MEMDIE)
- || fatal_signal_pending(current)))
+ if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+ fatal_signal_pending(current)))
goto bypass;
if (unlikely(task_in_memcg_oom(current)))
@@ -2698,73 +2683,16 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
if (gfp_mask & __GFP_NOFAIL)
oom = false;
-
- /*
- * We always charge the cgroup the mm_struct belongs to.
- * The mm_struct's mem_cgroup changes on task migration if the
- * thread group leader migrates. It's possible that mm is not
- * set, if so charge the root memcg (happens for pagecache usage).
- */
- if (!*ptr && !mm)
- *ptr = root_mem_cgroup;
again:
- if (*ptr) { /* css should be a valid one */
- memcg = *ptr;
- if (mem_cgroup_is_root(memcg))
- goto done;
- if (consume_stock(memcg, nr_pages))
- goto done;
- css_get(&memcg->css);
- } else {
- struct task_struct *p;
-
- rcu_read_lock();
- p = rcu_dereference(mm->owner);
- /*
- * Because we don't have task_lock(), "p" can exit.
- * In that case, "memcg" can point to root or p can be NULL with
- * race with swapoff. Then, we have small risk of mis-accouning.
- * But such kind of mis-account by race always happens because
- * we don't have cgroup_mutex(). It's overkill and we allo that
- * small race, here.
- * (*) swapoff at el will charge against mm-struct not against
- * task-struct. So, mm->owner can be NULL.
- */
- memcg = mem_cgroup_from_task(p);
- if (!memcg)
- memcg = root_mem_cgroup;
- if (mem_cgroup_is_root(memcg)) {
- rcu_read_unlock();
- goto done;
- }
- if (consume_stock(memcg, nr_pages)) {
- /*
- * It seems dagerous to access memcg without css_get().
- * But considering how consume_stok works, it's not
- * necessary. If consume_stock success, some charges
- * from this memcg are cached on this cpu. So, we
- * don't need to call css_get()/css_tryget() before
- * calling consume_stock().
- */
- rcu_read_unlock();
- goto done;
- }
- /* after here, we may be blocked. we need to get refcnt */
- if (!css_tryget(&memcg->css)) {
- rcu_read_unlock();
- goto again;
- }
- rcu_read_unlock();
- }
+ if (consume_stock(memcg, nr_pages))
+ goto done;
do {
bool invoke_oom = oom && !nr_oom_retries;
/* If killed, bypass charge */
- if (fatal_signal_pending(current)) {
- css_put(&memcg->css);
+ if (fatal_signal_pending(current))
goto bypass;
- }
ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
nr_pages, invoke_oom);
@@ -2773,17 +2701,12 @@ again:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
batch = nr_pages;
- css_put(&memcg->css);
- memcg = NULL;
goto again;
case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
- css_put(&memcg->css);
goto nomem;
case CHARGE_NOMEM: /* OOM routine works */
- if (!oom || invoke_oom) {
- css_put(&memcg->css);
+ if (!oom || invoke_oom)
goto nomem;
- }
nr_oom_retries--;
break;
}
@@ -2791,20 +2714,44 @@ again:
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
- css_put(&memcg->css);
done:
- *ptr = memcg;
return 0;
nomem:
- if (!(gfp_mask & __GFP_NOFAIL)) {
- *ptr = NULL;
+ if (!(gfp_mask & __GFP_NOFAIL))
return -ENOMEM;
- }
bypass:
- *ptr = root_mem_cgroup;
return -EINTR;
}
+/**
+ * mem_cgroup_try_charge_mm - try charging a mm
+ * @mm: mm_struct to charge
+ * @nr_pages: number of pages to charge
+ * @oom: trigger OOM if reclaim fails
+ *
+ * Returns the charged mem_cgroup associated with the given mm_struct or
+ * NULL the charge failed.
+ */
+static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
+ gfp_t gfp_mask,
+ unsigned int nr_pages,
+ bool oom)
+
+{
+ struct mem_cgroup *memcg;
+ int ret;
+
+ memcg = get_mem_cgroup_from_mm(mm);
+ ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
+ css_put(&memcg->css);
+ if (ret == -EINTR)
+ memcg = root_mem_cgroup;
+ else if (ret)
+ memcg = NULL;
+
+ return memcg;
+}
+
/*
* Somemtimes we have to undo a charge we got by try_charge().
* This function is for that and do uncharge, put css's refcnt.
@@ -2861,7 +2808,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
unsigned short id;
swp_entry_t ent;
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
@@ -2895,7 +2842,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
bool anon;
lock_page_cgroup(pc);
- VM_BUG_ON(PageCgroupUsed(pc));
+ VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
/*
* we don't need page_cgroup_lock about tail pages, becase they are not
* accessed by any other context at this point.
@@ -2930,7 +2877,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
if (lrucare) {
if (was_on_lru) {
lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, page_lru(page));
}
@@ -2956,10 +2903,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
static DEFINE_MUTEX(set_limit_mutex);
#ifdef CONFIG_MEMCG_KMEM
+static DEFINE_MUTEX(activate_kmem_mutex);
+
static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
{
return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
- (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
+ memcg_kmem_is_active(memcg);
}
/*
@@ -2976,10 +2925,9 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
}
#ifdef CONFIG_SLABINFO
-static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *m)
+static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
struct memcg_cache_params *params;
if (!memcg_can_account_kmem(memcg))
@@ -2999,20 +2947,17 @@ static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
{
struct res_counter *fail_res;
- struct mem_cgroup *_memcg;
int ret = 0;
ret = res_counter_charge(&memcg->kmem, size, &fail_res);
if (ret)
return ret;
- _memcg = memcg;
- ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
- &_memcg, oom_gfp_allowed(gfp));
-
+ ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT,
+ oom_gfp_allowed(gfp));
if (ret == -EINTR) {
/*
- * __mem_cgroup_try_charge() chosed to bypass to root due to
+ * mem_cgroup_try_charge() chosed to bypass to root due to
* OOM kill or fatal signal. Since our only options are to
* either fail the allocation or charge it to this cgroup, do
* it as a temporary condition. But we can't fail. From a
@@ -3022,7 +2967,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
*
* This condition will only trigger if the task entered
* memcg_charge_kmem in a sane state, but was OOM-killed during
- * __mem_cgroup_try_charge() above. Tasks that were already
+ * mem_cgroup_try_charge() above. Tasks that were already
* dying when the allocation triggers should have been already
* directed to the root cgroup in memcontrol.h
*/
@@ -3059,16 +3004,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
css_put(&memcg->css);
}
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
-{
- if (!memcg)
- return;
-
- mutex_lock(&memcg->slab_caches_mutex);
- list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
- mutex_unlock(&memcg->slab_caches_mutex);
-}
-
/*
* helper for acessing a memcg's index. It will be used as an index in the
* child cache array in kmem_cache, and also to derive its name. This function
@@ -3079,43 +3014,6 @@ int memcg_cache_id(struct mem_cgroup *memcg)
return memcg ? memcg->kmemcg_id : -1;
}
-/*
- * This ends up being protected by the set_limit mutex, during normal
- * operation, because that is its main call site.
- *
- * But when we create a new cache, we can call this as well if its parent
- * is kmem-limited. That will have to hold set_limit_mutex as well.
- */
-int memcg_update_cache_sizes(struct mem_cgroup *memcg)
-{
- int num, ret;
-
- num = ida_simple_get(&kmem_limited_groups,
- 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
- if (num < 0)
- return num;
- /*
- * After this point, kmem_accounted (that we test atomically in
- * the beginning of this conditional), is no longer 0. This
- * guarantees only one process will set the following boolean
- * to true. We don't need test_and_set because we're protected
- * by the set_limit_mutex anyway.
- */
- memcg_kmem_set_activated(memcg);
-
- ret = memcg_update_all_caches(num+1);
- if (ret) {
- ida_simple_remove(&kmem_limited_groups, num);
- memcg_kmem_clear_activated(memcg);
- return ret;
- }
-
- memcg->kmemcg_id = num;
- INIT_LIST_HEAD(&memcg->memcg_slab_caches);
- mutex_init(&memcg->slab_caches_mutex);
- return 0;
-}
-
static size_t memcg_caches_array_size(int num_groups)
{
ssize_t size;
@@ -3152,18 +3050,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
if (num_groups > memcg_limited_groups_array_size) {
int i;
+ struct memcg_cache_params *new_params;
ssize_t size = memcg_caches_array_size(num_groups);
size *= sizeof(void *);
size += offsetof(struct memcg_cache_params, memcg_caches);
- s->memcg_params = kzalloc(size, GFP_KERNEL);
- if (!s->memcg_params) {
- s->memcg_params = cur_params;
+ new_params = kzalloc(size, GFP_KERNEL);
+ if (!new_params)
return -ENOMEM;
- }
- s->memcg_params->is_root_cache = true;
+ new_params->is_root_cache = true;
/*
* There is the chance it will be bigger than
@@ -3177,7 +3074,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
for (i = 0; i < memcg_limited_groups_array_size; i++) {
if (!cur_params->memcg_caches[i])
continue;
- s->memcg_params->memcg_caches[i] =
+ new_params->memcg_caches[i] =
cur_params->memcg_caches[i];
}
@@ -3190,13 +3087,38 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
* bigger than the others. And all updates will reset this
* anyway.
*/
- kfree(cur_params);
+ rcu_assign_pointer(s->memcg_params, new_params);
+ if (cur_params)
+ kfree_rcu(cur_params, rcu_head);
}
return 0;
}
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
- struct kmem_cache *root_cache)
+char *memcg_create_cache_name(struct mem_cgroup *memcg,
+ struct kmem_cache *root_cache)
+{
+ static char *buf = NULL;
+
+ /*
+ * We need a mutex here to protect the shared buffer. Since this is
+ * expected to be called only on cache creation, we can employ the
+ * slab_mutex for that purpose.
+ */
+ lockdep_assert_held(&slab_mutex);
+
+ if (!buf) {
+ buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!buf)
+ return NULL;
+ }
+
+ cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1);
+ return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
+ memcg_cache_id(memcg), buf);
+}
+
+int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+ struct kmem_cache *root_cache)
{
size_t size;
@@ -3218,41 +3140,91 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
s->memcg_params->root_cache = root_cache;
INIT_WORK(&s->memcg_params->destroy,
kmem_cache_destroy_work_func);
+ css_get(&memcg->css);
} else
s->memcg_params->is_root_cache = true;
return 0;
}
-void memcg_release_cache(struct kmem_cache *s)
+void memcg_free_cache_params(struct kmem_cache *s)
+{
+ if (!s->memcg_params)
+ return;
+ if (!s->memcg_params->is_root_cache)
+ css_put(&s->memcg_params->memcg->css);
+ kfree(s->memcg_params);
+}
+
+void memcg_register_cache(struct kmem_cache *s)
{
struct kmem_cache *root;
struct mem_cgroup *memcg;
int id;
- /*
- * This happens, for instance, when a root cache goes away before we
- * add any memcg.
- */
- if (!s->memcg_params)
+ if (is_root_cache(s))
return;
- if (s->memcg_params->is_root_cache)
- goto out;
+ /*
+ * Holding the slab_mutex assures nobody will touch the memcg_caches
+ * array while we are modifying it.
+ */
+ lockdep_assert_held(&slab_mutex);
+ root = s->memcg_params->root_cache;
memcg = s->memcg_params->memcg;
- id = memcg_cache_id(memcg);
+ id = memcg_cache_id(memcg);
+
+ /*
+ * Since readers won't lock (see cache_from_memcg_idx()), we need a
+ * barrier here to ensure nobody will see the kmem_cache partially
+ * initialized.
+ */
+ smp_wmb();
+
+ /*
+ * Initialize the pointer to this cache in its parent's memcg_params
+ * before adding it to the memcg_slab_caches list, otherwise we can
+ * fail to convert memcg_params_to_cache() while traversing the list.
+ */
+ VM_BUG_ON(root->memcg_params->memcg_caches[id]);
+ root->memcg_params->memcg_caches[id] = s;
+
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
+ mutex_unlock(&memcg->slab_caches_mutex);
+}
+
+void memcg_unregister_cache(struct kmem_cache *s)
+{
+ struct kmem_cache *root;
+ struct mem_cgroup *memcg;
+ int id;
+
+ if (is_root_cache(s))
+ return;
+
+ /*
+ * Holding the slab_mutex assures nobody will touch the memcg_caches
+ * array while we are modifying it.
+ */
+ lockdep_assert_held(&slab_mutex);
root = s->memcg_params->root_cache;
- root->memcg_params->memcg_caches[id] = NULL;
+ memcg = s->memcg_params->memcg;
+ id = memcg_cache_id(memcg);
mutex_lock(&memcg->slab_caches_mutex);
list_del(&s->memcg_params->list);
mutex_unlock(&memcg->slab_caches_mutex);
- css_put(&memcg->css);
-out:
- kfree(s->memcg_params);
+ /*
+ * Clear the pointer to this cache in its parent's memcg_params only
+ * after removing it from the memcg_slab_caches list, otherwise we can
+ * fail to convert memcg_params_to_cache() while traversing the list.
+ */
+ VM_BUG_ON(root->memcg_params->memcg_caches[id] != s);
+ root->memcg_params->memcg_caches[id] = NULL;
}
/*
@@ -3311,11 +3283,9 @@ static void kmem_cache_destroy_work_func(struct work_struct *w)
* So if we aren't down to zero, we'll just schedule a worker and try
* again
*/
- if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
+ if (atomic_read(&cachep->memcg_params->nr_pages) != 0)
kmem_cache_shrink(cachep);
- if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
- return;
- } else
+ else
kmem_cache_destroy(cachep);
}
@@ -3351,99 +3321,10 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
schedule_work(&cachep->memcg_params->destroy);
}
-/*
- * This lock protects updaters, not readers. We want readers to be as fast as
- * they can, and they will either see NULL or a valid cache value. Our model
- * allow them to see NULL, in which case the root memcg will be selected.
- *
- * We need this lock because multiple allocations to the same cache from a non
- * will span more than one worker. Only one of them can create the cache.
- */
-static DEFINE_MUTEX(memcg_cache_mutex);
-
-/*
- * Called with memcg_cache_mutex held
- */
-static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
- struct kmem_cache *s)
-{
- struct kmem_cache *new;
- static char *tmp_name = NULL;
-
- lockdep_assert_held(&memcg_cache_mutex);
-
- /*
- * kmem_cache_create_memcg duplicates the given name and
- * cgroup_name for this name requires RCU context.
- * This static temporary buffer is used to prevent from
- * pointless shortliving allocation.
- */
- if (!tmp_name) {
- tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!tmp_name)
- return NULL;
- }
-
- rcu_read_lock();
- snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
- memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
- rcu_read_unlock();
-
- new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
- (s->flags & ~SLAB_PANIC), s->ctor, s);
-
- if (new)
- new->allocflags |= __GFP_KMEMCG;
-
- return new;
-}
-
-static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- struct kmem_cache *new_cachep;
- int idx;
-
- BUG_ON(!memcg_can_account_kmem(memcg));
-
- idx = memcg_cache_id(memcg);
-
- mutex_lock(&memcg_cache_mutex);
- new_cachep = cache_from_memcg_idx(cachep, idx);
- if (new_cachep) {
- css_put(&memcg->css);
- goto out;
- }
-
- new_cachep = kmem_cache_dup(memcg, cachep);
- if (new_cachep == NULL) {
- new_cachep = cachep;
- css_put(&memcg->css);
- goto out;
- }
-
- atomic_set(&new_cachep->memcg_params->nr_pages , 0);
-
- cachep->memcg_params->memcg_caches[idx] = new_cachep;
- /*
- * the readers won't lock, make sure everybody sees the updated value,
- * so they won't put stuff in the queue again for no reason
- */
- wmb();
-out:
- mutex_unlock(&memcg_cache_mutex);
- return new_cachep;
-}
-
-void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
+int __kmem_cache_destroy_memcg_children(struct kmem_cache *s)
{
struct kmem_cache *c;
- int i;
-
- if (!s->memcg_params)
- return;
- if (!s->memcg_params->is_root_cache)
- return;
+ int i, failed = 0;
/*
* If the cache is being destroyed, we trust that there is no one else
@@ -3452,9 +3333,10 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
*
* Still, we don't want anyone else freeing memcg_caches under our
* noses, which can happen if a new memcg comes to life. As usual,
- * we'll take the set_limit_mutex to protect ourselves against this.
+ * we'll take the activate_kmem_mutex to protect ourselves against
+ * this.
*/
- mutex_lock(&set_limit_mutex);
+ mutex_lock(&activate_kmem_mutex);
for_each_memcg_cache_index(i) {
c = cache_from_memcg_idx(s, i);
if (!c)
@@ -3476,16 +3358,14 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
c->memcg_params->dead = false;
cancel_work_sync(&c->memcg_params->destroy);
kmem_cache_destroy(c);
+
+ if (cache_from_memcg_idx(s, i))
+ failed++;
}
- mutex_unlock(&set_limit_mutex);
+ mutex_unlock(&activate_kmem_mutex);
+ return failed;
}
-struct create_work {
- struct mem_cgroup *memcg;
- struct kmem_cache *cachep;
- struct work_struct work;
-};
-
static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
{
struct kmem_cache *cachep;
@@ -3503,12 +3383,20 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
mutex_unlock(&memcg->slab_caches_mutex);
}
+struct create_work {
+ struct mem_cgroup *memcg;
+ struct kmem_cache *cachep;
+ struct work_struct work;
+};
+
static void memcg_create_cache_work_func(struct work_struct *w)
{
- struct create_work *cw;
+ struct create_work *cw = container_of(w, struct create_work, work);
+ struct mem_cgroup *memcg = cw->memcg;
+ struct kmem_cache *cachep = cw->cachep;
- cw = container_of(w, struct create_work, work);
- memcg_create_kmem_cache(cw->memcg, cw->cachep);
+ kmem_cache_create_memcg(memcg, cachep);
+ css_put(&memcg->css);
kfree(cw);
}
@@ -3568,7 +3456,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
gfp_t gfp)
{
struct mem_cgroup *memcg;
- int idx;
+ struct kmem_cache *memcg_cachep;
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
@@ -3582,15 +3470,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
if (!memcg_can_account_kmem(memcg))
goto out;
- idx = memcg_cache_id(memcg);
-
- /*
- * barrier to mare sure we're always seeing the up to date value. The
- * code updating memcg_caches will issue a write barrier to match this.
- */
- read_barrier_depends();
- if (likely(cache_from_memcg_idx(cachep, idx))) {
- cachep = cache_from_memcg_idx(cachep, idx);
+ memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+ if (likely(memcg_cachep)) {
+ cachep = memcg_cachep;
goto out;
}
@@ -3673,15 +3555,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
if (!current->mm || current->memcg_kmem_skip_account)
return true;
- memcg = try_get_mem_cgroup_from_mm(current->mm);
-
- /*
- * very rare case described in mem_cgroup_from_task. Unfortunately there
- * isn't much we can do without complicating this too much, and it would
- * be gfp-dependent anyway. Just let it go
- */
- if (unlikely(!memcg))
- return true;
+ memcg = get_mem_cgroup_from_mm(current->mm);
if (!memcg_can_account_kmem(memcg)) {
css_put(&memcg->css);
@@ -3744,7 +3618,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
if (!memcg)
return;
- VM_BUG_ON(mem_cgroup_is_root(memcg));
+ VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
}
#else
@@ -3784,19 +3658,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static inline
-void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
- struct mem_cgroup *to,
- unsigned int nr_pages,
- enum mem_cgroup_stat_index idx)
-{
- /* Update stat data for mem_cgroup */
- preempt_disable();
- __this_cpu_sub(from->stat->count[idx], nr_pages);
- __this_cpu_add(to->stat->count[idx], nr_pages);
- preempt_enable();
-}
-
/**
* mem_cgroup_move_account - move account of the page
* @page: the page
@@ -3823,7 +3684,7 @@ static int mem_cgroup_move_account(struct page *page,
bool anon = PageAnon(page);
VM_BUG_ON(from == to);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
/*
* The page is isolated from LRU. So, collapse function
* will not handle this page. But page splitting can happen.
@@ -3842,13 +3703,19 @@ static int mem_cgroup_move_account(struct page *page,
move_lock_mem_cgroup(from, &flags);
- if (!anon && page_mapped(page))
- mem_cgroup_move_account_page_stat(from, to, nr_pages,
- MEM_CGROUP_STAT_FILE_MAPPED);
+ if (!anon && page_mapped(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ }
- if (PageWriteback(page))
- mem_cgroup_move_account_page_stat(from, to, nr_pages,
- MEM_CGROUP_STAT_WRITEBACK);
+ if (PageWriteback(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ }
mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
@@ -3916,7 +3783,7 @@ static int mem_cgroup_move_parent(struct page *page,
parent = root_mem_cgroup;
if (nr_pages > 1) {
- VM_BUG_ON(!PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
flags = compound_lock_irqsave(page);
}
@@ -3934,23 +3801,23 @@ out:
return ret;
}
-/*
- * Charge the memory controller for page usage.
- * Return
- * 0 if the charge was successful
- * < 0 if the cgroup is over its limit
- */
-static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, enum charge_type ctype)
+int mem_cgroup_charge_anon(struct page *page,
+ struct mm_struct *mm, gfp_t gfp_mask)
{
- struct mem_cgroup *memcg = NULL;
unsigned int nr_pages = 1;
+ struct mem_cgroup *memcg;
bool oom = true;
- int ret;
+
+ if (mem_cgroup_disabled())
+ return 0;
+
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+ VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
+ VM_BUG_ON(!mm);
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
- VM_BUG_ON(!PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/*
* Never OOM-kill a process for a huge page. The
* fault handler will fall back to regular pages.
@@ -3958,25 +3825,14 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
oom = false;
}
- ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
- if (ret == -ENOMEM)
- return ret;
- __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
+ memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
+ if (!memcg)
+ return -ENOMEM;
+ __mem_cgroup_commit_charge(memcg, page, nr_pages,
+ MEM_CGROUP_CHARGE_TYPE_ANON, false);
return 0;
}
-int mem_cgroup_newpage_charge(struct page *page,
- struct mm_struct *mm, gfp_t gfp_mask)
-{
- if (mem_cgroup_disabled())
- return 0;
- VM_BUG_ON(page_mapped(page));
- VM_BUG_ON(page->mapping && !PageAnon(page));
- VM_BUG_ON(!mm);
- return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_ANON);
-}
-
/*
* While swap-in, try_charge -> commit or cancel, the page is locked.
* And when try_charge() successfully returns, one refcnt to memcg without
@@ -3988,7 +3844,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
gfp_t mask,
struct mem_cgroup **memcgp)
{
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg = NULL;
struct page_cgroup *pc;
int ret;
@@ -4001,31 +3857,29 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
* in turn serializes uncharging.
*/
if (PageCgroupUsed(pc))
- return 0;
- if (!do_swap_account)
- goto charge_cur_mm;
- memcg = try_get_mem_cgroup_from_page(page);
+ goto out;
+ if (do_swap_account)
+ memcg = try_get_mem_cgroup_from_page(page);
if (!memcg)
- goto charge_cur_mm;
- *memcgp = memcg;
- ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
+ memcg = get_mem_cgroup_from_mm(mm);
+ ret = mem_cgroup_try_charge(memcg, mask, 1, true);
css_put(&memcg->css);
if (ret == -EINTR)
- ret = 0;
- return ret;
-charge_cur_mm:
- ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
- if (ret == -EINTR)
- ret = 0;
- return ret;
+ memcg = root_mem_cgroup;
+ else if (ret)
+ return ret;
+out:
+ *memcgp = memcg;
+ return 0;
}
int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
gfp_t gfp_mask, struct mem_cgroup **memcgp)
{
- *memcgp = NULL;
- if (mem_cgroup_disabled())
+ if (mem_cgroup_disabled()) {
+ *memcgp = NULL;
return 0;
+ }
/*
* A racing thread's fault, or swapoff, may have already
* updated the pte, and even removed page from swap cache: in
@@ -4033,12 +3887,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
* there's also a KSM case which does need to charge the page.
*/
if (!PageSwapCache(page)) {
- int ret;
+ struct mem_cgroup *memcg;
- ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
- if (ret == -EINTR)
- ret = 0;
- return ret;
+ memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
+ if (!memcg)
+ return -ENOMEM;
+ *memcgp = memcg;
+ return 0;
}
return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
}
@@ -4082,11 +3937,11 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
MEM_CGROUP_CHARGE_TYPE_ANON);
}
-int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
+int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
- struct mem_cgroup *memcg = NULL;
enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
+ struct mem_cgroup *memcg;
int ret;
if (mem_cgroup_disabled())
@@ -4094,15 +3949,28 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
if (PageCompound(page))
return 0;
- if (!PageSwapCache(page))
- ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
- else { /* page is swapcache/shmem */
+ if (PageSwapCache(page)) { /* shmem */
ret = __mem_cgroup_try_charge_swapin(mm, page,
gfp_mask, &memcg);
- if (!ret)
- __mem_cgroup_commit_charge_swapin(page, memcg, type);
+ if (ret)
+ return ret;
+ __mem_cgroup_commit_charge_swapin(page, memcg, type);
+ return 0;
}
- return ret;
+
+ /*
+ * Page cache insertions can happen without an actual mm
+ * context, e.g. during disk probing on boot.
+ */
+ if (unlikely(!mm))
+ memcg = root_mem_cgroup;
+ else {
+ memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
+ if (!memcg)
+ return -ENOMEM;
+ }
+ __mem_cgroup_commit_charge(memcg, page, 1, type, false);
+ return 0;
}
static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
@@ -4175,7 +4043,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
- VM_BUG_ON(!PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
}
/*
* Check if our page_cgroup is valid
@@ -4267,7 +4135,7 @@ void mem_cgroup_uncharge_page(struct page *page)
/* early check. */
if (page_mapped(page))
return;
- VM_BUG_ON(page->mapping && !PageAnon(page));
+ VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
/*
* If the page is in swap cache, uncharge should be deferred
* to the swap path, which also properly accounts swap usage
@@ -4287,8 +4155,8 @@ void mem_cgroup_uncharge_page(struct page *page)
void mem_cgroup_uncharge_cache_page(struct page *page)
{
- VM_BUG_ON(page_mapped(page));
- VM_BUG_ON(page->mapping);
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+ VM_BUG_ON_PAGE(page->mapping, page);
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
}
@@ -4994,7 +4862,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
struct cgroup *cgrp = memcg->css.cgroup;
/* returns EBUSY if there is a task or if we come here twice. */
- if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
+ if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children))
return -EBUSY;
/* we call try-to-free pages for make this cgroup empty */
@@ -5112,14 +4980,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
return val << PAGE_SHIFT;
}
-static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct file *file,
- char __user *buf, size_t nbytes, loff_t *ppos)
+static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- char str[64];
u64 val;
- int name, len;
+ int name;
enum res_type type;
type = MEMFILE_TYPE(cft->private);
@@ -5145,15 +5011,26 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
BUG();
}
- len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
- return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+ return val;
}
-static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
-{
- int ret = -EINVAL;
#ifdef CONFIG_MEMCG_KMEM
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+/* should be called with activate_kmem_mutex held */
+static int __memcg_activate_kmem(struct mem_cgroup *memcg,
+ unsigned long long limit)
+{
+ int err = 0;
+ int memcg_id;
+
+ if (memcg_kmem_is_active(memcg))
+ return 0;
+
+ /*
+ * We are going to allocate memory for data shared by all memory
+ * cgroups so let's stop accounting here.
+ */
+ memcg_stop_kmem_account();
+
/*
* For simplicity, we won't allow this to be disabled. It also can't
* be changed if the cgroup has children already, or if tasks had
@@ -5167,72 +5044,101 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
* of course permitted.
*/
mutex_lock(&memcg_create_mutex);
- mutex_lock(&set_limit_mutex);
- if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
- if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
- ret = -EBUSY;
- goto out;
- }
- ret = res_counter_set_limit(&memcg->kmem, val);
- VM_BUG_ON(ret);
+ if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg))
+ err = -EBUSY;
+ mutex_unlock(&memcg_create_mutex);
+ if (err)
+ goto out;
- ret = memcg_update_cache_sizes(memcg);
- if (ret) {
- res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
- goto out;
- }
- static_key_slow_inc(&memcg_kmem_enabled_key);
- /*
- * setting the active bit after the inc will guarantee no one
- * starts accounting before all call sites are patched
- */
- memcg_kmem_set_active(memcg);
- } else
- ret = res_counter_set_limit(&memcg->kmem, val);
+ memcg_id = ida_simple_get(&kmem_limited_groups,
+ 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+ if (memcg_id < 0) {
+ err = memcg_id;
+ goto out;
+ }
+
+ /*
+ * Make sure we have enough space for this cgroup in each root cache's
+ * memcg_params.
+ */
+ err = memcg_update_all_caches(memcg_id + 1);
+ if (err)
+ goto out_rmid;
+
+ memcg->kmemcg_id = memcg_id;
+ INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+ mutex_init(&memcg->slab_caches_mutex);
+
+ /*
+ * We couldn't have accounted to this cgroup, because it hasn't got the
+ * active bit set yet, so this should succeed.
+ */
+ err = res_counter_set_limit(&memcg->kmem, limit);
+ VM_BUG_ON(err);
+
+ static_key_slow_inc(&memcg_kmem_enabled_key);
+ /*
+ * Setting the active bit after enabling static branching will
+ * guarantee no one starts accounting before all call sites are
+ * patched.
+ */
+ memcg_kmem_set_active(memcg);
out:
- mutex_unlock(&set_limit_mutex);
- mutex_unlock(&memcg_create_mutex);
-#endif
+ memcg_resume_kmem_account();
+ return err;
+
+out_rmid:
+ ida_simple_remove(&kmem_limited_groups, memcg_id);
+ goto out;
+}
+
+static int memcg_activate_kmem(struct mem_cgroup *memcg,
+ unsigned long long limit)
+{
+ int ret;
+
+ mutex_lock(&activate_kmem_mutex);
+ ret = __memcg_activate_kmem(memcg, limit);
+ mutex_unlock(&activate_kmem_mutex);
+ return ret;
+}
+
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+ unsigned long long val)
+{
+ int ret;
+
+ if (!memcg_kmem_is_active(memcg))
+ ret = memcg_activate_kmem(memcg, val);
+ else
+ ret = res_counter_set_limit(&memcg->kmem, val);
return ret;
}
-#ifdef CONFIG_MEMCG_KMEM
static int memcg_propagate_kmem(struct mem_cgroup *memcg)
{
int ret = 0;
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- if (!parent)
- goto out;
- memcg->kmem_account_flags = parent->kmem_account_flags;
- /*
- * When that happen, we need to disable the static branch only on those
- * memcgs that enabled it. To achieve this, we would be forced to
- * complicate the code by keeping track of which memcgs were the ones
- * that actually enabled limits, and which ones got it from its
- * parents.
- *
- * It is a lot simpler just to do static_key_slow_inc() on every child
- * that is accounted.
- */
- if (!memcg_kmem_is_active(memcg))
- goto out;
+ if (!parent)
+ return 0;
+ mutex_lock(&activate_kmem_mutex);
/*
- * __mem_cgroup_free() will issue static_key_slow_dec() because this
- * memcg is active already. If the later initialization fails then the
- * cgroup core triggers the cleanup so we do not have to do it here.
+ * If the parent cgroup is not kmem-active now, it cannot be activated
+ * after this point, because it has at least one child already.
*/
- static_key_slow_inc(&memcg_kmem_enabled_key);
-
- mutex_lock(&set_limit_mutex);
- memcg_stop_kmem_account();
- ret = memcg_update_cache_sizes(memcg);
- memcg_resume_kmem_account();
- mutex_unlock(&set_limit_mutex);
-out:
+ if (memcg_kmem_is_active(parent))
+ ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
+ mutex_unlock(&activate_kmem_mutex);
return ret;
}
+#else
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+ unsigned long long val)
+{
+ return -EINVAL;
+}
#endif /* CONFIG_MEMCG_KMEM */
/*
@@ -5240,7 +5146,7 @@ out:
* RES_LIMIT.
*/
static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
- const char *buffer)
+ char *buffer)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
enum res_type type;
@@ -5266,7 +5172,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
else if (type == _MEMSWAP)
ret = mem_cgroup_resize_memsw_limit(memcg, val);
else if (type == _KMEM)
- ret = memcg_update_kmem_limit(css, val);
+ ret = memcg_update_kmem_limit(memcg, val);
else
return -EINVAL;
break;
@@ -5383,8 +5289,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
#endif
#ifdef CONFIG_NUMA
-static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *m)
+static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
struct numa_stat {
const char *name;
@@ -5400,7 +5305,7 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
const struct numa_stat *stat;
int nid;
unsigned long nr;
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
@@ -5439,10 +5344,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
}
-static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft,
- struct seq_file *m)
+static int memcg_stat_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
struct mem_cgroup *mi;
unsigned int i;
@@ -5651,13 +5555,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
mem_cgroup_oom_notify_cb(iter);
}
-static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
- enum res_type type = MEMFILE_TYPE(cft->private);
u64 threshold, usage;
int i, size, ret;
@@ -5734,13 +5636,23 @@ unlock:
return ret;
}
-static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd)
+static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
+}
+
+static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
+}
+
+static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, enum res_type type)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
- enum res_type type = MEMFILE_TYPE(cft->private);
u64 usage;
int i, j, size;
@@ -5813,14 +5725,23 @@ unlock:
mutex_unlock(&memcg->thresholds_lock);
}
-static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
+}
+
+static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
+}
+
+static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_eventfd_list *event;
- enum res_type type = MEMFILE_TYPE(cft->private);
- BUG_ON(type != _OOM_TYPE);
event = kmalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
@@ -5838,14 +5759,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
return 0;
}
-static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd)
+static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_eventfd_list *ev, *tmp;
- enum res_type type = MEMFILE_TYPE(cft->private);
-
- BUG_ON(type != _OOM_TYPE);
spin_lock(&memcg_oom_lock);
@@ -5859,17 +5776,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
spin_unlock(&memcg_oom_lock);
}
-static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct cgroup_map_cb *cb)
+static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
- cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
-
- if (atomic_read(&memcg->under_oom))
- cb->fill(cb, "under_oom", 1);
- else
- cb->fill(cb, "under_oom", 0);
+ seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
+ seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
return 0;
}
@@ -5962,41 +5874,259 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
}
#endif
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * "cgroup.event_control" implementation.
+ *
+ * This is way over-engineered. It tries to support fully configurable
+ * events for each user. Such level of flexibility is completely
+ * unnecessary especially in the light of the planned unified hierarchy.
+ *
+ * Please deprecate this and replace with something simpler if at all
+ * possible.
+ */
+
+/*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void memcg_event_remove(struct work_struct *work)
+{
+ struct mem_cgroup_event *event =
+ container_of(work, struct mem_cgroup_event, remove);
+ struct mem_cgroup *memcg = event->memcg;
+
+ remove_wait_queue(event->wqh, &event->wait);
+
+ event->unregister_event(memcg, event->eventfd);
+
+ /* Notify userspace the event is going away. */
+ eventfd_signal(event->eventfd, 1);
+
+ eventfd_ctx_put(event->eventfd);
+ kfree(event);
+ css_put(&memcg->css);
+}
+
+/*
+ * Gets called on POLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
+ int sync, void *key)
+{
+ struct mem_cgroup_event *event =
+ container_of(wait, struct mem_cgroup_event, wait);
+ struct mem_cgroup *memcg = event->memcg;
+ unsigned long flags = (unsigned long)key;
+
+ if (flags & POLLHUP) {
+ /*
+ * If the event has been detached at cgroup removal, we
+ * can simply return knowing the other side will cleanup
+ * for us.
+ *
+ * We can't race against event freeing since the other
+ * side will require wqh->lock via remove_wait_queue(),
+ * which we hold.
+ */
+ spin_lock(&memcg->event_list_lock);
+ if (!list_empty(&event->list)) {
+ list_del_init(&event->list);
+ /*
+ * We are in atomic context, but cgroup_event_remove()
+ * may sleep, so we have to call it in workqueue.
+ */
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&memcg->event_list_lock);
+ }
+
+ return 0;
+}
+
+static void memcg_event_ptable_queue_proc(struct file *file,
+ wait_queue_head_t *wqh, poll_table *pt)
+{
+ struct mem_cgroup_event *event =
+ container_of(pt, struct mem_cgroup_event, pt);
+
+ event->wqh = wqh;
+ add_wait_queue(wqh, &event->wait);
+}
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+static int memcg_write_event_control(struct cgroup_subsys_state *css,
+ struct cftype *cft, char *buffer)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup_event *event;
+ struct cgroup_subsys_state *cfile_css;
+ unsigned int efd, cfd;
+ struct fd efile;
+ struct fd cfile;
+ const char *name;
+ char *endp;
+ int ret;
+
+ efd = simple_strtoul(buffer, &endp, 10);
+ if (*endp != ' ')
+ return -EINVAL;
+ buffer = endp + 1;
+
+ cfd = simple_strtoul(buffer, &endp, 10);
+ if ((*endp != ' ') && (*endp != '\0'))
+ return -EINVAL;
+ buffer = endp + 1;
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return -ENOMEM;
+
+ event->memcg = memcg;
+ INIT_LIST_HEAD(&event->list);
+ init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
+ init_waitqueue_func_entry(&event->wait, memcg_event_wake);
+ INIT_WORK(&event->remove, memcg_event_remove);
+
+ efile = fdget(efd);
+ if (!efile.file) {
+ ret = -EBADF;
+ goto out_kfree;
+ }
+
+ event->eventfd = eventfd_ctx_fileget(efile.file);
+ if (IS_ERR(event->eventfd)) {
+ ret = PTR_ERR(event->eventfd);
+ goto out_put_efile;
+ }
+
+ cfile = fdget(cfd);
+ if (!cfile.file) {
+ ret = -EBADF;
+ goto out_put_eventfd;
+ }
+
+ /* the process need read permission on control file */
+ /* AV: shouldn't we check that it's been opened for read instead? */
+ ret = inode_permission(file_inode(cfile.file), MAY_READ);
+ if (ret < 0)
+ goto out_put_cfile;
+
+ /*
+ * Determine the event callbacks and set them in @event. This used
+ * to be done via struct cftype but cgroup core no longer knows
+ * about these events. The following is crude but the whole thing
+ * is for compatibility anyway.
+ *
+ * DO NOT ADD NEW FILES.
+ */
+ name = cfile.file->f_dentry->d_name.name;
+
+ if (!strcmp(name, "memory.usage_in_bytes")) {
+ event->register_event = mem_cgroup_usage_register_event;
+ event->unregister_event = mem_cgroup_usage_unregister_event;
+ } else if (!strcmp(name, "memory.oom_control")) {
+ event->register_event = mem_cgroup_oom_register_event;
+ event->unregister_event = mem_cgroup_oom_unregister_event;
+ } else if (!strcmp(name, "memory.pressure_level")) {
+ event->register_event = vmpressure_register_event;
+ event->unregister_event = vmpressure_unregister_event;
+ } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
+ event->register_event = memsw_cgroup_usage_register_event;
+ event->unregister_event = memsw_cgroup_usage_unregister_event;
+ } else {
+ ret = -EINVAL;
+ goto out_put_cfile;
+ }
+
+ /*
+ * Verify @cfile should belong to @css. Also, remaining events are
+ * automatically removed on cgroup destruction but the removal is
+ * asynchronous, so take an extra ref on @css.
+ */
+ cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent,
+ &memory_cgrp_subsys);
+ ret = -EINVAL;
+ if (IS_ERR(cfile_css))
+ goto out_put_cfile;
+ if (cfile_css != css) {
+ css_put(cfile_css);
+ goto out_put_cfile;
+ }
+
+ ret = event->register_event(memcg, event->eventfd, buffer);
+ if (ret)
+ goto out_put_css;
+
+ efile.file->f_op->poll(efile.file, &event->pt);
+
+ spin_lock(&memcg->event_list_lock);
+ list_add(&event->list, &memcg->event_list);
+ spin_unlock(&memcg->event_list_lock);
+
+ fdput(cfile);
+ fdput(efile);
+
+ return 0;
+
+out_put_css:
+ css_put(css);
+out_put_cfile:
+ fdput(cfile);
+out_put_eventfd:
+ eventfd_ctx_put(event->eventfd);
+out_put_efile:
+ fdput(efile);
+out_kfree:
+ kfree(event);
+
+ return ret;
+}
+
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
- .read = mem_cgroup_read,
- .register_event = mem_cgroup_usage_register_event,
- .unregister_event = mem_cgroup_usage_unregister_event,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
.write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "soft_limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
.write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "failcnt",
.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "stat",
- .read_seq_string = memcg_stat_show,
+ .seq_show = memcg_stat_show,
},
{
.name = "force_empty",
@@ -6009,6 +6139,12 @@ static struct cftype mem_cgroup_files[] = {
.read_u64 = mem_cgroup_hierarchy_read,
},
{
+ .name = "cgroup.event_control", /* XXX: for compat */
+ .write_string = memcg_write_event_control,
+ .flags = CFTYPE_NO_PREFIX,
+ .mode = S_IWUGO,
+ },
+ {
.name = "swappiness",
.read_u64 = mem_cgroup_swappiness_read,
.write_u64 = mem_cgroup_swappiness_write,
@@ -6020,21 +6156,17 @@ static struct cftype mem_cgroup_files[] = {
},
{
.name = "oom_control",
- .read_map = mem_cgroup_oom_control_read,
+ .seq_show = mem_cgroup_oom_control_read,
.write_u64 = mem_cgroup_oom_control_write,
- .register_event = mem_cgroup_oom_register_event,
- .unregister_event = mem_cgroup_oom_unregister_event,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
{
.name = "pressure_level",
- .register_event = vmpressure_register_event,
- .unregister_event = vmpressure_unregister_event,
},
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
- .read_seq_string = memcg_numa_stat_show,
+ .seq_show = memcg_numa_stat_show,
},
#endif
#ifdef CONFIG_MEMCG_KMEM
@@ -6042,29 +6174,29 @@ static struct cftype mem_cgroup_files[] = {
.name = "kmem.limit_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
.write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.failcnt",
.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
#ifdef CONFIG_SLABINFO
{
.name = "kmem.slabinfo",
- .read_seq_string = mem_cgroup_slabinfo_read,
+ .seq_show = mem_cgroup_slabinfo_read,
},
#endif
#endif
@@ -6076,27 +6208,25 @@ static struct cftype memsw_cgroup_files[] = {
{
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
- .read = mem_cgroup_read,
- .register_event = mem_cgroup_usage_register_event,
- .unregister_event = mem_cgroup_usage_unregister_event,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
.write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.failcnt",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{ }, /* terminate */
};
@@ -6139,14 +6269,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- size_t size = memcg_size();
+ size_t size;
- /* Can be very big if nr_node_ids is very big */
- if (size < PAGE_SIZE)
- memcg = kzalloc(size, GFP_KERNEL);
- else
- memcg = vzalloc(size);
+ size = sizeof(struct mem_cgroup);
+ size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
+ memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
return NULL;
@@ -6157,10 +6285,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
return memcg;
out_free:
- if (size < PAGE_SIZE)
- kfree(memcg);
- else
- vfree(memcg);
+ kfree(memcg);
return NULL;
}
@@ -6178,7 +6303,6 @@ out_free:
static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- size_t size = memcg_size();
mem_cgroup_remove_from_trees(memcg);
@@ -6199,10 +6323,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
* the cgroup_lock.
*/
disarm_static_keys(memcg);
- if (size < PAGE_SIZE)
- kfree(memcg);
- else
- vfree(memcg);
+ kfree(memcg);
}
/*
@@ -6268,6 +6389,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
+ INIT_LIST_HEAD(&memcg->event_list);
+ spin_lock_init(&memcg->event_list_lock);
return &memcg->css;
@@ -6281,7 +6404,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
- int error = 0;
if (css->cgroup->id > MEM_CGROUP_ID_MAX)
return -ENOSPC;
@@ -6314,12 +6436,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
* unfortunate state in our controller.
*/
if (parent != root_mem_cgroup)
- mem_cgroup_subsys.broken_hierarchy = true;
+ memory_cgrp_subsys.broken_hierarchy = true;
}
-
- error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
mutex_unlock(&memcg_create_mutex);
- return error;
+
+ return memcg_init_kmem(memcg, &memory_cgrp_subsys);
}
/*
@@ -6343,11 +6464,32 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup_event *event, *tmp;
+ struct cgroup_subsys_state *iter;
+
+ /*
+ * Unregister events and notify userspace.
+ * Notify userspace about cgroup removing only after rmdir of cgroup
+ * directory to avoid race between userspace and kernelspace.
+ */
+ spin_lock(&memcg->event_list_lock);
+ list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
+ list_del_init(&event->list);
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&memcg->event_list_lock);
kmem_cgroup_css_offline(memcg);
mem_cgroup_invalidate_reclaim_iterators(memcg);
- mem_cgroup_reparent_charges(memcg);
+
+ /*
+ * This requires that offlining is serialized. Right now that is
+ * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
+ */
+ css_for_each_descendant_post(iter, css)
+ mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
+
mem_cgroup_destroy_all_caches(memcg);
vmpressure_cleanup(&memcg->vmpressure);
}
@@ -6440,8 +6582,7 @@ one_by_one:
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
- ret = __mem_cgroup_try_charge(NULL,
- GFP_KERNEL, 1, &memcg, false);
+ ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
if (ret)
/* mem_cgroup_clear_mc() will do uncharge later */
return ret;
@@ -6615,7 +6756,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
enum mc_target_type ret = MC_TARGET_NONE;
page = pmd_page(pmd);
- VM_BUG_ON(!page || !PageHead(page));
+ VM_BUG_ON_PAGE(!page || !PageHead(page), page);
if (!move_anon())
return ret;
pc = lookup_page_cgroup(page);
@@ -7000,9 +7141,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
mem_cgroup_from_css(root_css)->use_hierarchy = true;
}
-struct cgroup_subsys mem_cgroup_subsys = {
- .name = "memory",
- .subsys_id = mem_cgroup_subsys_id,
+struct cgroup_subsys memory_cgrp_subsys = {
.css_alloc = mem_cgroup_css_alloc,
.css_online = mem_cgroup_css_online,
.css_offline = mem_cgroup_css_offline,
@@ -7028,7 +7167,7 @@ __setup("swapaccount=", enable_swap_account);
static void __init memsw_file_init(void)
{
- WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
+ WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
}
static void __init enable_swap_cgroup(void)