diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt index 9fe2d0eabe05..05fe29ab1e58 100644 --- a/Documentation/controllers/memory.txt +++ b/Documentation/controllers/memory.txt @@ -137,12 +137,32 @@ behind this approach is that a cgroup that aggressively uses a shared page will eventually get charged for it (once it is uncharged from the cgroup that brought it in -- this will happen on memory pressure). -Exception: When you do swapoff and make swapped-out pages of shmem(tmpfs) to +Exception: If CONFIG_CGROUP_MEM_RES_CTLR_SWAP is not used. +When you do swapoff and make swapped-out pages of shmem(tmpfs) to be backed into memory in force, charges for pages are accounted against the caller of swapoff rather than the users of shmem. -2.4 Reclaim +2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) +The Swap Extension allows you to record charges for swap. A swapped-in page +is charged back to the original cgroup if possible. + +When swap is accounted, the following files are added. + - memory.memsw.usage_in_bytes + - memory.memsw.limit_in_bytes + +The usage of mem+swap is limited by memsw.limit_in_bytes. + +Note: why 'mem+swap' rather than just 'swap'? +The global LRU (kswapd) can swap out arbitrary pages. Swapping a page out +moves its account from memory to swap; there is no change in the usage of +mem+swap. + +In other words, when we want to limit the usage of swap without affecting +the global LRU, a mem+swap limit is better than just limiting swap, from an +OS point of view. + +2.5 Reclaim Each cgroup maintains a per cgroup LRU that consists of an active and inactive list. When a cgroup goes over its limit, we first try @@ -246,6 +266,11 @@ Such charges are freed(at default) or moved to its parent. When moved, both of RSS and CACHES are moved to parent. If both of them are busy, rmdir() returns -EBUSY. See 5.1 Also. +Charges recorded in swap information are not updated at removal of a cgroup. +The recorded information is discarded, and a cgroup which then uses the swap +(swapcache) will be charged as its new owner. + + 5. Misc. interfaces.
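As a quick illustration of the new interface, here is a minimal userspace sketch. It assumes the memory controller is mounted at the hypothetical path /cgroups/A and that the kernel was built with CONFIG_CGROUP_MEM_RES_CTLR_SWAP with swap accounting enabled; it caps mem+swap at 32M and reads the usage back:

#include <stdio.h>

/* Hypothetical mount point; adjust to wherever the memory controller is mounted. */
#define CG "/cgroups/A"

int main(void)
{
	FILE *f;
	unsigned long long usage;

	/* The mem+swap limit may not be set below memory.limit_in_bytes. */
	f = fopen(CG "/memory.memsw.limit_in_bytes", "w");
	if (!f) { perror("fopen"); return 1; }
	fprintf(f, "32M\n");	/* memparse-style K/M/G suffixes are accepted */
	fclose(f);

	f = fopen(CG "/memory.memsw.usage_in_bytes", "r");
	if (!f) { perror("fopen"); return 1; }
	if (fscanf(f, "%llu", &usage) == 1)
		printf("mem+swap usage: %llu bytes\n", usage);
	fclose(f);
	return 0;
}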
5.1 force_empty diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 41b46cc9d1f1..ca51ac72d6c0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -32,6 +32,8 @@ extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, /* for swap handling */ extern int mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **ptr); +extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm, + struct page *page, gfp_t mask, struct mem_cgroup **ptr); extern void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr); extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr); @@ -80,7 +82,6 @@ extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; #endif - #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct mem_cgroup; @@ -97,7 +98,13 @@ static inline int mem_cgroup_cache_charge(struct page *page, } static inline int mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **ptr) + gfp_t gfp_mask, struct mem_cgroup **ptr) +{ + return 0; +} + +static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm, + struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr) { return 0; } diff --git a/include/linux/swap.h b/include/linux/swap.h index f8f3907533f0..be938ce4895a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -214,7 +214,7 @@ static inline void lru_cache_add_active_file(struct page *page) extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask); extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, - gfp_t gfp_mask); + gfp_t gfp_mask, bool noswap); extern int __isolate_lru_page(struct page *page, int mode, int file); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; @@ -336,7 +336,7 @@ static inline void disable_swap_token(void) #ifdef CONFIG_CGROUP_MEM_RES_CTLR extern int mem_cgroup_cache_charge_swapin(struct page *page, struct mm_struct *mm, gfp_t mask, bool locked); -extern void mem_cgroup_uncharge_swapcache(struct page *page); +extern void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent); #else static inline int mem_cgroup_cache_charge_swapin(struct page *page, @@ -344,7 +344,15 @@ int mem_cgroup_cache_charge_swapin(struct page *page, { return 0; } -static inline void mem_cgroup_uncharge_swapcache(struct page *page) +static inline void +mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) +{ +} +#endif +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +extern void mem_cgroup_uncharge_swap(swp_entry_t ent); +#else +static inline void mem_cgroup_uncharge_swap(swp_entry_t ent) { } #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 59dd8c116372..2efcf38f3b73 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -131,6 +132,10 @@ struct mem_cgroup { * the counter to account for memory usage */ struct res_counter res; + /* + * the counter to account for mem+swap usage. + */ + struct res_counter memsw; /* * Per cgroup active and inactive list, similar to the * per zone LRU lists. @@ -138,6 +143,8 @@ struct mem_cgroup { struct mem_cgroup_lru_info info; int prev_priority; /* for recording reclaim priority */ + int obsolete; + atomic_t refcnt; /* * statistics. This must be placed at the end of memcg. 
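The memsw counter introduced here always accounts a superset of res: a resident charged page is counted in both, while a page that has gone out to swap stays counted in memsw only. A standalone toy model of that discipline (toy_counter is a stand-in for the kernel's res_counter, not its real API), mirroring the charge order used in __mem_cgroup_try_charge below:

#include <stdbool.h>
#include <stdio.h>

struct toy_counter { long usage, limit; };

static bool toy_charge(struct toy_counter *c, long sz)
{
	if (c->usage + sz > c->limit)
		return false;
	c->usage += sz;
	return true;
}

/*
 * Mirrors __mem_cgroup_try_charge: charge res first, then memsw, and
 * roll res back when the mem+swap counter is the one that refuses.
 */
static bool toy_charge_page(struct toy_counter *res, struct toy_counter *memsw,
			    long size, bool *noswap)
{
	if (!toy_charge(res, size))
		return false;		/* over "mem": swapping pages out can help */
	if (!toy_charge(memsw, size)) {
		res->usage -= size;	/* roll back the res charge */
		*noswap = true;		/* over mem+swap: swap-out cannot help */
		return false;
	}
	return true;
}

int main(void)
{
	struct toy_counter res = { 0, 4096 }, memsw = { 0, 8192 };
	bool noswap = false;

	printf("charged=%d noswap=%d\n",
	       toy_charge_page(&res, &memsw, 4096, &noswap), noswap);
	return 0;
}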
*/ @@ -167,6 +174,17 @@ pcg_default_flags[NR_CHARGE_TYPE] = { 0, /* FORCE */ }; + +/* for encoding cft->private value on file */ +#define _MEM (0) +#define _MEMSWAP (1) +#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) +#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) +#define MEMFILE_ATTR(val) ((val) & 0xffff) + +static void mem_cgroup_get(struct mem_cgroup *mem); +static void mem_cgroup_put(struct mem_cgroup *mem); + /* * Always modified under lru lock. Then, not necessary to preempt_disable() */ @@ -485,7 +503,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, * oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) + gfp_t gfp_mask, struct mem_cgroup **memcg, + bool oom) { struct mem_cgroup *mem; int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; @@ -513,12 +532,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, css_get(&mem->css); } + while (1) { + int ret; + bool noswap = false; - while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { + ret = res_counter_charge(&mem->res, PAGE_SIZE); + if (likely(!ret)) { + if (!do_swap_account) + break; + ret = res_counter_charge(&mem->memsw, PAGE_SIZE); + if (likely(!ret)) + break; + /* mem+swap counter fails */ + res_counter_uncharge(&mem->res, PAGE_SIZE); + noswap = true; + } if (!(gfp_mask & __GFP_WAIT)) goto nomem; - if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) + if (try_to_free_mem_cgroup_pages(mem, gfp_mask, noswap)) continue; /* @@ -527,8 +559,13 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, * moved to swap cache or just unmapped from the cgroup. * Check the limit again to see if the reclaim reduced the * current usage of the cgroup before giving up + * */ - if (res_counter_check_under_limit(&mem->res)) + if (!do_swap_account && + res_counter_check_under_limit(&mem->res)) + continue; + if (do_swap_account && + res_counter_check_under_limit(&mem->memsw)) continue; if (!nr_retries--) { @@ -582,6 +619,8 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); css_put(&mem->css); return; } @@ -646,6 +685,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, __mem_cgroup_remove_list(from_mz, pc); css_put(&from->css); res_counter_uncharge(&from->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&from->memsw, PAGE_SIZE); pc->mem_cgroup = to; css_get(&to->css); __mem_cgroup_add_list(to_mz, pc, false); @@ -692,8 +733,11 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, /* drop extra refcnt */ css_put(&parent->css); /* uncharge if move fails */ - if (ret) + if (ret) { res_counter_uncharge(&parent->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&parent->memsw, PAGE_SIZE); + } return ret; } @@ -791,7 +835,42 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); } +int mem_cgroup_try_charge_swapin(struct mm_struct *mm, + struct page *page, + gfp_t mask, struct mem_cgroup **ptr) +{ + struct mem_cgroup *mem; + swp_entry_t ent; + + if (mem_cgroup_subsys.disabled) + return 0; + + if (!do_swap_account) + goto charge_cur_mm; + + /* + * A racing thread's fault, or swapoff, may have already updated + * the pte, and even removed page from swap cache: return success + * to go on to do_swap_page()'s pte_same() test, which should fail. 
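The MEMFILE_PRIVATE()/MEMFILE_TYPE()/MEMFILE_ATTR() macros pack a counter type and a res_counter attribute into the single cft->private integer handed to the cgroup file callbacks. A self-contained demo of the encoding (the macro bodies are copied from this patch; the RES_* values are illustrative stand-ins for the kernel's enum):

#include <stdio.h>

/* Copied from the patch. */
#define _MEM (0)
#define _MEMSWAP (1)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

/* Illustrative stand-ins for the kernel's res_counter member enum. */
enum { RES_USAGE, RES_MAX_USAGE, RES_LIMIT, RES_FAILCNT };

int main(void)
{
	int priv = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT);

	/* The type half selects mem->res vs. mem->memsw; the attr half selects the field. */
	printf("private=0x%x type=%d attr=%d\n",
	       priv, MEMFILE_TYPE(priv), MEMFILE_ATTR(priv));
	return 0;
}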
+ */ + if (!PageSwapCache(page)) + return 0; + + ent.val = page_private(page); + + mem = lookup_swap_cgroup(ent); + if (!mem || mem->obsolete) + goto charge_cur_mm; + *ptr = mem; + return __mem_cgroup_try_charge(NULL, mask, ptr, true); +charge_cur_mm: + if (unlikely(!mm)) + mm = &init_mm; + return __mem_cgroup_try_charge(mm, mask, ptr, true); +} + #ifdef CONFIG_SWAP + int mem_cgroup_cache_charge_swapin(struct page *page, struct mm_struct *mm, gfp_t mask, bool locked) { @@ -808,8 +887,28 @@ int mem_cgroup_cache_charge_swapin(struct page *page, * we reach here. */ if (PageSwapCache(page)) { + struct mem_cgroup *mem = NULL; + swp_entry_t ent; + + ent.val = page_private(page); + if (do_swap_account) { + mem = lookup_swap_cgroup(ent); + if (mem && mem->obsolete) + mem = NULL; + if (mem) + mm = NULL; + } ret = mem_cgroup_charge_common(page, mm, mask, - MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); + MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); + + if (!ret && do_swap_account) { + /* avoid double counting */ + mem = swap_cgroup_record(ent, NULL); + if (mem) { + res_counter_uncharge(&mem->memsw, PAGE_SIZE); + mem_cgroup_put(mem); + } + } } if (!locked) unlock_page(page); @@ -828,6 +927,23 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) return; pc = lookup_page_cgroup(page); __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); + /* + * The swap entry is now backed by an in-memory page. This means the + * page may be counted both as mem and as swap, i.e. double counted. + * Fix it by uncharging from memsw. This SwapCache is stable + * because we're still under lock_page(). + */ + if (do_swap_account) { + swp_entry_t ent = {.val = page_private(page)}; + struct mem_cgroup *memcg; + memcg = swap_cgroup_record(ent, NULL); + if (memcg) { + /* If memcg is obsolete, memcg can be != ptr */ + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_put(memcg); + } + + } } void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) @@ -837,6 +953,8 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) if (!mem) return; res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); css_put(&mem->css); } @@ -844,29 +962,31 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) /* * uncharge if !page_mapped(page) */ -static void +static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { struct page_cgroup *pc; - struct mem_cgroup *mem; + struct mem_cgroup *mem = NULL; struct mem_cgroup_per_zone *mz; unsigned long flags; if (mem_cgroup_subsys.disabled) - return; + return NULL; if (PageSwapCache(page)) - return; + return NULL; /* * Check if our page_cgroup is valid */ pc = lookup_page_cgroup(page); if (unlikely(!pc || !PageCgroupUsed(pc))) - return; + return NULL; lock_page_cgroup(pc); + mem = pc->mem_cgroup; + if (!PageCgroupUsed(pc)) goto unlock_out; @@ -886,8 +1006,11 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } + res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); + ClearPageCgroupUsed(pc); - mem = pc->mem_cgroup; mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); @@ -895,14 +1018,13 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) spin_unlock_irqrestore(&mz->lru_lock, flags); unlock_page_cgroup(pc); - res_counter_uncharge(&mem->res, PAGE_SIZE); css_put(&mem->css); - return; + return mem; unlock_out:
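The swapin path above hinges on the per-entry swap_cgroup record: swap-out stores the owning memcg under the swap entry, swapin charges that memcg again, and the commit step clears the record and drops one memsw charge so the page is not counted twice. A toy userspace model of just that bookkeeping (the slot_owner array stands in for the real swap_cgroup map; record() mimics swap_cgroup_record() returning the previous owner):

#include <assert.h>
#include <stddef.h>

#define NR_SLOTS 128

struct toy_memcg { long mem, memsw; };

/* One owner pointer per swap slot, like the swap_cgroup map. */
static struct toy_memcg *slot_owner[NR_SLOTS];

/* Mimics swap_cgroup_record(): store the new owner, return the old one. */
static struct toy_memcg *record(unsigned int slot, struct toy_memcg *owner)
{
	struct toy_memcg *old = slot_owner[slot];

	slot_owner[slot] = owner;
	return old;
}

static void swap_out(struct toy_memcg *cg, unsigned int slot)
{
	cg->mem--;		/* the page leaves memory... */
	record(slot, cg);	/* ...but stays charged in cg->memsw */
}

static void swap_in_commit(struct toy_memcg *cg, unsigned int slot)
{
	cg->mem++;		/* charged as memory again */
	cg->memsw++;		/* ...and to memsw by the same charge */
	/*
	 * The slot still holds one memsw charge; drop it to avoid the
	 * double count, as mem_cgroup_commit_charge_swapin() does.
	 */
	if (record(slot, NULL))
		cg->memsw--;
}

int main(void)
{
	struct toy_memcg cg = { .mem = 1, .memsw = 1 };

	swap_out(&cg, 7);
	assert(cg.mem == 0 && cg.memsw == 1);
	swap_in_commit(&cg, 7);
	assert(cg.mem == 1 && cg.memsw == 1);	/* no double count */
	return 0;
}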
unlock_page_cgroup(pc); - return; + return NULL; } void mem_cgroup_uncharge_page(struct page *page) @@ -922,11 +1044,43 @@ void mem_cgroup_uncharge_cache_page(struct page *page) __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); } -void mem_cgroup_uncharge_swapcache(struct page *page) +/* + * Called from __delete_from_swap_cache(); drops the "page" account. + * The memcg information is recorded into the swap_cgroup of "ent". + */ +void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) { - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_SWAPOUT); + struct mem_cgroup *memcg; + + memcg = __mem_cgroup_uncharge_common(page, + MEM_CGROUP_CHARGE_TYPE_SWAPOUT); + /* record memcg information */ + if (do_swap_account && memcg) { + swap_cgroup_record(ent, memcg); + mem_cgroup_get(memcg); + } } +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +/* + * Called from swap_entry_free(); removes the record in swap_cgroup and + * uncharges the "memsw" account. + */ +void mem_cgroup_uncharge_swap(swp_entry_t ent) +{ + struct mem_cgroup *memcg; + + if (!do_swap_account) + return; + + memcg = swap_cgroup_record(ent, NULL); + if (memcg) { + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_put(memcg); + } +} +#endif + /* * Before starting migration, account PAGE_SIZE to mem_cgroup that the old * page belongs to. @@ -1034,7 +1188,7 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) rcu_read_unlock(); do { - progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); + progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true); progress += res_counter_check_under_limit(&mem->res); } while (!progress && --retry); @@ -1044,26 +1198,84 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) return 0; } +static DEFINE_MUTEX(set_limit_mutex); + static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, - unsigned long long val) + unsigned long long val) { int retry_count = MEM_CGROUP_RECLAIM_RETRIES; int progress; + u64 memswlimit; int ret = 0; - while (res_counter_set_limit(&memcg->res, val)) { + while (retry_count) { if (signal_pending(current)) { ret = -EINTR; break; } - if (!retry_count) { - ret = -EBUSY; + /* + * Rather than hiding all this in some function, do it in an + * open-coded manner so you can see what it really does. + * We have to guarantee mem->res.limit <= mem->memsw.limit. + */ + mutex_lock(&set_limit_mutex); + memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); + if (memswlimit < val) { + ret = -EINVAL; + mutex_unlock(&set_limit_mutex); break; } + ret = res_counter_set_limit(&memcg->res, val); + mutex_unlock(&set_limit_mutex); + + if (!ret) + break; + progress = try_to_free_mem_cgroup_pages(memcg, - GFP_HIGHUSER_MOVABLE); - if (!progress) + GFP_HIGHUSER_MOVABLE, false); + if (!progress) retry_count--; + } + return ret; +} + +int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, + unsigned long long val) +{ + int retry_count = MEM_CGROUP_RECLAIM_RETRIES; + u64 memlimit, oldusage, curusage; + int ret; + + if (!do_swap_account) + return -EINVAL; + + while (retry_count) { + if (signal_pending(current)) { + ret = -EINTR; + break; + } + /* + * Rather than hiding all this in some function, do it in an + * open-coded manner so you can see what it really does. + * We have to guarantee mem->res.limit <= mem->memsw.limit.
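This invariant is visible from userspace: a write that would leave memory.limit_in_bytes above memory.memsw.limit_in_bytes fails with -EINVAL. A compact sketch of the same check outside the kernel (plain longs instead of res_counters, a pthread mutex standing in for set_limit_mutex):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t limit_mutex = PTHREAD_MUTEX_INITIALIZER;
static long res_limit = 4096, memsw_limit = 8192;

/* Same rule as the patch: res.limit must never exceed memsw.limit. */
static int set_res_limit(long val)
{
	int ret = 0;

	pthread_mutex_lock(&limit_mutex);
	if (val > memsw_limit)
		ret = -EINVAL;
	else
		res_limit = val;
	pthread_mutex_unlock(&limit_mutex);
	return ret;
}

static int set_memsw_limit(long val)
{
	int ret = 0;

	pthread_mutex_lock(&limit_mutex);
	if (val < res_limit)
		ret = -EINVAL;
	else
		memsw_limit = val;
	pthread_mutex_unlock(&limit_mutex);
	return ret;
}

int main(void)
{
	printf("%d\n", set_res_limit(16384));	/* -22: would exceed memsw_limit */
	printf("%d\n", set_memsw_limit(16384));	/* 0 */
	printf("%d\n", set_res_limit(16384));	/* 0: now allowed */
	return 0;
}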
+ */ + mutex_lock(&set_limit_mutex); + memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); + if (memlimit > val) { + ret = -EINVAL; + mutex_unlock(&set_limit_mutex); + break; + } + ret = res_counter_set_limit(&memcg->memsw, val); + mutex_unlock(&set_limit_mutex); + + if (!ret) + break; + + oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + try_to_free_mem_cgroup_pages(memcg, GFP_HIGHUSER_MOVABLE, true); + curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + if (curusage >= oldusage) retry_count--; } return ret; @@ -1193,7 +1405,7 @@ try_to_free: goto out; } progress = try_to_free_mem_cgroup_pages(mem, - GFP_HIGHUSER_MOVABLE); + GFP_HIGHUSER_MOVABLE, false); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ @@ -1216,8 +1428,25 @@ int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { - return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, - cft->private); + struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + u64 val = 0; + int type, name; + + type = MEMFILE_TYPE(cft->private); + name = MEMFILE_ATTR(cft->private); + switch (type) { + case _MEM: + val = res_counter_read_u64(&mem->res, name); + break; + case _MEMSWAP: + if (do_swap_account) + val = res_counter_read_u64(&mem->memsw, name); + break; + default: + BUG(); + break; + } + return val; } /* * The user of this function is... @@ -1227,15 +1456,22 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, const char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + int type, name; unsigned long long val; int ret; - switch (cft->private) { + type = MEMFILE_TYPE(cft->private); + name = MEMFILE_ATTR(cft->private); + switch (name) { case RES_LIMIT: /* This function does all necessary parse...reuse it */ ret = res_counter_memparse_write_strategy(buffer, &val); - if (!ret) + if (ret) + break; + if (type == _MEM) ret = mem_cgroup_resize_limit(memcg, val); + else + ret = mem_cgroup_resize_memsw_limit(memcg, val); break; default: ret = -EINVAL; /* should be BUG() ? 
*/ @@ -1247,14 +1483,23 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) { struct mem_cgroup *mem; + int type, name; mem = mem_cgroup_from_cont(cont); - switch (event) { + type = MEMFILE_TYPE(event); + name = MEMFILE_ATTR(event); + switch (name) { case RES_MAX_USAGE: - res_counter_reset_max(&mem->res); + if (type == _MEM) + res_counter_reset_max(&mem->res); + else + res_counter_reset_max(&mem->memsw); break; case RES_FAILCNT: - res_counter_reset_failcnt(&mem->res); + if (type == _MEM) + res_counter_reset_failcnt(&mem->res); + else + res_counter_reset_failcnt(&mem->memsw); break; } return 0; @@ -1315,24 +1560,24 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", - .private = RES_USAGE, + .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), .read_u64 = mem_cgroup_read, }, { .name = "max_usage_in_bytes", - .private = RES_MAX_USAGE, + .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), .trigger = mem_cgroup_reset, .read_u64 = mem_cgroup_read, }, { .name = "limit_in_bytes", - .private = RES_LIMIT, + .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), .write_string = mem_cgroup_write, .read_u64 = mem_cgroup_read, }, { .name = "failcnt", - .private = RES_FAILCNT, + .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), .trigger = mem_cgroup_reset, .read_u64 = mem_cgroup_read, }, @@ -1346,6 +1591,47 @@ static struct cftype mem_cgroup_files[] = { }, }; +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +static struct cftype memsw_cgroup_files[] = { + { + .name = "memsw.usage_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), + .read_u64 = mem_cgroup_read, + }, + { + .name = "memsw.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), + .trigger = mem_cgroup_reset, + .read_u64 = mem_cgroup_read, + }, + { + .name = "memsw.limit_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), + .write_string = mem_cgroup_write, + .read_u64 = mem_cgroup_read, + }, + { + .name = "memsw.failcnt", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), + .trigger = mem_cgroup_reset, + .read_u64 = mem_cgroup_read, + }, +}; + +static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) +{ + if (!do_swap_account) + return 0; + return cgroup_add_files(cont, ss, memsw_cgroup_files, + ARRAY_SIZE(memsw_cgroup_files)); +} +#else +static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) +{ + return 0; +} +#endif + static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) { struct mem_cgroup_per_node *pn; @@ -1404,14 +1690,44 @@ static struct mem_cgroup *mem_cgroup_alloc(void) return mem; } +/* + * When destroying a mem_cgroup, references from swap_cgroup can remain. + * (Scanning them all at force_empty would be too costly...) + * + * Instead of clearing all references at force_empty, we remember + * the number of references from swap_cgroup and free the mem_cgroup when + * it goes down to 0. + * + * When the mem_cgroup is destroyed, mem->obsolete will be set to 1 and + * an entry which points to this memcg will be ignored at swapin. + * + * Removal of the cgroup itself succeeds regardless of refs from swap.
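The get/put pair that implements this lifecycle is small enough to model on its own: put() frees the group only when the last swap reference is gone and pre_destroy has already marked it obsolete. A toy version with C11 atomics (heap free() standing in for mem_cgroup_free()):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_memcg {
	atomic_int refcnt;	/* one reference per swap_cgroup record */
	bool obsolete;		/* set at pre_destroy, i.e. at rmdir */
};

static void toy_get(struct toy_memcg *mem)
{
	atomic_fetch_add(&mem->refcnt, 1);
}

static void toy_put(struct toy_memcg *mem)
{
	if (atomic_fetch_sub(&mem->refcnt, 1) == 1) {
		/* last swap reference gone; free only if rmdir already ran */
		if (mem->obsolete) {
			free(mem);
			printf("freed\n");
		}
	}
}

int main(void)
{
	struct toy_memcg *mem = calloc(1, sizeof(*mem));

	if (!mem)
		return 1;
	atomic_init(&mem->refcnt, 0);
	toy_get(mem);		/* a swap entry records this memcg */
	mem->obsolete = true;	/* the cgroup directory is removed */
	toy_put(mem);		/* the swap entry is freed: memcg finally goes away */
	return 0;
}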
+ */ + static void mem_cgroup_free(struct mem_cgroup *mem) { + if (atomic_read(&mem->refcnt) > 0) + return; if (mem_cgroup_size() < PAGE_SIZE) kfree(mem); else vfree(mem); } +static void mem_cgroup_get(struct mem_cgroup *mem) +{ + atomic_inc(&mem->refcnt); +} + +static void mem_cgroup_put(struct mem_cgroup *mem) +{ + if (atomic_dec_and_test(&mem->refcnt)) { + if (!mem->obsolete) + return; + mem_cgroup_free(mem); + } +} + #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP static void __init enable_swap_cgroup(void) @@ -1436,6 +1752,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) return ERR_PTR(-ENOMEM); res_counter_init(&mem->res); + res_counter_init(&mem->memsw); for_each_node_state(node, N_POSSIBLE) if (alloc_mem_cgroup_per_zone_info(mem, node)) @@ -1456,6 +1773,7 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + mem->obsolete = 1; mem_cgroup_force_empty(mem, false); } @@ -1474,8 +1792,14 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss, static int mem_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) { - return cgroup_add_files(cont, ss, mem_cgroup_files, - ARRAY_SIZE(mem_cgroup_files)); + int ret; + + ret = cgroup_add_files(cont, ss, mem_cgroup_files, + ARRAY_SIZE(mem_cgroup_files)); + + if (!ret) + ret = register_memsw_files(cont, ss); + return ret; } static void mem_cgroup_move_task(struct cgroup_subsys *ss, diff --git a/mm/memory.c b/mm/memory.c index ba5189e322e6..1358012ffa73 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2431,7 +2431,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, lock_page(page); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - if (mem_cgroup_try_charge(mm, GFP_HIGHUSER_MOVABLE, &ptr) == -ENOMEM) { + if (mem_cgroup_try_charge_swapin(mm, page, + GFP_HIGHUSER_MOVABLE, &ptr) == -ENOMEM) { ret = VM_FAULT_OOM; unlock_page(page); goto out; @@ -2449,8 +2450,20 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_nomap; } - /* The page isn't present yet, go ahead with the fault. */ + /* + * The page isn't present yet, go ahead with the fault. + * + * Be careful about the sequence of operations here. + * To get its accounting right, reuse_swap_page() must be called + * while the page is counted on swap but not yet in mapcount i.e. + * before page_add_anon_rmap() and swap_free(); try_to_free_swap() + * must be called after the swap_free(), or it will never succeed. + * And mem_cgroup_commit_charge_swapin(), which uses the swp_entry + * in page->private, must be called before reuse_swap_page(), + * which may delete_from_swap_cache(). 
+ */ + mem_cgroup_commit_charge_swapin(page, ptr); inc_mm_counter(mm, anon_rss); pte = mk_pte(page, vma->vm_page_prot); if (write_access && reuse_swap_page(page)) { @@ -2461,7 +2474,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); - mem_cgroup_commit_charge_swapin(page, ptr); swap_free(entry); if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) diff --git a/mm/swap_state.c b/mm/swap_state.c index 09291ca11f5f..3ecea98ecb45 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -108,6 +109,8 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { + swp_entry_t ent = {.val = page_private(page)}; + VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapCache(page)); VM_BUG_ON(PageWriteback(page)); @@ -118,7 +121,7 @@ void __delete_from_swap_cache(struct page *page) total_swapcache_pages--; __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); - mem_cgroup_uncharge_swapcache(page); + mem_cgroup_uncharge_swapcache(page, ent); } /** diff --git a/mm/swapfile.c b/mm/swapfile.c index 1e7a715a3866..0579d9069b61 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -471,8 +471,9 @@ out: return NULL; } -static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) +static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) { + unsigned long offset = swp_offset(ent); int count = p->swap_map[offset]; if (count < SWAP_MAP_MAX) { @@ -487,6 +488,7 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) swap_list.next = p - swap_info; nr_swap_pages++; p->inuse_pages--; + mem_cgroup_uncharge_swap(ent); } } return count; @@ -502,7 +504,7 @@ void swap_free(swp_entry_t entry) p = swap_info_get(entry); if (p) { - swap_entry_free(p, swp_offset(entry)); + swap_entry_free(p, entry); spin_unlock(&swap_lock); } } @@ -582,7 +584,7 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, swp_offset(entry)) == 1) { + if (swap_entry_free(p, entry) == 1) { page = find_get_page(&swapper_space, entry.val); if (page && !trylock_page(page)) { page_cache_release(page); @@ -696,7 +698,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_t *pte; int ret = 1; - if (mem_cgroup_try_charge(vma->vm_mm, GFP_HIGHUSER_MOVABLE, &ptr)) + if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, + GFP_HIGHUSER_MOVABLE, &ptr)) ret = -ENOMEM; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); diff --git a/mm/vmscan.c b/mm/vmscan.c index b07c48b09a93..f63b20dd7714 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1661,7 +1661,8 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, - gfp_t gfp_mask) + gfp_t gfp_mask, + bool noswap) { struct scan_control sc = { .may_writepage = !laptop_mode, @@ -1674,6 +1675,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, }; struct zonelist *zonelist; + if (noswap) + sc.may_swap = 0; + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); zonelist = NODE_DATA(numa_node_id())->node_zonelists;
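The new noswap argument closes the loop with the charge path: when the memsw counter is the one over its limit, swapping pages out merely moves charges from mem to memsw, so reclaim is told not to swap. A toy decision helper showing that reasoning (scan_control reduced to the one field this patch touches):

#include <stdbool.h>
#include <stdio.h>

struct toy_scan_control {
	int may_swap;	/* as in the kernel's struct scan_control */
};

/*
 * noswap is true when the mem+swap counter, not the mem counter,
 * rejected the charge: swap-out cannot lower mem+swap usage, so
 * only dropping or writing back pages can help.
 */
static void setup_reclaim(struct toy_scan_control *sc, bool noswap)
{
	sc->may_swap = 1;
	if (noswap)
		sc->may_swap = 0;
}

int main(void)
{
	struct toy_scan_control sc;

	setup_reclaim(&sc, true);	/* over memsw.limit_in_bytes */
	printf("may_swap=%d\n", sc.may_swap);
	setup_reclaim(&sc, false);	/* over limit_in_bytes only */
	printf("may_swap=%d\n", sc.may_swap);
	return 0;
}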