diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a805474df4ab..56e6069d2452 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -97,6 +97,11 @@ enum mem_cgroup_events_target { #define MEM_CGROUP_ID_SHIFT 16 #define MEM_CGROUP_ID_MAX USHRT_MAX +struct mem_cgroup_id { + int id; + atomic_t ref; +}; + struct mem_cgroup_stat_cpu { long count[MEMCG_NR_STAT]; unsigned long events[MEMCG_NR_EVENTS]; @@ -172,6 +177,9 @@ enum memcg_kmem_state { struct mem_cgroup { struct cgroup_subsys_state css; + /* Private memcg ID. Used to ID objects that outlive the cgroup */ + struct mem_cgroup_id id; + /* Accounted resources */ struct page_counter memory; struct page_counter swap; @@ -330,22 +338,9 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) if (mem_cgroup_disabled()) return 0; - return memcg->css.id; -} - -/** - * mem_cgroup_from_id - look up a memcg from an id - * @id: the id to look up - * - * Caller must hold rcu_read_lock() and use css_tryget() as necessary. - */ -static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - struct cgroup_subsys_state *css; - - css = css_from_id(id, &memory_cgrp_subsys); - return mem_cgroup_from_css(css); + return memcg->id.id; } +struct mem_cgroup *mem_cgroup_from_id(unsigned short id); /** * parent_mem_cgroup - find the accounting parent of a memcg diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ac8664db3823..5339c89dff63 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4057,6 +4057,60 @@ static struct cftype mem_cgroup_legacy_files[] = { { }, /* terminate */ }; +/* + * Private memory cgroup IDR + * + * Swap-out records and page cache shadow entries need to store memcg + * references in constrained space, so we maintain an ID space that is + * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of + * memory-controlled cgroups to 64k. + * + * However, there usually are many references to the oflline CSS after + * the cgroup has been destroyed, such as page cache or reclaimable + * slab objects, that don't need to hang on to the ID. We want to keep + * those dead CSS from occupying IDs, or we might quickly exhaust the + * relatively small ID space and prevent the creation of new cgroups + * even when there are much fewer than 64k cgroups - possibly none. + * + * Maintain a private 16-bit ID space for memcg, and allow the ID to + * be freed and recycled when it's no longer needed, which is usually + * when the CSS is offlined. + * + * The only exception to that are records of swapped out tmpfs/shmem + * pages that need to be attributed to live ancestors on swapin. But + * those references are manageable from userspace. + */ + +static DEFINE_IDR(mem_cgroup_idr); + +static void mem_cgroup_id_get(struct mem_cgroup *memcg) +{ + atomic_inc(&memcg->id.ref); +} + +static void mem_cgroup_id_put(struct mem_cgroup *memcg) +{ + if (atomic_dec_and_test(&memcg->id.ref)) { + idr_remove(&mem_cgroup_idr, memcg->id.id); + memcg->id.id = 0; + + /* Memcg ID pins CSS */ + css_put(&memcg->css); + } +} + +/** + * mem_cgroup_from_id - look up a memcg from a memcg id + * @id: the memcg id to look up + * + * Caller must hold rcu_read_lock(). + */ +struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return idr_find(&mem_cgroup_idr, id); +} + static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; @@ -4116,6 +4170,12 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (!memcg) return NULL; + memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, + 1, MEM_CGROUP_ID_MAX, + GFP_KERNEL); + if (memcg->id.id < 0) + goto fail; + memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!memcg->stat) goto fail; @@ -4142,8 +4202,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void) #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); #endif + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; fail: + if (memcg->id.id > 0) + idr_remove(&mem_cgroup_idr, memcg->id.id); mem_cgroup_free(memcg); return NULL; } @@ -4206,12 +4269,11 @@ fail: return ERR_PTR(-ENOMEM); } -static int -mem_cgroup_css_online(struct cgroup_subsys_state *css) +static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { - if (css->id > MEM_CGROUP_ID_MAX) - return -ENOSPC; - + /* Online state pins memcg ID, memcg ID pins CSS */ + mem_cgroup_id_get(mem_cgroup_from_css(css)); + css_get(css); return 0; } @@ -4234,6 +4296,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); wb_memcg_offline(memcg); + + mem_cgroup_id_put(memcg); } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) @@ -5756,6 +5820,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!memcg) return; + mem_cgroup_id_get(memcg); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); @@ -5774,6 +5839,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON(!irqs_disabled()); mem_cgroup_charge_statistics(memcg, page, false, -1); memcg_check_events(memcg, page); + + if (!mem_cgroup_is_root(memcg)) + css_put(&memcg->css); } /* @@ -5804,11 +5872,11 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) !page_counter_try_charge(&memcg->swap, 1, &counter)) return -ENOMEM; + mem_cgroup_id_get(memcg); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); - css_get(&memcg->css); return 0; } @@ -5837,7 +5905,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) page_counter_uncharge(&memcg->memsw, 1); } mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); + mem_cgroup_id_put(memcg); } rcu_read_unlock(); } diff --git a/mm/slab_common.c b/mm/slab_common.c index a65dad7fdcd1..82317abb03ed 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -526,8 +526,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); - cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - css->id, memcg_name_buf); + cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name, + css->serial_nr, memcg_name_buf); if (!cache_name) goto out_unlock;