diff --git a/include/linux/percpu.h b/include/linux/percpu.h index d1b416da25ed..a3aa63e47637 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -48,9 +48,9 @@ * intelligent way to determine this would be nice. */ #if BITS_PER_LONG > 32 -#define PERCPU_DYNAMIC_RESERVE (20 << 10) +#define PERCPU_DYNAMIC_RESERVE (28 << 10) #else -#define PERCPU_DYNAMIC_RESERVE (12 << 10) +#define PERCPU_DYNAMIC_RESERVE (20 << 10) #endif extern void *pcpu_base_addr; diff --git a/mm/percpu.c b/mm/percpu.c index 28a830590b4c..867efd38d879 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -78,6 +78,8 @@ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ #define PCPU_ATOMIC_MAP_MARGIN_LOW 32 #define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 +#define PCPU_EMPTY_POP_PAGES_LOW 2 +#define PCPU_EMPTY_POP_PAGES_HIGH 4 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ @@ -168,9 +170,22 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ */ static int pcpu_nr_empty_pop_pages; -/* balance work is used to populate or destroy chunks asynchronously */ +/* + * Balance work is used to populate or destroy chunks asynchronously. We + * try to keep the number of populated free pages between + * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one + * empty chunk. + */ static void pcpu_balance_workfn(struct work_struct *work); static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); +static bool pcpu_async_enabled __read_mostly; +static bool pcpu_atomic_alloc_failed; + +static void pcpu_schedule_balance_work(void) +{ + if (pcpu_async_enabled) + schedule_work(&pcpu_balance_work); +} static bool pcpu_addr_in_first_chunk(void *addr) { @@ -386,7 +401,8 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) margin = 3; if (chunk->map_alloc < - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && + pcpu_async_enabled) schedule_work(&chunk->map_extend_work); } else { margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; @@ -1005,6 +1021,9 @@ area_found: if (chunk != pcpu_reserved_chunk) pcpu_nr_empty_pop_pages -= occ_pages; + if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) + pcpu_schedule_balance_work(); + /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); @@ -1023,6 +1042,11 @@ fail: if (!--warn_limit) pr_info("PERCPU: limit reached, disable warning\n"); } + if (is_atomic) { + /* see the flag handling in pcpu_blance_workfn() */ + pcpu_atomic_alloc_failed = true; + pcpu_schedule_balance_work(); + } return NULL; } @@ -1080,7 +1104,7 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) } /** - * pcpu_balance_workfn - reclaim fully free chunks, workqueue function + * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * * Reclaim all fully free chunks except for the first one. @@ -1090,7 +1114,12 @@ static void pcpu_balance_workfn(struct work_struct *work) LIST_HEAD(to_free); struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; + int slot, nr_to_pop, ret; + /* + * There's no reason to keep around multiple unused chunks and VM + * areas can be scarce. Destroy all free chunks except for one. + */ mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); @@ -1118,6 +1147,74 @@ static void pcpu_balance_workfn(struct work_struct *work) pcpu_destroy_chunk(chunk); } + /* + * Ensure there are certain number of free populated pages for + * atomic allocs. Fill up from the most packed so that atomic + * allocs don't increase fragmentation. If atomic allocation + * failed previously, always populate the maximum amount. This + * should prevent atomic allocs larger than PAGE_SIZE from keeping + * failing indefinitely; however, large atomic allocs are not + * something we support properly and can be highly unreliable and + * inefficient. + */ +retry_pop: + if (pcpu_atomic_alloc_failed) { + nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; + /* best effort anyway, don't worry about synchronization */ + pcpu_atomic_alloc_failed = false; + } else { + nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - + pcpu_nr_empty_pop_pages, + 0, PCPU_EMPTY_POP_PAGES_HIGH); + } + + for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { + int nr_unpop = 0, rs, re; + + if (!nr_to_pop) + break; + + spin_lock_irq(&pcpu_lock); + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + nr_unpop = pcpu_unit_pages - chunk->nr_populated; + if (nr_unpop) + break; + } + spin_unlock_irq(&pcpu_lock); + + if (!nr_unpop) + continue; + + /* @chunk can't go away while pcpu_alloc_mutex is held */ + pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { + int nr = min(re - rs, nr_to_pop); + + ret = pcpu_populate_chunk(chunk, rs, rs + nr); + if (!ret) { + nr_to_pop -= nr; + spin_lock_irq(&pcpu_lock); + pcpu_chunk_populated(chunk, rs, rs + nr); + spin_unlock_irq(&pcpu_lock); + } else { + nr_to_pop = 0; + } + + if (!nr_to_pop) + break; + } + } + + if (nr_to_pop) { + /* ran out of chunks to populate, create a new one and retry */ + chunk = pcpu_create_chunk(); + if (chunk) { + spin_lock_irq(&pcpu_lock); + pcpu_chunk_relocate(chunk, -1); + spin_unlock_irq(&pcpu_lock); + goto retry_pop; + } + } + mutex_unlock(&pcpu_alloc_mutex); } @@ -1160,7 +1257,7 @@ void free_percpu(void __percpu *ptr) list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - schedule_work(&pcpu_balance_work); + pcpu_schedule_balance_work(); break; } } @@ -2187,3 +2284,15 @@ void __init percpu_init_late(void) spin_unlock_irqrestore(&pcpu_lock, flags); } } + +/* + * Percpu allocator is initialized early during boot when neither slab or + * workqueue is available. Plug async management until everything is up + * and running. + */ +static int __init percpu_enable_async(void) +{ + pcpu_async_enabled = true; + return 0; +} +subsys_initcall(percpu_enable_async);