Merge branch 'tj-percpu' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc into core/percpu

Conflicts:
	arch/x86/include/asm/pgtable.h
Ingo Molnar 2009-02-24 21:52:45 +01:00
commit 0edcf8d692
25 changed files with 1720 additions and 198 deletions


@@ -189,9 +189,21 @@ callback_init(void * kernel_end)
 	if (alpha_using_srm) {
 		static struct vm_struct console_remap_vm;
-		unsigned long vaddr = VMALLOC_START;
+		unsigned long nr_pages = 0;
+		unsigned long vaddr;
 		unsigned long i, j;
 
+		/* calculate needed size */
+		for (i = 0; i < crb->map_entries; ++i)
+			nr_pages += crb->map[i].count;
+
+		/* register the vm area */
+		console_remap_vm.flags = VM_ALLOC;
+		console_remap_vm.size = nr_pages << PAGE_SHIFT;
+		vm_area_register_early(&console_remap_vm, PAGE_SIZE);
+
+		vaddr = (unsigned long)console_remap_vm.addr;
+
 		/* Set up the third level PTEs and update the virtual
 		   addresses of the CRB entries.  */
 		for (i = 0; i < crb->map_entries; ++i) {
@@ -213,12 +225,6 @@ callback_init(void * kernel_end)
 			vaddr += PAGE_SIZE;
 		}
 	}
-
-	/* Let vmalloc know that we've allocated some space.  */
-	console_remap_vm.flags = VM_ALLOC;
-	console_remap_vm.addr = (void *) VMALLOC_START;
-	console_remap_vm.size = vaddr - VMALLOC_START;
-	vmlist = &console_remap_vm;
 	}
 
 	callback_init_done = 1;


@@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt"
 config QUICKLIST
 	def_bool y
 
-config HAVE_ARCH_BOOTMEM_NODE
+config HAVE_ARCH_BOOTMEM
 	def_bool n
 
 config ARCH_HAVE_MEMORY_PRESENT


@@ -135,6 +135,9 @@ config ARCH_HAS_CACHE_LINE_SIZE
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool y
 
+config HAVE_DYNAMIC_PER_CPU_AREA
+	def_bool y
+
 config HAVE_CPUMASK_OF_CPU_MAP
 	def_bool X86_64_SMP
 
@@ -1122,7 +1125,7 @@ config NODES_SHIFT
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system.  Increases memory reserved to accommodate various tables.
 
-config HAVE_ARCH_BOOTMEM_NODE
+config HAVE_ARCH_BOOTMEM
 	def_bool y
 	depends on X86_32 && NUMA


@@ -91,45 +91,12 @@ static inline int pfn_valid(int pfn)
 #endif /* CONFIG_DISCONTIGMEM */
 
 #ifdef CONFIG_NEED_MULTIPLE_NODES
-/*
- * Following are macros that are specific to this numa platform.
- */
-#define reserve_bootmem(addr, size, flags) \
-	reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
-#define alloc_bootmem(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_nopanic(x) \
-	__alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
-				__pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
-#define alloc_bootmem_pages(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_pages_nopanic(x) \
-	__alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
-				__pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low_pages(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
-#define alloc_bootmem_node(pgdat, x) \
-({ \
-	struct pglist_data  __maybe_unused \
-			*__alloc_bootmem_node__pgdat = (pgdat); \
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
-						__pa(MAX_DMA_ADDRESS)); \
-})
-#define alloc_bootmem_pages_node(pgdat, x) \
-({ \
-	struct pglist_data  __maybe_unused \
-			*__alloc_bootmem_node__pgdat = (pgdat); \
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \
-						__pa(MAX_DMA_ADDRESS)); \
-})
-#define alloc_bootmem_low_pages_node(pgdat, x) \
-({ \
-	struct pglist_data  __maybe_unused \
-			*__alloc_bootmem_node__pgdat = (pgdat); \
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \
-})
+/* always use node 0 for bootmem on this numa platform */
+#define alloc_bootmem_core(__bdata, size, align, goal, limit)		\
+({									\
+	bootmem_data_t __maybe_unused * __abm_bdata_dummy = (__bdata);	\
+	__alloc_bootmem_core(NODE_DATA(0)->bdata,			\
+			     (size), (align), (goal), (limit));		\
+})
 #endif /* CONFIG_NEED_MULTIPLE_NODES */


@@ -43,6 +43,14 @@
 #else /* ...!ASSEMBLY */
 
 #include <linux/stringify.h>
+#include <asm/sections.h>
+
+#define __addr_to_pcpu_ptr(addr)					\
+	(void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr	\
+		 + (unsigned long)__per_cpu_start)
+#define __pcpu_ptr_to_addr(ptr)						\
+	(void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr	\
+		 - (unsigned long)__per_cpu_start)
 
 #ifdef CONFIG_SMP
 #define __percpu_arg(x)		"%%"__stringify(__percpu_seg)":%P" #x

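The two helper macros added above are just an affine translation between an address inside the first percpu chunk and the "symbolic" pointer that offsets against __per_cpu_start. A minimal user-space sketch of that round trip (not part of the commit; all numeric values are invented for illustration):

	#include <stdio.h>

	int main(void)
	{
		unsigned long pcpu_base_addr  = 0xffffc90000000000UL; /* example first-chunk base */
		unsigned long __per_cpu_start = 0xffffffff80a00000UL; /* example .data.percpu start */
		unsigned long addr = pcpu_base_addr + 0x40;           /* some percpu object in the chunk */

		/* __addr_to_pcpu_ptr(): chunk address -> percpu pointer */
		unsigned long ptr  = addr - pcpu_base_addr + __per_cpu_start;
		/* __pcpu_ptr_to_addr(): percpu pointer -> chunk address */
		unsigned long back = ptr + pcpu_base_addr - __per_cpu_start;

		printf("addr=%#lx ptr=%#lx back=%#lx\n", addr, ptr, back);
		return back == addr ? 0 : 1;	/* translation is exactly invertible */
	}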

@@ -288,6 +288,8 @@ static inline int is_new_memtype_allowed(unsigned long flags,
 	return 1;
 }
 
+pmd_t *populate_extra_pmd(unsigned long vaddr);
+pte_t *populate_extra_pte(unsigned long vaddr);
 #endif	/* __ASSEMBLY__ */
 
 #ifdef CONFIG_X86_32


@@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	if (!data)
 		return -ENOMEM;
 
-	data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
+	data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
 	per_cpu(drv_data, cpu) = data;
 
 	if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))


@@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/uaccess.h>
+#include <linux/percpu.h>
 
 #include <asm/apic.h>
 
@@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
 union irq_ctx {
 	struct thread_info	tinfo;
 	u32			stack[THREAD_SIZE/sizeof(u32)];
-};
+} __attribute__((aligned(PAGE_SIZE)));
 
-static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
-static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
+static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
+static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
 
-static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
-static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
 
 static void call_on_stack(void *func, void *stack)
 {
@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
 	u32 *isp, arg1, arg2;
 
 	curctx = (union irq_ctx *) current_thread_info();
-	irqctx = hardirq_ctx[smp_processor_id()];
+	irqctx = __get_cpu_var(hardirq_ctx);
 
 	/*
 	 * this is where we switch to the IRQ stack. However, if we are
@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
 {
 	union irq_ctx *irqctx;
 
-	if (hardirq_ctx[cpu])
+	if (per_cpu(hardirq_ctx, cpu))
 		return;
 
-	irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
+	irqctx = &per_cpu(hardirq_stack, cpu);
 	irqctx->tinfo.task		= NULL;
 	irqctx->tinfo.exec_domain	= NULL;
 	irqctx->tinfo.cpu		= cpu;
 	irqctx->tinfo.preempt_count	= HARDIRQ_OFFSET;
 	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
 
-	hardirq_ctx[cpu] = irqctx;
+	per_cpu(hardirq_ctx, cpu) = irqctx;
 
-	irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE];
+	irqctx = &per_cpu(softirq_stack, cpu);
 	irqctx->tinfo.task		= NULL;
 	irqctx->tinfo.exec_domain	= NULL;
 	irqctx->tinfo.cpu		= cpu;
 	irqctx->tinfo.preempt_count	= 0;
 	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
 
-	softirq_ctx[cpu] = irqctx;
+	per_cpu(softirq_ctx, cpu) = irqctx;
 
 	printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
-	       cpu, hardirq_ctx[cpu], softirq_ctx[cpu]);
+	       cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
 }
 
 void irq_ctx_exit(int cpu)
 {
-	hardirq_ctx[cpu] = NULL;
+	per_cpu(hardirq_ctx, cpu) = NULL;
 }
 
 asmlinkage void do_softirq(void)
@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
 	if (local_softirq_pending()) {
 		curctx = current_thread_info();
-		irqctx = softirq_ctx[smp_processor_id()];
+		irqctx = __get_cpu_var(softirq_ctx);
 		irqctx->tinfo.task = curctx->task;
 		irqctx->tinfo.previous_esp = current_stack_pointer;


@@ -7,6 +7,7 @@
 #include <linux/crash_dump.h>
 #include <linux/smp.h>
 #include <linux/topology.h>
+#include <linux/pfn.h>
 #include <asm/sections.h>
 #include <asm/processor.h>
 #include <asm/setup.h>
@@ -41,6 +42,321 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
 };
 EXPORT_SYMBOL(__per_cpu_offset);
/**
* pcpu_need_numa - determine percpu allocation needs to consider NUMA
*
* If NUMA is not configured or there is only one NUMA node available,
* there is no reason to consider NUMA. This function determines
* whether percpu allocation should consider NUMA or not.
*
* RETURNS:
* true if NUMA should be considered; otherwise, false.
*/
static bool __init pcpu_need_numa(void)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
pg_data_t *last = NULL;
unsigned int cpu;
for_each_possible_cpu(cpu) {
int node = early_cpu_to_node(cpu);
if (node_online(node) && NODE_DATA(node) &&
last && last != NODE_DATA(node))
return true;
last = NODE_DATA(node);
}
#endif
return false;
}
/**
* pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
* @cpu: cpu to allocate for
* @size: size allocation in bytes
* @align: alignment
*
* Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
* does the right thing for NUMA regardless of the current
* configuration.
*
* RETURNS:
* Pointer to the allocated area on success, NULL on failure.
*/
static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
unsigned long align)
{
const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NEED_MULTIPLE_NODES
int node = early_cpu_to_node(cpu);
void *ptr;
if (!node_online(node) || !NODE_DATA(node)) {
ptr = __alloc_bootmem_nopanic(size, align, goal);
pr_info("cpu %d has no node %d or node-local memory\n",
cpu, node);
pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
cpu, size, __pa(ptr));
} else {
ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
size, align, goal);
pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
"%016lx\n", cpu, size, node, __pa(ptr));
}
return ptr;
#else
return __alloc_bootmem_nopanic(size, align, goal);
#endif
}
/*
* Remap allocator
*
* This allocator uses PMD page as unit. A PMD page is allocated for
* each cpu and each is remapped into vmalloc area using PMD mapping.
* As PMD page is quite large, only part of it is used for the first
* chunk. Unused part is returned to the bootmem allocator.
*
* So, the PMD pages are mapped twice - once to the physical mapping
* and to the vmalloc area for the first percpu chunk. The double
* mapping does add one more PMD TLB entry pressure but still is much
* better than only using 4k mappings while still being NUMA friendly.
*/
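/*
 * [Editor's illustration, not part of the file; the sizes are invented.]
 *
 * Example of the remap scheme described above: with roughly 128KB of
 * static percpu data and a 24KB PERCPU_DYNAMIC_RESERVE, pcpur_size is
 * page-aligned to 152KB.  One 2MB PMD page is allocated per cpu from
 * node-local bootmem, the trailing 2MB - 152KB is returned to bootmem,
 * and the page is additionally mapped into the vmalloc area with a
 * single large-page PMD; pcpur_get_page() only hands the first 152KB
 * of each unit to the percpu allocator.
 */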
#ifdef CONFIG_NEED_MULTIPLE_NODES
static size_t pcpur_size __initdata;
static void **pcpur_ptrs __initdata;
static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
{
size_t off = (size_t)pageno << PAGE_SHIFT;
if (off >= pcpur_size)
return NULL;
return virt_to_page(pcpur_ptrs[cpu] + off);
}
static ssize_t __init setup_pcpu_remap(size_t static_size)
{
static struct vm_struct vm;
pg_data_t *last;
size_t ptrs_size;
unsigned int cpu;
ssize_t ret;
/*
* If large page isn't supported, there's no benefit in doing
* this. Also, on non-NUMA, embedding is better.
*/
if (!cpu_has_pse || pcpu_need_numa())
return -EINVAL;
last = NULL;
for_each_possible_cpu(cpu) {
int node = early_cpu_to_node(cpu);
if (node_online(node) && NODE_DATA(node) &&
last && last != NODE_DATA(node))
goto proceed;
last = NODE_DATA(node);
}
return -EINVAL;
proceed:
/*
* Currently supports only single page. Supporting multiple
* pages won't be too difficult if it ever becomes necessary.
*/
pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
if (pcpur_size > PMD_SIZE) {
pr_warning("PERCPU: static data is larger than large page, "
"can't use large page\n");
return -EINVAL;
}
/* allocate pointer array and alloc large pages */
ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
pcpur_ptrs = alloc_bootmem(ptrs_size);
for_each_possible_cpu(cpu) {
pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
if (!pcpur_ptrs[cpu])
goto enomem;
/*
* Only use pcpur_size bytes and give back the rest.
*
* Ingo: The 2MB up-rounding bootmem is needed to make
* sure the partial 2MB page is still fully RAM - it's
* not well-specified to have a PAT-incompatible area
* (unmapped RAM, device memory, etc.) in that hole.
*/
free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
PMD_SIZE - pcpur_size);
memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
}
/* allocate address and map */
vm.flags = VM_ALLOC;
vm.size = num_possible_cpus() * PMD_SIZE;
vm_area_register_early(&vm, PMD_SIZE);
for_each_possible_cpu(cpu) {
pmd_t *pmd;
pmd = populate_extra_pmd((unsigned long)vm.addr
+ cpu * PMD_SIZE);
set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
PAGE_KERNEL_LARGE));
}
/* we're ready, commit */
pr_info("PERCPU: Remapped at %p with large pages, static data "
"%zu bytes\n", vm.addr, static_size);
ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE,
pcpur_size - static_size, vm.addr, NULL);
goto out_free_ar;
enomem:
for_each_possible_cpu(cpu)
if (pcpur_ptrs[cpu])
free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
ret = -ENOMEM;
out_free_ar:
free_bootmem(__pa(pcpur_ptrs), ptrs_size);
return ret;
}
#else
static ssize_t __init setup_pcpu_remap(size_t static_size)
{
return -EINVAL;
}
#endif
/*
* Embedding allocator
*
* The first chunk is sized to just contain the static area plus
* PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using
* bootmem allocator and used as-is without being mapped into vmalloc
* area. This enables the first chunk to piggy back on the linear
* physical PMD mapping and doesn't add any additional pressure to
* TLB.
*/
static void *pcpue_ptr __initdata;
static size_t pcpue_unit_size __initdata;
static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
{
return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size
+ ((size_t)pageno << PAGE_SHIFT));
}
static ssize_t __init setup_pcpu_embed(size_t static_size)
{
unsigned int cpu;
/*
* If large page isn't supported, there's no benefit in doing
* this. Also, embedding allocation doesn't play well with
* NUMA.
*/
if (!cpu_has_pse || pcpu_need_numa())
return -EINVAL;
/* allocate and copy */
pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
pcpue_unit_size = max(pcpue_unit_size, PCPU_MIN_UNIT_SIZE);
pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
PAGE_SIZE);
if (!pcpue_ptr)
return -ENOMEM;
for_each_possible_cpu(cpu)
memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load,
static_size);
/* we're ready, commit */
pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size);
return pcpu_setup_first_chunk(pcpue_get_page, static_size,
pcpue_unit_size,
pcpue_unit_size - static_size, pcpue_ptr,
NULL);
}
/*
* 4k page allocator
*
* This is the basic allocator. Static percpu area is allocated
* page-by-page and most of initialization is done by the generic
* setup function.
*/
static struct page **pcpu4k_pages __initdata;
static int pcpu4k_nr_static_pages __initdata;
static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
{
if (pageno < pcpu4k_nr_static_pages)
return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
return NULL;
}
static void __init pcpu4k_populate_pte(unsigned long addr)
{
populate_extra_pte(addr);
}
static ssize_t __init setup_pcpu_4k(size_t static_size)
{
size_t pages_size;
unsigned int cpu;
int i, j;
ssize_t ret;
pcpu4k_nr_static_pages = PFN_UP(static_size);
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
* sizeof(pcpu4k_pages[0]));
pcpu4k_pages = alloc_bootmem(pages_size);
/* allocate and copy */
j = 0;
for_each_possible_cpu(cpu)
for (i = 0; i < pcpu4k_nr_static_pages; i++) {
void *ptr;
ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
if (!ptr)
goto enomem;
memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
pcpu4k_pages[j++] = virt_to_page(ptr);
}
/* we're ready, commit */
pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
pcpu4k_nr_static_pages, static_size);
ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL,
pcpu4k_populate_pte);
goto out_free_ar;
enomem:
while (--j >= 0)
free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
ret = -ENOMEM;
out_free_ar:
free_bootmem(__pa(pcpu4k_pages), pages_size);
return ret;
}
 static inline void setup_percpu_segment(int cpu)
 {
 #ifdef CONFIG_X86_32
@@ -61,38 +377,35 @@ static inline void setup_percpu_segment(int cpu)
  */
 void __init setup_per_cpu_areas(void)
 {
-	ssize_t size;
-	char *ptr;
-	int cpu;
-
-	/* Copy section for each CPU (we discard the original) */
-	size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
+	size_t static_size = __per_cpu_end - __per_cpu_start;
+	unsigned int cpu;
+	unsigned long delta;
+	size_t pcpu_unit_size;
+	ssize_t ret;
 
 	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
 		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
 
-	pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);
+	/*
+	 * Allocate percpu area.  If PSE is supported, try to make use
+	 * of large page mappings.  Please read comments on top of
+	 * each allocator for details.
+	 */
+	ret = setup_pcpu_remap(static_size);
+	if (ret < 0)
+		ret = setup_pcpu_embed(static_size);
+	if (ret < 0)
+		ret = setup_pcpu_4k(static_size);
+	if (ret < 0)
+		panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
+		      static_size, ret);
+	pcpu_unit_size = ret;
 
+	/* alrighty, percpu areas up and running */
+	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 	for_each_possible_cpu(cpu) {
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-		ptr = alloc_bootmem_pages(size);
-#else
-		int node = early_cpu_to_node(cpu);
-
-		if (!node_online(node) || !NODE_DATA(node)) {
-			ptr = alloc_bootmem_pages(size);
-			pr_info("cpu %d has no node %d or node-local memory\n",
-				cpu, node);
-			pr_debug("per cpu data for cpu%d at %016lx\n",
-				 cpu, __pa(ptr));
-		} else {
-			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
-			pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
-				cpu, node, __pa(ptr));
-		}
-#endif
-
-		memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
-		per_cpu_offset(cpu) = ptr - __per_cpu_start;
+		per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
 		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
 		per_cpu(cpu_number, cpu) = cpu;
 		setup_percpu_segment(cpu);

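With the rewritten setup_per_cpu_areas() above, every CPU's copy of a static percpu variable sits at a fixed stride inside the first chunk, so the per-cpu offset reduces to a linear expression. A small sketch of that arithmetic (not part of the commit; all addresses and sizes are invented):

	#include <stdio.h>

	int main(void)
	{
		unsigned long __per_cpu_start = 0xffffffff80a00000UL; /* example link-time address */
		unsigned long pcpu_base_addr  = 0xffffc90000000000UL; /* example first-chunk base */
		unsigned long pcpu_unit_size  = 0x8000;               /* example value returned by setup_pcpu_*() */
		unsigned long var_link_addr   = __per_cpu_start + 0x120; /* some DEFINE_PER_CPU variable */

		/* delta and per-cpu offsets exactly as computed in setup_per_cpu_areas() */
		unsigned long delta = pcpu_base_addr - __per_cpu_start;
		for (unsigned int cpu = 0; cpu < 4; cpu++) {
			unsigned long off = delta + cpu * pcpu_unit_size;
			printf("cpu%u: per_cpu_offset=%#lx, &var=%#lx\n",
			       cpu, off, var_link_addr + off);
		}
		return 0;
	}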

@@ -137,6 +137,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	return pte_offset_kernel(pmd, 0);
 }
 
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+	int pgd_idx = pgd_index(vaddr);
+	int pmd_idx = pmd_index(vaddr);
+
+	return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+	int pte_idx = pte_index(vaddr);
+	pmd_t *pmd;
+
+	pmd = populate_extra_pmd(vaddr);
+	return one_page_table_init(pmd) + pte_idx;
+}
+
 static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
 					   unsigned long vaddr, pte_t *lastpte)
 {


@@ -168,34 +168,51 @@ static __ref void *spp_getpage(void)
 	return ptr;
 }
 
-void
-set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr)
+{
+	if (pgd_none(*pgd)) {
+		pud_t *pud = (pud_t *)spp_getpage();
+		pgd_populate(&init_mm, pgd, pud);
+		if (pud != pud_offset(pgd, 0))
+			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
+			       pud, pud_offset(pgd, 0));
+	}
+	return pud_offset(pgd, vaddr);
+}
+
+static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr)
+{
+	if (pud_none(*pud)) {
+		pmd_t *pmd = (pmd_t *) spp_getpage();
+		pud_populate(&init_mm, pud, pmd);
+		if (pmd != pmd_offset(pud, 0))
+			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+			       pmd, pmd_offset(pud, 0));
+	}
+	return pmd_offset(pud, vaddr);
+}
+
+static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr)
+{
+	if (pmd_none(*pmd)) {
+		pte_t *pte = (pte_t *) spp_getpage();
+		pmd_populate_kernel(&init_mm, pmd, pte);
+		if (pte != pte_offset_kernel(pmd, 0))
+			printk(KERN_ERR "PAGETABLE BUG #02!\n");
+	}
+	return pte_offset_kernel(pmd, vaddr);
+}
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
 {
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
 
 	pud = pud_page + pud_index(vaddr);
-	if (pud_none(*pud)) {
-		pmd = (pmd_t *) spp_getpage();
-		pud_populate(&init_mm, pud, pmd);
-		if (pmd != pmd_offset(pud, 0)) {
-			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
-				pmd, pmd_offset(pud, 0));
-			return;
-		}
-	}
-	pmd = pmd_offset(pud, vaddr);
-	if (pmd_none(*pmd)) {
-		pte = (pte_t *) spp_getpage();
-		pmd_populate_kernel(&init_mm, pmd, pte);
-		if (pte != pte_offset_kernel(pmd, 0)) {
-			printk(KERN_ERR "PAGETABLE BUG #02!\n");
-			return;
-		}
-	}
-	pte = pte_offset_kernel(pmd, vaddr);
+	pmd = fill_pmd(pud, vaddr);
+	pte = fill_pte(pmd, vaddr);
 
 	set_pte(pte, new_pte);
 
 	/*
@@ -205,8 +222,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
 	__flush_tlb_one(vaddr);
 }
 
-void
-set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 {
 	pgd_t *pgd;
 	pud_t *pud_page;
@@ -223,6 +239,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 	set_pte_vaddr_pud(pud_page, vaddr, pteval);
 }
 
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+
+	pgd = pgd_offset_k(vaddr);
+	pud = fill_pud(pgd, vaddr);
+	return fill_pmd(pud, vaddr);
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+	pmd_t *pmd;
+
+	pmd = populate_extra_pmd(vaddr);
+	return fill_pte(pmd, vaddr);
+}
+
 /*
  * Create large page table mappings for a range of physical addresses.
  */


@@ -363,7 +363,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!bt->sequence)
 		goto err;
 
-	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG);
+	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
 	if (!bt->msg_data)
 		goto err;


@@ -516,12 +516,12 @@ int acpi_processor_preregister_performance(
 			continue;
 		}
 
-		if (!performance || !percpu_ptr(performance, i)) {
+		if (!performance || !per_cpu_ptr(performance, i)) {
 			retval = -EINVAL;
 			continue;
 		}
 
-		pr->performance = percpu_ptr(performance, i);
+		pr->performance = per_cpu_ptr(performance, i);
 		cpumask_set_cpu(i, pr->performance->shared_cpu_map);
 		if (acpi_processor_get_psd(pr)) {
 			retval = -EINVAL;


@@ -65,23 +65,20 @@ extern void free_bootmem(unsigned long addr, unsigned long size);
 #define BOOTMEM_DEFAULT		0
 #define BOOTMEM_EXCLUSIVE	(1<<0)
 
+extern int reserve_bootmem(unsigned long addr,
+			   unsigned long size,
+			   int flags);
 extern int reserve_bootmem_node(pg_data_t *pgdat,
 				unsigned long physaddr,
 				unsigned long size,
 				int flags);
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
-extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags);
-#endif
 
-extern void *__alloc_bootmem_nopanic(unsigned long size,
+extern void *__alloc_bootmem(unsigned long size,
 			     unsigned long align,
 			     unsigned long goal);
-extern void *__alloc_bootmem(unsigned long size,
+extern void *__alloc_bootmem_nopanic(unsigned long size,
 				     unsigned long align,
 				     unsigned long goal);
-extern void *__alloc_bootmem_low(unsigned long size,
-				 unsigned long align,
-				 unsigned long goal);
 extern void *__alloc_bootmem_node(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
@@ -90,30 +87,35 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 					  unsigned long size,
 					  unsigned long align,
 					  unsigned long goal);
+extern void *__alloc_bootmem_low(unsigned long size,
+				 unsigned long align,
+				 unsigned long goal);
 extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 				      unsigned long size,
 				      unsigned long align,
 				      unsigned long goal);
 
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 #define alloc_bootmem(x) \
 	__alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_nopanic(x) \
 	__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low(x) \
-	__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
 #define alloc_bootmem_pages(x) \
 	__alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_pages_nopanic(x) \
 	__alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low_pages(x) \
-	__alloc_bootmem_low(x, PAGE_SIZE, 0)
 #define alloc_bootmem_node(pgdat, x) \
 	__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_pages_node(pgdat, x) \
 	__alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_pages_node_nopanic(pgdat, x) \
+	__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+
+#define alloc_bootmem_low(x) \
+	__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
+#define alloc_bootmem_low_pages(x) \
+	__alloc_bootmem_low(x, PAGE_SIZE, 0)
 #define alloc_bootmem_low_pages_node(pgdat, x) \
 	__alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
-#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
 extern int reserve_bootmem_generic(unsigned long addr, unsigned long size,
 				   int flags);


@@ -76,52 +76,98 @@
 #ifdef CONFIG_SMP
 
+#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+
+/* minimum unit size, also is the maximum supported allocation size */
+#define PCPU_MIN_UNIT_SIZE		(16UL << PAGE_SHIFT)
+
+/*
+ * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
+ * back on the first chunk if arch is manually allocating and mapping
+ * it for faster access (as a part of large page mapping for example).
+ * Note that dynamic percpu allocator covers both static and dynamic
+ * areas, so these values are bigger than PERCPU_MODULE_RESERVE.
+ *
+ * On typical configuration with modules, the following values leave
+ * about 8k of free space on the first chunk after boot on both x86_32
+ * and 64 when module support is enabled.  When module support is
+ * disabled, it's much tighter.
+ */
+#ifndef PERCPU_DYNAMIC_RESERVE
+#  if BITS_PER_LONG > 32
+#    ifdef CONFIG_MODULES
+#      define PERCPU_DYNAMIC_RESERVE	(6 << PAGE_SHIFT)
+#    else
+#      define PERCPU_DYNAMIC_RESERVE	(4 << PAGE_SHIFT)
+#    endif
+#  else
+#    ifdef CONFIG_MODULES
+#      define PERCPU_DYNAMIC_RESERVE	(4 << PAGE_SHIFT)
+#    else
+#      define PERCPU_DYNAMIC_RESERVE	(2 << PAGE_SHIFT)
+#    endif
+#  endif
+#endif	/* PERCPU_DYNAMIC_RESERVE */
+
+extern void *pcpu_base_addr;
+
+typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
+typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
+
+extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
+					size_t static_size, size_t unit_size,
+					size_t free_size, void *base_addr,
+					pcpu_populate_pte_fn_t populate_pte_fn);
+
+/*
+ * Use this to get to a cpu's version of the per-cpu object
+ * dynamically allocated.  Non-atomic access to the current CPU's
+ * version should probably be combined with get_cpu()/put_cpu().
+ */
+#define per_cpu_ptr(ptr, cpu)	SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
+
+#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
 struct percpu_data {
 	void *ptrs[1];
 };
 
 #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
-/*
- * Use this to get to a cpu's version of the per-cpu object dynamically
- * allocated. Non-atomic access to the current CPU's version should
- * probably be combined with get_cpu()/put_cpu().
- */
-#define percpu_ptr(ptr, cpu)					\
+
+#define per_cpu_ptr(ptr, cpu)					\
 ({								\
 	struct percpu_data *__p = __percpu_disguise(ptr);	\
 	(__typeof__(ptr))__p->ptrs[(cpu)];			\
 })
 
-extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask);
-extern void percpu_free(void *__pdata);
+#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
+extern void *__alloc_percpu(size_t size, size_t align);
+extern void free_percpu(void *__pdata);
 
 #else /* CONFIG_SMP */
 
-#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
+#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
 
-static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+static inline void *__alloc_percpu(size_t size, size_t align)
 {
-	return kzalloc(size, gfp);
+	/*
+	 * Can't easily make larger alignment work with kmalloc.  WARN
+	 * on it.  Larger alignment should only be used for module
+	 * percpu sections on SMP for which this path isn't used.
+	 */
+	WARN_ON_ONCE(align > __alignof__(unsigned long long));
+	return kzalloc(size, GFP_KERNEL);
 }
 
-static inline void percpu_free(void *__pdata)
+static inline void free_percpu(void *p)
 {
-	kfree(__pdata);
+	kfree(p);
 }
 
 #endif /* CONFIG_SMP */
 
-#define percpu_alloc_mask(size, gfp, mask) \
-	__percpu_alloc_mask((size), (gfp), &(mask))
-
-#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map)
-
-/* (legacy) interface for use without CPU hotplug handling */
-#define __alloc_percpu(size)	percpu_alloc_mask((size), GFP_KERNEL, \
-						  cpu_possible_map)
-#define alloc_percpu(type)	(type *)__alloc_percpu(sizeof(type))
-#define free_percpu(ptr)	percpu_free((ptr))
-#define per_cpu_ptr(ptr, cpu)	percpu_ptr((ptr), (cpu))
+#define alloc_percpu(type)	(type *)__alloc_percpu(sizeof(type), \
+						       __alignof__(type))
 
 #endif /* __LINUX_PERCPU_H */

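For reference, a hedged sketch of how a caller is meant to use the surviving dynamic-percpu interface above (alloc_percpu(), per_cpu_ptr(), free_percpu()); this is not from the commit, it compiles only in kernel context, and 'struct foo_stats' and the foo_* functions are invented for illustration:

	#include <linux/percpu.h>
	#include <linux/cpumask.h>
	#include <linux/errno.h>

	struct foo_stats {
		unsigned long packets;
		unsigned long bytes;
	};

	static struct foo_stats *foo_stats;	/* one instance per possible CPU */

	static int foo_init(void)
	{
		/* zeroed, aligned to __alignof__(struct foo_stats) */
		foo_stats = alloc_percpu(struct foo_stats);
		if (!foo_stats)
			return -ENOMEM;
		return 0;
	}

	static unsigned long foo_total_packets(void)
	{
		unsigned long sum = 0;
		int cpu;

		for_each_possible_cpu(cpu)
			sum += per_cpu_ptr(foo_stats, cpu)->packets;
		return sum;
	}

	static void foo_exit(void)
	{
		free_percpu(foo_stats);
	}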

@ -95,6 +95,9 @@ extern struct vm_struct *remove_vm_area(const void *addr);
extern int map_vm_area(struct vm_struct *area, pgprot_t prot, extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
struct page ***pages); struct page ***pages);
extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
pgprot_t prot, struct page **pages);
extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
extern void unmap_kernel_range(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size);
/* Allocate/destroy a 'vmalloc' VM area. */ /* Allocate/destroy a 'vmalloc' VM area. */
@ -110,5 +113,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count);
*/ */
extern rwlock_t vmlist_lock; extern rwlock_t vmlist_lock;
extern struct vm_struct *vmlist; extern struct vm_struct *vmlist;
extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
#endif /* _LINUX_VMALLOC_H */ #endif /* _LINUX_VMALLOC_H */


@ -51,6 +51,7 @@
#include <linux/tracepoint.h> #include <linux/tracepoint.h>
#include <linux/ftrace.h> #include <linux/ftrace.h>
#include <linux/async.h> #include <linux/async.h>
#include <linux/percpu.h>
#if 0 #if 0
#define DEBUGP printk #define DEBUGP printk
@ -366,6 +367,34 @@ static struct module *find_module(const char *name)
} }
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
static void *percpu_modalloc(unsigned long size, unsigned long align,
const char *name)
{
void *ptr;
if (align > PAGE_SIZE) {
printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
name, align, PAGE_SIZE);
align = PAGE_SIZE;
}
ptr = __alloc_percpu(size, align);
if (!ptr)
printk(KERN_WARNING
"Could not allocate %lu bytes percpu data\n", size);
return ptr;
}
static void percpu_modfree(void *freeme)
{
free_percpu(freeme);
}
#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
/* Number of blocks used and allocated. */ /* Number of blocks used and allocated. */
static unsigned int pcpu_num_used, pcpu_num_allocated; static unsigned int pcpu_num_used, pcpu_num_allocated;
/* Size of each block. -ve means used. */ /* Size of each block. -ve means used. */
@ -480,21 +509,6 @@ static void percpu_modfree(void *freeme)
} }
} }
static unsigned int find_pcpusec(Elf_Ehdr *hdr,
Elf_Shdr *sechdrs,
const char *secstrings)
{
return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
}
static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
{
int cpu;
for_each_possible_cpu(cpu)
memcpy(pcpudest + per_cpu_offset(cpu), from, size);
}
static int percpu_modinit(void) static int percpu_modinit(void)
{ {
pcpu_num_used = 2; pcpu_num_used = 2;
@ -513,7 +527,26 @@ static int percpu_modinit(void)
return 0; return 0;
} }
__initcall(percpu_modinit); __initcall(percpu_modinit);
#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
static unsigned int find_pcpusec(Elf_Ehdr *hdr,
Elf_Shdr *sechdrs,
const char *secstrings)
{
return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
}
static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
{
int cpu;
for_each_possible_cpu(cpu)
memcpy(pcpudest + per_cpu_offset(cpu), from, size);
}
#else /* ... !CONFIG_SMP */ #else /* ... !CONFIG_SMP */
static inline void *percpu_modalloc(unsigned long size, unsigned long align, static inline void *percpu_modalloc(unsigned long size, unsigned long align,
const char *name) const char *name)
{ {
@ -535,6 +568,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
/* pcpusec should be 0, and size of that section should be 0. */ /* pcpusec should be 0, and size of that section should be 0. */
BUG_ON(size != 0); BUG_ON(size != 0);
} }
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
#define MODINFO_ATTR(field) \ #define MODINFO_ATTR(field) \


@ -9476,7 +9476,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
{ {
-	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 data; u64 data;
#ifndef CONFIG_64BIT #ifndef CONFIG_64BIT
@ -9495,7 +9495,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
{ {
-	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
#ifndef CONFIG_64BIT #ifndef CONFIG_64BIT
/* /*
@ -9591,7 +9591,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
ca = task_ca(tsk); ca = task_ca(tsk);
for (; ca; ca = ca->parent) { for (; ca; ca = ca->parent) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
*cpuusage += cputime; *cpuusage += cputime;
} }
} }


@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
* doesn't hit this CPU until we're ready. */ * doesn't hit this CPU until we're ready. */
get_cpu(); get_cpu();
for_each_online_cpu(i) { for_each_online_cpu(i) {
-		sm_work = percpu_ptr(stop_machine_work, i);
+		sm_work = per_cpu_ptr(stop_machine_work, i);
INIT_WORK(sm_work, stop_cpu); INIT_WORK(sm_work, stop_cpu);
queue_work_on(i, stop_machine_wq, sm_work); queue_work_on(i, stop_machine_wq, sm_work);
} }


@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_MIGRATION) += migrate.o
ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
obj-$(CONFIG_SMP) += percpu.o
else
obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_SMP) += allocpercpu.o
endif
obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o


@@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
 	__percpu_populate_mask((__pdata), (size), (gfp), &(mask))
 
 /**
- * percpu_alloc_mask - initial setup of per-cpu data
+ * alloc_percpu - initial setup of per-cpu data
  * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
- * @mask: populate per-data for cpu's selected through mask bits
+ * @align: alignment
  *
- * Populating per-cpu data for all online cpu's would be a typical use case,
- * which is simplified by the percpu_alloc() wrapper.
- * Per-cpu objects are populated with zeroed buffers.
+ * Allocate dynamic percpu area.  Percpu objects are populated with
+ * zeroed buffers.
  */
-void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+void *__alloc_percpu(size_t size, size_t align)
 {
 	/*
 	 * We allocate whole cache lines to avoid false sharing
 	 */
 	size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
-	void *pdata = kzalloc(sz, gfp);
+	void *pdata = kzalloc(sz, GFP_KERNEL);
 	void *__pdata = __percpu_disguise(pdata);
 
+	/*
+	 * Can't easily make larger alignment work with kmalloc.  WARN
+	 * on it.  Larger alignment should only be used for module
+	 * percpu sections on SMP for which this path isn't used.
+	 */
+	WARN_ON_ONCE(align > __alignof__(unsigned long long));
+
 	if (unlikely(!pdata))
 		return NULL;
-	if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
+	if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL,
+					   &cpu_possible_map)))
 		return __pdata;
 	kfree(pdata);
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
+EXPORT_SYMBOL_GPL(__alloc_percpu);
 
 /**
- * percpu_free - final cleanup of per-cpu data
+ * free_percpu - final cleanup of per-cpu data
  * @__pdata: object to clean up
  *
  * We simply clean up any per-cpu object left. No need for the client to
 * track and specify through a bitmask which per-cpu objects are to free.
  */
-void percpu_free(void *__pdata)
+void free_percpu(void *__pdata)
 {
 	if (unlikely(!__pdata))
 		return;
 	__percpu_depopulate_mask(__pdata, &cpu_possible_map);
 	kfree(__percpu_disguise(__pdata));
 }
-EXPORT_SYMBOL_GPL(percpu_free);
+EXPORT_SYMBOL_GPL(free_percpu);

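The legacy allocator above hides the real percpu_data pointer from callers by bitwise-inverting it (__percpu_disguise()), so the returned handle is never dereferenced directly; per_cpu_ptr() inverts it again and indexes the per-cpu pointer array. A tiny user-space sketch of that round trip (not part of the commit; the struct layout mirrors allocpercpu.c, everything else is illustrative and cleanup is omitted):

	#include <stdio.h>
	#include <stdlib.h>

	struct percpu_data {
		void *ptrs[1];		/* really nr_cpu_ids entries */
	};

	#define NR_CPUS_DEMO 4

	int main(void)
	{
		/* one backing object per "cpu", pointer array as in allocpercpu.c */
		struct percpu_data *pd =
			calloc(1, sizeof(*pd) + (NR_CPUS_DEMO - 1) * sizeof(void *));
		for (int cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
			pd->ptrs[cpu] = calloc(1, sizeof(long));

		/* __percpu_disguise(): hand out the inverted pointer as the handle */
		void *handle = (void *)~(unsigned long)pd;

		/* per_cpu_ptr(): invert again to recover the array, then index by cpu */
		struct percpu_data *p = (struct percpu_data *)~(unsigned long)handle;
		*(long *)p->ptrs[2] = 42;

		printf("cpu2 value: %ld\n", *(long *)pd->ptrs[2]);
		return 0;
	}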

@ -37,6 +37,16 @@ static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
static int bootmem_debug; static int bootmem_debug;
/*
* If an arch needs to apply workarounds to bootmem allocation, it can
* set CONFIG_HAVE_ARCH_BOOTMEM and define a wrapper around
* __alloc_bootmem_core().
*/
#ifndef CONFIG_HAVE_ARCH_BOOTMEM
#define alloc_bootmem_core(bdata, size, align, goal, limit) \
__alloc_bootmem_core((bdata), (size), (align), (goal), (limit))
#endif
static int __init bootmem_debug_setup(char *buf) static int __init bootmem_debug_setup(char *buf)
{ {
bootmem_debug = 1; bootmem_debug = 1;
@ -382,7 +392,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
} }
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
/** /**
* reserve_bootmem - mark a page range as usable * reserve_bootmem - mark a page range as usable
* @addr: starting address of the range * @addr: starting address of the range
@ -403,7 +412,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
return mark_bootmem(start, end, 1, flags); return mark_bootmem(start, end, 1, flags);
} }
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
unsigned long step) unsigned long step)
@ -428,7 +436,7 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
return ALIGN(base + off, align) - base; return ALIGN(base + off, align) - base;
} }
-static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
+static void * __init __alloc_bootmem_core(struct bootmem_data *bdata,
unsigned long size, unsigned long align, unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit) unsigned long goal, unsigned long limit)
{ {

mm/percpu.c (new file, 979 lines)

@@ -0,0 +1,979 @@
/*
* linux/mm/percpu.c - percpu memory allocator
*
* Copyright (C) 2009 SUSE Linux Products GmbH
* Copyright (C) 2009 Tejun Heo <tj@kernel.org>
*
* This file is released under the GPLv2.
*
* This is a percpu allocator which can handle both static and dynamic
* areas. Percpu areas are allocated in chunks in vmalloc area. Each
* chunk consists of num_possible_cpus() units and the first chunk
* is used for static percpu variables in the kernel image (special
* boot time alloc/init handling necessary as these areas need to be
* brought up before allocation services are running). Unit grows as
* necessary and all units grow or shrink in unison. When a chunk is
* filled up, another chunk is allocated, i.e. in vmalloc area
*
*  c0                           c1                         c2
*  -------------------          -------------------        ------------
* | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
*  -------------------  ......  -------------------  ....  ------------
*
* Allocation is done in offset-size areas of single unit space. Ie,
* an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
* c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
* percpu base registers UNIT_SIZE apart.
*
* There are usually many small percpu allocations, many of them as
* small as 4 bytes. The allocator organizes chunks into lists
* according to free size and tries to allocate from the fullest one.
* Each chunk keeps the maximum contiguous area size hint which is
* guaranteed to be equal to or larger than the maximum contiguous
* area in the chunk. This helps the allocator not to iterate the
* chunk maps unnecessarily.
*
* Allocation state in each chunk is kept using an array of integers
* on chunk->map. A positive value in the map represents a free
* region and negative allocated. Allocation inside a chunk is done
* by scanning this map sequentially and serving the first matching
* entry. This is mostly copied from the percpu_modalloc() allocator.
* Chunks are also linked into a rb tree to ease address to chunk
* mapping during free.
*
* To use this allocator, arch code should do the following.
*
* - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
*
* - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
* regular address to percpu pointer and back
*
* - use pcpu_setup_first_chunk() during percpu area initialization to
* setup the first chunk containing the kernel static percpu area
*/
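/*
 * [Editor's illustration, not part of the file; sizes are invented.]
 *
 * The chunk/unit layout described above makes percpu address
 * computation purely arithmetic: cpu C's copy of an object allocated
 * at offset OFF inside a chunk lives at
 *
 *	chunk->vm->addr + C * (pcpu_unit_pages << PAGE_SHIFT) + OFF
 *
 * e.g. with 16KB units, cpu2's copy of an allocation at offset 0x40
 * sits 2 * 16384 + 0x40 = 0x8040 bytes past the chunk base.  This is
 * exactly what pcpu_page_idx()/pcpu_chunk_addr() below compute at
 * page granularity.
 */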
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
struct pcpu_chunk {
struct list_head list; /* linked to pcpu_slot lists */
struct rb_node rb_node; /* key is chunk->vm->addr */
int free_size; /* free bytes in the chunk */
int contig_hint; /* max contiguous size hint */
struct vm_struct *vm; /* mapped vmalloc region */
int map_used; /* # of map entries used */
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
bool immutable; /* no [de]population allowed */
struct page *page[]; /* #cpus * UNIT_PAGES */
};
static int pcpu_unit_pages __read_mostly;
static int pcpu_unit_size __read_mostly;
static int pcpu_chunk_size __read_mostly;
static int pcpu_nr_slots __read_mostly;
static size_t pcpu_chunk_struct_size __read_mostly;
/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);
/* the size of kernel static area */
static int pcpu_static_size __read_mostly;
/*
* One mutex to rule them all.
*
* The following mutex is grabbed in the outermost public alloc/free
* interface functions and released only when the operation is
* complete. As such, every function in this file other than the
* outermost functions are called under pcpu_mutex.
*
* It can easily be switched to use spinlock such that only the area
* allocation and page population commit are protected with it doing
* actual [de]allocation without holding any lock. However, given
* what this allocator does, I think it's better to let them run
* sequentially.
*/
static DEFINE_MUTEX(pcpu_mutex);
static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
static int __pcpu_size_to_slot(int size)
{
int highbit = fls(size); /* size is in bytes */
return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}
static int pcpu_size_to_slot(int size)
{
if (size == pcpu_unit_size)
return pcpu_nr_slots - 1;
return __pcpu_size_to_slot(size);
}
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
return 0;
return pcpu_size_to_slot(chunk->free_size);
}
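/*
 * [Editor's illustration, not part of the file.]
 *
 * Slot mapping example: a chunk with 3000 bytes free has
 * fls(3000) = 12, so it lands in slot max(12 - 5 + 2, 1) = 9.
 * A completely unused chunk (free_size == pcpu_unit_size) always goes
 * to the last slot, and chunks with less than sizeof(int) bytes of
 * free or contiguous space fall back to slot 0.
 */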
static int pcpu_page_idx(unsigned int cpu, int page_idx)
{
return cpu * pcpu_unit_pages + page_idx;
}
static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
{
return &chunk->page[pcpu_page_idx(cpu, page_idx)];
}
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
{
return (unsigned long)chunk->vm->addr +
(pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
}
static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
int page_idx)
{
return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
}
/**
* pcpu_realloc - versatile realloc
* @p: the current pointer (can be NULL for new allocations)
* @size: the current size in bytes (can be 0 for new allocations)
* @new_size: the wanted new size in bytes (can be 0 for free)
*
* More robust realloc which can be used to allocate, resize or free a
* memory area of arbitrary size. If the needed size goes over
* PAGE_SIZE, kernel VM is used.
*
* RETURNS:
* The new pointer on success, NULL on failure.
*/
static void *pcpu_realloc(void *p, size_t size, size_t new_size)
{
void *new;
if (new_size <= PAGE_SIZE)
new = kmalloc(new_size, GFP_KERNEL);
else
new = vmalloc(new_size);
if (new_size && !new)
return NULL;
memcpy(new, p, min(size, new_size));
if (new_size > size)
memset(new + size, 0, new_size - size);
if (size <= PAGE_SIZE)
kfree(p);
else
vfree(p);
return new;
}
/**
* pcpu_chunk_relocate - put chunk in the appropriate chunk slot
* @chunk: chunk of interest
* @oslot: the previous slot it was on
*
* This function is called after an allocation or free changed @chunk.
* New slot according to the changed state is determined and @chunk is
* moved to the slot.
*/
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
int nslot = pcpu_chunk_slot(chunk);
if (oslot != nslot) {
if (oslot < nslot)
list_move(&chunk->list, &pcpu_slot[nslot]);
else
list_move_tail(&chunk->list, &pcpu_slot[nslot]);
}
}
static struct rb_node **pcpu_chunk_rb_search(void *addr,
struct rb_node **parentp)
{
struct rb_node **p = &pcpu_addr_root.rb_node;
struct rb_node *parent = NULL;
struct pcpu_chunk *chunk;
while (*p) {
parent = *p;
chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
if (addr < chunk->vm->addr)
p = &(*p)->rb_left;
else if (addr > chunk->vm->addr)
p = &(*p)->rb_right;
else
break;
}
if (parentp)
*parentp = parent;
return p;
}
/**
* pcpu_chunk_addr_search - search for chunk containing specified address
* @addr: address to search for
*
* Look for chunk which might contain @addr. More specifically, it
* searches for the chunk with the highest start address which isn't
* beyond @addr.
*
* RETURNS:
* The address of the found chunk.
*/
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
struct rb_node *n, *parent;
struct pcpu_chunk *chunk;
n = *pcpu_chunk_rb_search(addr, &parent);
if (!n) {
/* no exactly matching chunk, the parent is the closest */
n = parent;
BUG_ON(!n);
}
chunk = rb_entry(n, struct pcpu_chunk, rb_node);
if (addr < chunk->vm->addr) {
/* the parent was the next one, look for the previous one */
n = rb_prev(n);
BUG_ON(!n);
chunk = rb_entry(n, struct pcpu_chunk, rb_node);
}
return chunk;
}
/**
* pcpu_chunk_addr_insert - insert chunk into address rb tree
* @new: chunk to insert
*
* Insert @new into address rb tree.
*/
static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
{
struct rb_node **p, *parent;
p = pcpu_chunk_rb_search(new->vm->addr, &parent);
BUG_ON(*p);
rb_link_node(&new->rb_node, parent, p);
rb_insert_color(&new->rb_node, &pcpu_addr_root);
}
/**
* pcpu_split_block - split a map block
* @chunk: chunk of interest
* @i: index of map block to split
* @head: head size in bytes (can be 0)
* @tail: tail size in bytes (can be 0)
*
* Split the @i'th map block into two or three blocks. If @head is
* non-zero, @head bytes block is inserted before block @i moving it
* to @i+1 and reducing its size by @head bytes.
*
* If @tail is non-zero, the target block, which can be @i or @i+1
* depending on @head, is reduced by @tail bytes and @tail byte block
* is inserted after the target block.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
{
int nr_extra = !!head + !!tail;
int target = chunk->map_used + nr_extra;
/* reallocation required? */
if (chunk->map_alloc < target) {
int new_alloc = chunk->map_alloc;
int *new;
while (new_alloc < target)
new_alloc *= 2;
new = pcpu_realloc(chunk->map,
chunk->map_alloc * sizeof(new[0]),
new_alloc * sizeof(new[0]));
if (!new)
return -ENOMEM;
chunk->map_alloc = new_alloc;
chunk->map = new;
}
/* insert a new subblock */
memmove(&chunk->map[i + nr_extra], &chunk->map[i],
sizeof(chunk->map[0]) * (chunk->map_used - i));
chunk->map_used += nr_extra;
if (head) {
chunk->map[i + 1] = chunk->map[i] - head;
chunk->map[i++] = head;
}
if (tail) {
chunk->map[i++] -= tail;
chunk->map[i] = tail;
}
return 0;
}
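/*
 * [Editor's illustration, not part of the file.]
 *
 * Example of pcpu_split_block() on a chunk whose map is
 *
 *	map = { -1024, 4096 }		(1024 bytes used, 4096 free)
 *
 * Splitting block 1 with head = 64 and tail = 2048 yields
 *
 *	map = { -1024, 64, 1984, 2048 }
 *
 * i.e. a 64-byte alignment head, the 1984-byte region the caller is
 * about to hand out (4096 - 64 - 2048), and a 2048-byte tail that
 * stays free.  pcpu_alloc_area() then flips the middle entry's sign
 * to record the allocation.
 */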
/**
* pcpu_alloc_area - allocate area from a pcpu_chunk
* @chunk: chunk of interest
* @size: wanted size in bytes
* @align: wanted align
*
* Try to allocate @size bytes area aligned at @align from @chunk.
* Note that this function only allocates the offset. It doesn't
* populate or map the area.
*
* RETURNS:
* Allocated offset in @chunk on success, -errno on failure.
*/
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
{
int oslot = pcpu_chunk_slot(chunk);
int max_contig = 0;
int i, off;
/*
* The static chunk initially doesn't have map attached
* because kmalloc wasn't available during init. Give it one.
*/
if (unlikely(!chunk->map)) {
chunk->map = pcpu_realloc(NULL, 0,
PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
if (!chunk->map)
return -ENOMEM;
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
chunk->map[chunk->map_used++] = -pcpu_static_size;
if (chunk->free_size)
chunk->map[chunk->map_used++] = chunk->free_size;
}
for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
bool is_last = i + 1 == chunk->map_used;
int head, tail;
/* extra for alignment requirement */
head = ALIGN(off, align) - off;
BUG_ON(i == 0 && head != 0);
if (chunk->map[i] < 0)
continue;
if (chunk->map[i] < head + size) {
max_contig = max(chunk->map[i], max_contig);
continue;
}
/*
* If head is small or the previous block is free,
* merge'em. Note that 'small' is defined as smaller
* than sizeof(int), which is very small but isn't too
* uncommon for percpu allocations.
*/
if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
if (chunk->map[i - 1] > 0)
chunk->map[i - 1] += head;
else {
chunk->map[i - 1] -= head;
chunk->free_size -= head;
}
chunk->map[i] -= head;
off += head;
head = 0;
}
/* if tail is small, just keep it around */
tail = chunk->map[i] - head - size;
if (tail < sizeof(int))
tail = 0;
/* split if warranted */
if (head || tail) {
if (pcpu_split_block(chunk, i, head, tail))
return -ENOMEM;
if (head) {
i++;
off += head;
max_contig = max(chunk->map[i - 1], max_contig);
}
if (tail)
max_contig = max(chunk->map[i + 1], max_contig);
}
/* update hint and mark allocated */
if (is_last)
chunk->contig_hint = max_contig; /* fully scanned */
else
chunk->contig_hint = max(chunk->contig_hint,
max_contig);
chunk->free_size -= chunk->map[i];
chunk->map[i] = -chunk->map[i];
pcpu_chunk_relocate(chunk, oslot);
return off;
}
chunk->contig_hint = max_contig; /* fully scanned */
pcpu_chunk_relocate(chunk, oslot);
/*
* Tell the upper layer that this chunk has no area left.
* Note that this is not an error condition but a notification
* to upper layer that it needs to look at other chunks.
* -ENOSPC is chosen as it isn't used in memory subsystem and
* matches the meaning in a way.
*/
return -ENOSPC;
}
/**
* pcpu_free_area - free area to a pcpu_chunk
* @chunk: chunk of interest
* @freeme: offset of area to free
*
* Free area starting from @freeme to @chunk. Note that this function
* only modifies the allocation map. It doesn't depopulate or unmap
* the area.
*/
static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
{
int oslot = pcpu_chunk_slot(chunk);
int i, off;
for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
if (off == freeme)
break;
BUG_ON(off != freeme);
BUG_ON(chunk->map[i] > 0);
chunk->map[i] = -chunk->map[i];
chunk->free_size += chunk->map[i];
/* merge with previous? */
if (i > 0 && chunk->map[i - 1] >= 0) {
chunk->map[i - 1] += chunk->map[i];
chunk->map_used--;
memmove(&chunk->map[i], &chunk->map[i + 1],
(chunk->map_used - i) * sizeof(chunk->map[0]));
i--;
}
/* merge with next? */
if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
chunk->map[i] += chunk->map[i + 1];
chunk->map_used--;
memmove(&chunk->map[i + 1], &chunk->map[i + 2],
(chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
}
chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
pcpu_chunk_relocate(chunk, oslot);
}
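/*
 * Illustrative sketch (not part of this file): freeing in the same signed
 * map model as the sketch after pcpu_alloc_area().  The entry is flipped
 * back to positive and merged with free neighbours, mirroring the two
 * memmove() steps in pcpu_free_area().
 */
#include <stdio.h>
#include <string.h>

static int demo_map[8] = { 64, -32, 16 };
static int demo_used = 3;

static void demo_free(int idx)
{
	demo_map[idx] = -demo_map[idx];		/* mark the block free */

	/* merge with the previous block if it is free */
	if (idx > 0 && demo_map[idx - 1] >= 0) {
		demo_map[idx - 1] += demo_map[idx];
		demo_used--;
		memmove(&demo_map[idx], &demo_map[idx + 1],
			(demo_used - idx) * sizeof(demo_map[0]));
		idx--;
	}
	/* merge with the next block if it is free */
	if (idx + 1 < demo_used && demo_map[idx + 1] >= 0) {
		demo_map[idx] += demo_map[idx + 1];
		demo_used--;
		memmove(&demo_map[idx + 1], &demo_map[idx + 2],
			(demo_used - (idx + 1)) * sizeof(demo_map[0]));
	}
}

int main(void)
{
	int i;

	demo_free(1);	/* { 64, -32, 16 } collapses to { 112 } */
	for (i = 0; i < demo_used; i++)
		printf("%d ", demo_map[i]);
	printf("\n");
	return 0;
}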
/**
* pcpu_unmap - unmap pages out of a pcpu_chunk
* @chunk: chunk of interest
* @page_start: page index of the first page to unmap
* @page_end: page index of the last page to unmap + 1
* @flush: whether to flush cache and tlb or not
*
* For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
* If @flush is true, vcache is flushed before unmapping and tlb
* after.
*/
static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
bool flush)
{
unsigned int last = num_possible_cpus() - 1;
unsigned int cpu;
/* unmap must not be done on immutable chunk */
WARN_ON(chunk->immutable);
/*
* Each flushing trial can be very expensive, issue flush on
* the whole region at once rather than doing it for each cpu.
* This could be overkill but is more scalable.
*/
if (flush)
flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
pcpu_chunk_addr(chunk, last, page_end));
for_each_possible_cpu(cpu)
unmap_kernel_range_noflush(
pcpu_chunk_addr(chunk, cpu, page_start),
(page_end - page_start) << PAGE_SHIFT);
/* ditto as flush_cache_vunmap() */
if (flush)
flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
pcpu_chunk_addr(chunk, last, page_end));
}
/**
* pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
* @chunk: chunk to depopulate
* @off: offset to the area to depopulate
* @size: size of the area to depopulate in bytes
* @flush: whether to flush cache and tlb or not
*
* For each cpu, depopulate and unmap pages [@page_start,@page_end)
* from @chunk. If @flush is true, vcache is flushed before unmapping
* and tlb after.
*/
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
bool flush)
{
int page_start = PFN_DOWN(off);
int page_end = PFN_UP(off + size);
int unmap_start = -1;
int uninitialized_var(unmap_end);
unsigned int cpu;
int i;
for (i = page_start; i < page_end; i++) {
for_each_possible_cpu(cpu) {
struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
if (!*pagep)
continue;
__free_page(*pagep);
/*
* If it's partial depopulation, it might get
* populated or depopulated again. Mark the
* page gone.
*/
*pagep = NULL;
unmap_start = unmap_start < 0 ? i : unmap_start;
unmap_end = i + 1;
}
}
if (unmap_start >= 0)
pcpu_unmap(chunk, unmap_start, unmap_end, flush);
}
/**
* pcpu_map - map pages into a pcpu_chunk
* @chunk: chunk of interest
* @page_start: page index of the first page to map
* @page_end: page index of the last page to map + 1
*
* For each cpu, map pages [@page_start,@page_end) into @chunk.
* vcache is flushed afterwards.
*/
static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
{
unsigned int last = num_possible_cpus() - 1;
unsigned int cpu;
int err;
/* map must not be done on immutable chunk */
WARN_ON(chunk->immutable);
for_each_possible_cpu(cpu) {
err = map_kernel_range_noflush(
pcpu_chunk_addr(chunk, cpu, page_start),
(page_end - page_start) << PAGE_SHIFT,
PAGE_KERNEL,
pcpu_chunk_pagep(chunk, cpu, page_start));
if (err < 0)
return err;
}
/* flush at once, please read comments in pcpu_unmap() */
flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
pcpu_chunk_addr(chunk, last, page_end));
return 0;
}
/**
* pcpu_populate_chunk - populate and map an area of a pcpu_chunk
* @chunk: chunk of interest
* @off: offset to the area to populate
* @size: size of the area to populate in bytes
*
* For each cpu, populate and map pages [@page_start,@page_end) into
* @chunk. The area is cleared on return.
*/
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
{
const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
int page_start = PFN_DOWN(off);
int page_end = PFN_UP(off + size);
int map_start = -1;
int map_end;
unsigned int cpu;
int i;
for (i = page_start; i < page_end; i++) {
if (pcpu_chunk_page_occupied(chunk, i)) {
if (map_start >= 0) {
if (pcpu_map(chunk, map_start, map_end))
goto err;
map_start = -1;
}
continue;
}
map_start = map_start < 0 ? i : map_start;
map_end = i + 1;
for_each_possible_cpu(cpu) {
struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
*pagep = alloc_pages_node(cpu_to_node(cpu),
alloc_mask, 0);
if (!*pagep)
goto err;
}
}
if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
goto err;
for_each_possible_cpu(cpu)
memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
size);
return 0;
err:
/* likely under heavy memory pressure, give memory back */
pcpu_depopulate_chunk(chunk, off, size, true);
return -ENOMEM;
}
static void free_pcpu_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
if (chunk->vm)
free_vm_area(chunk->vm);
pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0);
kfree(chunk);
}
static struct pcpu_chunk *alloc_pcpu_chunk(void)
{
struct pcpu_chunk *chunk;
chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
if (!chunk)
return NULL;
chunk->map = pcpu_realloc(NULL, 0,
PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
if (!chunk->map) {
kfree(chunk);
return NULL;
}
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
chunk->map[chunk->map_used++] = pcpu_unit_size;
chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
if (!chunk->vm) {
free_pcpu_chunk(chunk);
return NULL;
}
INIT_LIST_HEAD(&chunk->list);
chunk->free_size = pcpu_unit_size;
chunk->contig_hint = pcpu_unit_size;
return chunk;
}
/**
* __alloc_percpu - allocate percpu area
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
* Allocate percpu area of @size bytes aligned at @align. Might
* sleep. Might trigger writeouts.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
void *__alloc_percpu(size_t size, size_t align)
{
void *ptr = NULL;
struct pcpu_chunk *chunk;
int slot, off;
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
WARN(true, "illegal size (%zu) or align (%zu) for "
"percpu allocation\n", size, align);
return NULL;
}
mutex_lock(&pcpu_mutex);
/* allocate area */
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
if (size > chunk->contig_hint)
continue;
off = pcpu_alloc_area(chunk, size, align);
if (off >= 0)
goto area_found;
if (off != -ENOSPC)
goto out_unlock;
}
}
/* hmmm... no space left, create a new chunk */
chunk = alloc_pcpu_chunk();
if (!chunk)
goto out_unlock;
pcpu_chunk_relocate(chunk, -1);
pcpu_chunk_addr_insert(chunk);
off = pcpu_alloc_area(chunk, size, align);
if (off < 0)
goto out_unlock;
area_found:
/* populate, map and clear the area */
if (pcpu_populate_chunk(chunk, off, size)) {
pcpu_free_area(chunk, off);
goto out_unlock;
}
ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
out_unlock:
mutex_unlock(&pcpu_mutex);
return ptr;
}
EXPORT_SYMBOL_GPL(__alloc_percpu);
static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
{
WARN_ON(chunk->immutable);
pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
list_del(&chunk->list);
rb_erase(&chunk->rb_node, &pcpu_addr_root);
free_pcpu_chunk(chunk);
}
/**
* free_percpu - free percpu area
* @ptr: pointer to area to free
*
* Free percpu area @ptr. Might sleep.
*/
void free_percpu(void *ptr)
{
void *addr = __pcpu_ptr_to_addr(ptr);
struct pcpu_chunk *chunk;
int off;
if (!ptr)
return;
mutex_lock(&pcpu_mutex);
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->vm->addr;
pcpu_free_area(chunk, off);
/* the chunk became fully free, kill one if there are other free ones */
if (chunk->free_size == pcpu_unit_size) {
struct pcpu_chunk *pos;
list_for_each_entry(pos,
&pcpu_slot[pcpu_chunk_slot(chunk)], list)
if (pos != chunk) {
pcpu_kill_chunk(pos);
break;
}
}
mutex_unlock(&pcpu_mutex);
}
EXPORT_SYMBOL_GPL(free_percpu);
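/*
 * Illustrative usage sketch (not part of this file): a module that
 * allocates one unsigned long per possible cpu through the interface
 * above and frees it on exit.  The demo_* identifiers are hypothetical.
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/percpu.h>

static unsigned long *demo_counters;

static int __init pcpu_demo_init(void)
{
	unsigned int cpu;

	demo_counters = __alloc_percpu(sizeof(unsigned long),
				       __alignof__(unsigned long));
	if (!demo_counters)
		return -ENOMEM;

	/* each cpu gets its own copy; per_cpu_ptr() picks the right one */
	for_each_possible_cpu(cpu)
		*per_cpu_ptr(demo_counters, cpu) = 0;
	return 0;
}

static void __exit pcpu_demo_exit(void)
{
	free_percpu(demo_counters);
}

module_init(pcpu_demo_init);
module_exit(pcpu_demo_exit);
MODULE_LICENSE("GPL");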
/**
* pcpu_setup_first_chunk - initialize the first percpu chunk
* @get_page_fn: callback to fetch page pointer
* @static_size: the size of static percpu area in bytes
* @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto
* @free_size: free size in bytes, 0 for auto
* @base_addr: mapped address, NULL for auto
* @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
*
* Initialize the first percpu chunk which contains the kernel static
* percpu area. This function is to be called from the arch percpu area
* setup path. The first two parameters are mandatory. The rest are
* optional.
*
* @get_page_fn() should return pointer to percpu page given cpu
* number and page number. It should at least return enough pages to
* cover the static area. The returned pages for static area should
* have been initialized with valid data. If @unit_size is specified,
* it can also return pages after the static area. A NULL return
* indicates the end of pages for the cpu. Note that @get_page_fn() must
* return the same number of pages for all cpus.
*
* @unit_size, if non-zero, determines unit size and must be aligned
* to PAGE_SIZE and equal to or larger than @static_size + @free_size.
*
* @free_size determines the number of free bytes after the static
* area in the first chunk. If zero, whatever is left is available.
* Specifying a non-zero value makes percpu leave the area after
* @static_size + @free_size alone.
*
* Non-null @base_addr means that the caller has already allocated the
* virtual region for the first chunk and mapped it. percpu must not mess
* with the chunk. Note that @base_addr with 0 @unit_size or non-NULL
* @populate_pte_fn doesn't make any sense.
*
* @populate_pte_fn is used to populate the pagetable. NULL means the
* caller already populated the pagetable.
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access.
*/
size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
size_t static_size, size_t unit_size,
size_t free_size, void *base_addr,
pcpu_populate_pte_fn_t populate_pte_fn)
{
static struct vm_struct static_vm;
struct pcpu_chunk *static_chunk;
unsigned int cpu;
int nr_pages;
int err, i;
/* sanity checks */
BUG_ON(!static_size);
BUG_ON(!unit_size && free_size);
BUG_ON(unit_size && unit_size < static_size + free_size);
BUG_ON(unit_size & ~PAGE_MASK);
BUG_ON(base_addr && !unit_size);
BUG_ON(base_addr && populate_pte_fn);
if (unit_size)
pcpu_unit_pages = unit_size >> PAGE_SHIFT;
else
pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
PFN_UP(static_size));
pcpu_static_size = static_size;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
/*
* Allocate chunk slots. The additional last slot is for
* empty chunks.
*/
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_slot[i]);
/* init static_chunk */
static_chunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&static_chunk->list);
static_chunk->vm = &static_vm;
if (free_size)
static_chunk->free_size = free_size;
else
static_chunk->free_size = pcpu_unit_size - pcpu_static_size;
static_chunk->contig_hint = static_chunk->free_size;
/* allocate vm address */
static_vm.flags = VM_ALLOC;
static_vm.size = pcpu_chunk_size;
if (!base_addr)
vm_area_register_early(&static_vm, PAGE_SIZE);
else {
/*
* Pages already mapped. No need to remap into
* vmalloc area. In this case the static chunk can't
* be mapped or unmapped by percpu and is marked
* immutable.
*/
static_vm.addr = base_addr;
static_chunk->immutable = true;
}
/* assign pages */
nr_pages = -1;
for_each_possible_cpu(cpu) {
for (i = 0; i < pcpu_unit_pages; i++) {
struct page *page = get_page_fn(cpu, i);
if (!page)
break;
*pcpu_chunk_pagep(static_chunk, cpu, i) = page;
}
BUG_ON(i < PFN_UP(pcpu_static_size));
if (nr_pages < 0)
nr_pages = i;
else
BUG_ON(nr_pages != i);
}
/* map them */
if (populate_pte_fn) {
for_each_possible_cpu(cpu)
for (i = 0; i < nr_pages; i++)
populate_pte_fn(pcpu_chunk_addr(static_chunk,
cpu, i));
err = pcpu_map(static_chunk, 0, nr_pages);
if (err)
panic("failed to setup static percpu area, err=%d\n",
err);
}
/* link static_chunk in */
pcpu_chunk_relocate(static_chunk, -1);
pcpu_chunk_addr_insert(static_chunk);
/* we're done */
pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0);
return pcpu_unit_size;
}
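/*
 * Illustrative sketch (not part of this file): the rough shape of an arch
 * setup_per_cpu_areas() built on pcpu_setup_first_chunk(), with unit_size,
 * free_size and base_addr left to "auto".  All demo_* identifiers are
 * hypothetical; a real implementation must also use the returned unit size
 * to set up the per-cpu offsets and supply a proper arch pte populator.
 */
#include <linux/bootmem.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/string.h>
#include <asm/sections.h>

static struct page **demo_pages __initdata;
static int demo_nr_pages __initdata;

static struct page * __init demo_get_page(unsigned int cpu, int pageno)
{
	if (pageno >= demo_nr_pages)
		return NULL;
	return demo_pages[cpu * demo_nr_pages + pageno];
}

static void __init demo_populate_pte(unsigned long addr)
{
	/* arch-specific: make sure page tables exist for addr (not shown) */
}

void __init setup_per_cpu_areas(void)
{
	size_t static_size = __per_cpu_end - __per_cpu_start;
	unsigned int cpu;
	int i;

	demo_nr_pages = PFN_UP(static_size);
	demo_pages = alloc_bootmem(num_possible_cpus() * demo_nr_pages *
				   sizeof(demo_pages[0]));

	/* give each cpu its own copy of the static percpu section */
	for_each_possible_cpu(cpu)
		for (i = 0; i < demo_nr_pages; i++) {
			void *ptr = alloc_bootmem_pages(PAGE_SIZE);

			memcpy(ptr, __per_cpu_start + i * PAGE_SIZE,
			       min_t(size_t, PAGE_SIZE,
				     static_size - i * PAGE_SIZE));
			demo_pages[cpu * demo_nr_pages + i] = virt_to_page(ptr);
		}

	pcpu_setup_first_chunk(demo_get_page, static_size, 0, 0, NULL,
			       demo_populate_pte);
}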

View file

@@ -24,6 +24,7 @@
 #include <linux/radix-tree.h>
 #include <linux/rcupdate.h>
 #include <linux/bootmem.h>
+#include <linux/pfn.h>
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
@@ -152,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
  *
  * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
  */
-static int vmap_page_range(unsigned long start, unsigned long end,
-				pgprot_t prot, struct page **pages)
+static int vmap_page_range_noflush(unsigned long start, unsigned long end,
+				   pgprot_t prot, struct page **pages)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -169,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end,
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
-	flush_cache_vmap(start, end);
 
 	if (unlikely(err))
 		return err;
 	return nr;
 }
 
+static int vmap_page_range(unsigned long start, unsigned long end,
+			   pgprot_t prot, struct page **pages)
+{
+	int ret;
+
+	ret = vmap_page_range_noflush(start, end, prot, pages);
+	flush_cache_vmap(start, end);
+	return ret;
+}
+
 static inline int is_vmalloc_or_module_addr(const void *x)
 {
 	/*
@@ -982,6 +992,32 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
 }
 EXPORT_SYMBOL(vm_map_ram);
 
+/**
+ * vm_area_register_early - register vmap area early during boot
+ * @vm: vm_struct to register
+ * @align: requested alignment
+ *
+ * This function is used to register a kernel vm area before
+ * vmalloc_init() is called. @vm->size and @vm->flags should contain
+ * proper values on entry and other fields should be zero. On return,
+ * vm->addr contains the allocated address.
+ *
+ * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
+ */
+void __init vm_area_register_early(struct vm_struct *vm, size_t align)
+{
+	static size_t vm_init_off __initdata;
+	unsigned long addr;
+
+	addr = ALIGN(VMALLOC_START + vm_init_off, align);
+	vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
+
+	vm->addr = (void *)addr;
+
+	vm->next = vmlist;
+	vmlist = vm;
+}
+
 void __init vmalloc_init(void)
 {
 	struct vmap_area *va;
@@ -1009,6 +1045,58 @@ void __init vmalloc_init(void)
 	vmap_initialized = true;
 }
 
+/**
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * @addr: start of the VM area to map
+ * @size: size of the VM area to map
+ * @prot: page protection flags to use
+ * @pages: pages to map
+ *
+ * Map PFN_UP(@size) pages at @addr. The VM area that @addr and @size
+ * specify should have been allocated using get_vm_area() and its
+ * friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is
+ * responsible for calling flush_cache_vmap() on to-be-mapped areas
+ * before calling this function.
+ *
+ * RETURNS:
+ * The number of pages mapped on success, -errno on failure.
+ */
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
+			     pgprot_t prot, struct page **pages)
+{
+	return vmap_page_range_noflush(addr, addr + size, prot, pages);
+}
+
+/**
+ * unmap_kernel_range_noflush - unmap kernel VM area
+ * @addr: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Unmap PFN_UP(@size) pages at @addr. The VM area that @addr and @size
+ * specify should have been allocated using get_vm_area() and its
+ * friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is
+ * responsible for calling flush_cache_vunmap() on to-be-unmapped areas
+ * before calling this function and flush_tlb_kernel_range() after.
+ */
+void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
+{
+	vunmap_page_range(addr, addr + size);
+}
+
+/**
+ * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
+ * @addr: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Similar to unmap_kernel_range_noflush() but flushes vcache before
+ * the unmapping and tlb after.
+ */
 void unmap_kernel_range(unsigned long addr, unsigned long size)
 {
 	unsigned long end = addr + size;
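/*
 * Illustrative usage sketch (not part of this patch): reserving a boot-time
 * mapping window with vm_area_register_early() added above, assuming its
 * declaration is visible via linux/vmalloc.h.  The 16-page size and the
 * early_demo_* identifiers are hypothetical.
 */
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static struct vm_struct early_demo_vm;

void __init early_demo_reserve(void)
{
	/* only size and flags are set up front; addr is filled in on return */
	early_demo_vm.flags = VM_ALLOC;
	early_demo_vm.size = 16 << PAGE_SHIFT;
	vm_area_register_early(&early_demo_vm, PAGE_SIZE);

	/* early_demo_vm.addr now points at the reserved vmalloc-area range */
}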

View file

@@ -1375,10 +1375,10 @@ EXPORT_SYMBOL_GPL(snmp_fold_field);
 int snmp_mib_init(void *ptr[2], size_t mibsize)
 {
 	BUG_ON(ptr == NULL);
-	ptr[0] = __alloc_percpu(mibsize);
+	ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
 	if (!ptr[0])
 		goto err0;
-	ptr[1] = __alloc_percpu(mibsize);
+	ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
 	if (!ptr[1])
 		goto err1;
 	return 0;