diff --git a/drivers/base/node.c b/drivers/base/node.c index 5d4517ccc422..904b27caf697 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -87,7 +87,7 @@ static ssize_t node_read_numastat(struct sys_device * dev, char * buf) for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *z = &pg->node_zones[i]; for (cpu = 0; cpu < NR_CPUS; cpu++) { - struct per_cpu_pageset *ps = &z->pageset[cpu]; + struct per_cpu_pageset *ps = zone_pcp(z,cpu); numa_hit += ps->numa_hit; numa_miss += ps->numa_miss; numa_foreign += ps->numa_foreign; diff --git a/include/linux/mm.h b/include/linux/mm.h index 17518fe0b311..1813b162b0a8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -691,6 +691,12 @@ extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); +#ifdef CONFIG_NUMA +extern void setup_per_cpu_pageset(void); +#else +static inline void setup_per_cpu_pageset(void) {} +#endif + /* prio_tree.c */ void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 18fed8b67943..4733d35d8223 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -63,6 +63,12 @@ struct per_cpu_pageset { #endif } ____cacheline_aligned_in_smp; +#ifdef CONFIG_NUMA +#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)]) +#else +#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)]) +#endif + #define ZONE_DMA 0 #define ZONE_NORMAL 1 #define ZONE_HIGHMEM 2 @@ -122,8 +128,11 @@ struct zone { */ unsigned long lowmem_reserve[MAX_NR_ZONES]; +#ifdef CONFIG_NUMA + struct per_cpu_pageset *pageset[NR_CPUS]; +#else struct per_cpu_pageset pageset[NR_CPUS]; - +#endif /* * free areas of different sizes */ diff --git a/init/main.c b/init/main.c index 40bf367ffdf1..d324801729ba 100644 --- a/init/main.c +++ b/init/main.c @@ -490,6 +490,7 @@ asmlinkage void __init start_kernel(void) vfs_caches_init_early(); mem_init(); kmem_cache_init(); + setup_per_cpu_pageset(); numa_policy_init(); if (late_time_init) late_time_init(); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 08c41da429cf..39252c732db2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -721,7 +721,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK); page = __alloc_pages(gfp, order, zl); if (page && page_zone(page) == zl->zones[0]) { - zl->zones[0]->pageset[get_cpu()].interleave_hit++; + zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; put_cpu(); } return page; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2019c1b19254..95cbd30a67b8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -71,6 +71,11 @@ EXPORT_SYMBOL(nr_swap_pages); struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; EXPORT_SYMBOL(zone_table); +#ifdef CONFIG_NUMA +static struct per_cpu_pageset + pageset_table[MAX_NR_ZONES*MAX_NUMNODES*NR_CPUS] __initdata; +#endif + static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; int min_free_kbytes = 1024; @@ -520,7 +525,7 @@ static void __drain_pages(unsigned int cpu) for_each_zone(zone) { struct per_cpu_pageset *pset; - pset = &zone->pageset[cpu]; + pset = zone_pcp(zone, cpu); for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; @@ -583,12 +588,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) local_irq_save(flags); cpu = smp_processor_id(); - p = &z->pageset[cpu]; + p = zone_pcp(z,cpu); if (pg == orig) { - z->pageset[cpu].numa_hit++; + p->numa_hit++; } else { p->numa_miss++; - zonelist->zones[0]->pageset[cpu].numa_foreign++; + zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; } if (pg == NODE_DATA(numa_node_id())) p->local_node++; @@ -615,7 +620,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) if (PageAnon(page)) page->mapping = NULL; free_pages_check(__FUNCTION__, page); - pcp = &zone->pageset[get_cpu()].pcp[cold]; + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); if (pcp->count >= pcp->high) pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); @@ -659,7 +664,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags) if (order == 0) { struct per_cpu_pages *pcp; - pcp = &zone->pageset[get_cpu()].pcp[cold]; + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); if (pcp->count <= pcp->low) pcp->count += rmqueue_bulk(zone, 0, @@ -1262,7 +1267,7 @@ void show_free_areas(void) if (!cpu_possible(cpu)) continue; - pageset = zone->pageset + cpu; + pageset = zone_pcp(zone, cpu); for (temperature = 0; temperature < 2; temperature++) printk("cpu %d %s: low %d, high %d, batch %d\n", @@ -1645,6 +1650,157 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, memmap_init_zone((size), (nid), (zone), (start_pfn)) #endif +static int __devinit zone_batchsize(struct zone *zone) +{ + int batch; + + /* + * The per-cpu-pages pools are set to around 1000th of the + * size of the zone. But no more than 1/4 of a meg - there's + * no point in going beyond the size of L2 cache. + * + * OK, so we don't know how big the cache is. So guess. + */ + batch = zone->present_pages / 1024; + if (batch * PAGE_SIZE > 256 * 1024) + batch = (256 * 1024) / PAGE_SIZE; + batch /= 4; /* We effectively *= 4 below */ + if (batch < 1) + batch = 1; + + /* + * Clamp the batch to a 2^n - 1 value. Having a power + * of 2 value was found to be more likely to have + * suboptimal cache aliasing properties in some cases. + * + * For example if 2 tasks are alternately allocating + * batches of pages, one task can end up with a lot + * of pages of one half of the possible page colors + * and the other with pages of the other colors. + */ + batch = (1 << fls(batch + batch/2)) - 1; + return batch; +} + +#ifdef CONFIG_NUMA +/* + * Dynamicaly allocate memory for the + * per cpu pageset array in struct zone. + */ +static int __devinit process_zones(int cpu) +{ + struct zone *zone, *dzone; + int i; + + for_each_zone(zone) { + struct per_cpu_pageset *npageset = NULL; + + npageset = kmalloc_node(sizeof(struct per_cpu_pageset), + GFP_KERNEL, cpu_to_node(cpu)); + if (!npageset) { + zone->pageset[cpu] = NULL; + goto bad; + } + + if (zone->pageset[cpu]) { + memcpy(npageset, zone->pageset[cpu], + sizeof(struct per_cpu_pageset)); + + /* Relocate lists */ + for (i = 0; i < 2; i++) { + INIT_LIST_HEAD(&npageset->pcp[i].list); + list_splice(&zone->pageset[cpu]->pcp[i].list, + &npageset->pcp[i].list); + } + } else { + struct per_cpu_pages *pcp; + unsigned long batch; + + batch = zone_batchsize(zone); + + pcp = &npageset->pcp[0]; /* hot */ + pcp->count = 0; + pcp->low = 2 * batch; + pcp->high = 6 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); + + pcp = &npageset->pcp[1]; /* cold*/ + pcp->count = 0; + pcp->low = 0; + pcp->high = 2 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); + } + zone->pageset[cpu] = npageset; + } + + return 0; +bad: + for_each_zone(dzone) { + if (dzone == zone) + break; + kfree(dzone->pageset[cpu]); + dzone->pageset[cpu] = NULL; + } + return -ENOMEM; +} + +static inline void free_zone_pagesets(int cpu) +{ +#ifdef CONFIG_NUMA + struct zone *zone; + + for_each_zone(zone) { + struct per_cpu_pageset *pset = zone_pcp(zone, cpu); + + zone_pcp(zone, cpu) = NULL; + kfree(pset); + } +#endif +} + +static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int ret = NOTIFY_OK; + + switch (action) { + case CPU_UP_PREPARE: + if (process_zones(cpu)) + ret = NOTIFY_BAD; + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + free_zone_pagesets(cpu); + break; +#endif + default: + break; + } + return ret; +} + +static struct notifier_block pageset_notifier = + { &pageset_cpuup_callback, NULL, 0 }; + +void __init setup_per_cpu_pageset() +{ + int err; + + /* Initialize per_cpu_pageset for cpu 0. + * A cpuup callback will do this for every cpu + * as it comes online + */ + err = process_zones(smp_processor_id()); + BUG_ON(err); + register_cpu_notifier(&pageset_notifier); +} + +#endif + /* * Set up the zone data structures: * - mark all pages reserved @@ -1687,43 +1843,28 @@ static void __init free_area_init_core(struct pglist_data *pgdat, zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - /* - * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. But no more than 1/4 of a meg - there's - * no point in going beyond the size of L2 cache. - * - * OK, so we don't know how big the cache is. So guess. - */ - batch = zone->present_pages / 1024; - if (batch * PAGE_SIZE > 256 * 1024) - batch = (256 * 1024) / PAGE_SIZE; - batch /= 4; /* We effectively *= 4 below */ - if (batch < 1) - batch = 1; - - /* - * Clamp the batch to a 2^n - 1 value. Having a power - * of 2 value was found to be more likely to have - * suboptimal cache aliasing properties in some cases. - * - * For example if 2 tasks are alternately allocating - * batches of pages, one task can end up with a lot - * of pages of one half of the possible page colors - * and the other with pages of the other colors. - */ - batch = (1 << fls(batch + batch/2)) - 1; + batch = zone_batchsize(zone); for (cpu = 0; cpu < NR_CPUS; cpu++) { struct per_cpu_pages *pcp; +#ifdef CONFIG_NUMA + struct per_cpu_pageset *pgset; + pgset = &pageset_table[nid*MAX_NR_ZONES*NR_CPUS + + (j * NR_CPUS) + cpu]; - pcp = &zone->pageset[cpu].pcp[0]; /* hot */ + zone->pageset[cpu] = pgset; +#else + struct per_cpu_pageset *pgset = zone_pcp(zone, cpu); +#endif + + pcp = &pgset->pcp[0]; /* hot */ pcp->count = 0; pcp->low = 2 * batch; pcp->high = 6 * batch; pcp->batch = 1 * batch; INIT_LIST_HEAD(&pcp->list); - pcp = &zone->pageset[cpu].pcp[1]; /* cold */ + pcp = &pgset->pcp[1]; /* cold */ pcp->count = 0; pcp->low = 0; pcp->high = 2 * batch; @@ -1929,7 +2070,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) struct per_cpu_pageset *pageset; int j; - pageset = &zone->pageset[i]; + pageset = zone_pcp(zone, i); for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { if (pageset->pcp[j].count) break;