diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index 6cd1ed311f02..d408afaf6495 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -251,7 +251,7 @@ ENTRY(sys_call_table) .long sys_io_submit .long sys_io_cancel .long sys_fadvise64 /* 250 */ - .long sys_ni_syscall + .long sys_set_zone_reclaim .long sys_exit_group .long sys_lookup_dcookie .long sys_epoll_create diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index d99316c9be28..b1d5d3d5276c 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1579,7 +1579,7 @@ sys_call_table: data8 sys_keyctl data8 sys_ni_syscall data8 sys_ni_syscall // 1275 - data8 sys_ni_syscall + data8 sys_set_zone_reclaim data8 sys_ni_syscall data8 sys_ni_syscall data8 sys_ni_syscall diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index 61bcc1b1e3f4..176413fb9ae3 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -256,7 +256,7 @@ #define __NR_io_submit 248 #define __NR_io_cancel 249 #define __NR_fadvise64 250 - +#define __NR_set_zone_reclaim 251 #define __NR_exit_group 252 #define __NR_lookup_dcookie 253 #define __NR_epoll_create 254 diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h index 33e26c557c5c..f7f43ec2483a 100644 --- a/include/asm-ia64/unistd.h +++ b/include/asm-ia64/unistd.h @@ -263,6 +263,7 @@ #define __NR_add_key 1271 #define __NR_request_key 1272 #define __NR_keyctl 1273 +#define __NR_set_zone_reclaim 1276 #ifdef __KERNEL__ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index beacd931b606..dfc2452ccb10 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -144,6 +144,12 @@ struct zone { unsigned long pages_scanned; /* since last reclaim */ int all_unreclaimable; /* All pages pinned */ + /* + * Does the allocator try to reclaim pages from the zone as soon + * as it fails a watermark_ok() in __alloc_pages? + */ + int reclaim_pages; + /* * prev_priority holds the scanning priority for this zone. It is * defined as the scanning priority at which we achieved our reclaim diff --git a/include/linux/swap.h b/include/linux/swap.h index 3bbc41be9bd0..0d21e682d99d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -173,6 +173,7 @@ extern void swap_setup(void); /* linux/mm/vmscan.c */ extern int try_to_free_pages(struct zone **, unsigned int, unsigned int); +extern int zone_reclaim(struct zone *, unsigned int, unsigned int); extern int shrink_all_memory(int); extern int vm_swappiness; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0dda70ed1f98..6f15bea7d1a8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -77,6 +77,7 @@ cond_syscall(sys_request_key); cond_syscall(sys_keyctl); cond_syscall(compat_sys_keyctl); cond_syscall(compat_sys_socketcall); +cond_syscall(sys_set_zone_reclaim); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 40169f0b7e9e..3c0f69ded6b5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -724,6 +724,14 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, return 1; } +static inline int +should_reclaim_zone(struct zone *z, unsigned int gfp_mask) +{ + if (!z->reclaim_pages) + return 0; + return 1; +} + /* * This is the 'heart' of the zoned buddy allocator. */ @@ -760,17 +768,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order, classzone_idx = zone_idx(zones[0]); - restart: +restart: /* Go through the zonelist once, looking for a zone with enough free */ for (i = 0; (z = zones[i]) != NULL; i++) { - - if (!zone_watermark_ok(z, order, z->pages_low, - classzone_idx, 0, 0)) - continue; + int do_reclaim = should_reclaim_zone(z, gfp_mask); if (!cpuset_zone_allowed(z)) continue; + /* + * If the zone is to attempt early page reclaim then this loop + * will try to reclaim pages and check the watermark a second + * time before giving up and falling back to the next zone. + */ +zone_reclaim_retry: + if (!zone_watermark_ok(z, order, z->pages_low, + classzone_idx, 0, 0)) { + if (!do_reclaim) + continue; + else { + zone_reclaim(z, gfp_mask, order); + /* Only try reclaim once */ + do_reclaim = 0; + goto zone_reclaim_retry; + } + } + page = buffered_rmqueue(z, order, gfp_mask); if (page) goto got_pg; diff --git a/mm/vmscan.c b/mm/vmscan.c index 6379ddbffd9b..7da846960d8a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1323,3 +1323,67 @@ static int __init kswapd_init(void) } module_init(kswapd_init) + + +/* + * Try to free up some pages from this zone through reclaim. + */ +int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order) +{ + struct scan_control sc; + int nr_pages = 1 << order; + int total_reclaimed = 0; + + /* The reclaim may sleep, so don't do it if sleep isn't allowed */ + if (!(gfp_mask & __GFP_WAIT)) + return 0; + if (zone->all_unreclaimable) + return 0; + + sc.gfp_mask = gfp_mask; + sc.may_writepage = 0; + sc.may_swap = 0; + sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_scanned = 0; + sc.nr_reclaimed = 0; + /* scan at the highest priority */ + sc.priority = 0; + + if (nr_pages > SWAP_CLUSTER_MAX) + sc.swap_cluster_max = nr_pages; + else + sc.swap_cluster_max = SWAP_CLUSTER_MAX; + + shrink_zone(zone, &sc); + total_reclaimed = sc.nr_reclaimed; + + return total_reclaimed; +} + +asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone, + unsigned int state) +{ + struct zone *z; + int i; + + if (node >= MAX_NUMNODES || !node_online(node)) + return -EINVAL; + + /* This will break if we ever add more zones */ + if (!(zone & (1<node_zones[i]; + + if (state) + z->reclaim_pages = 1; + else + z->reclaim_pages = 0; + } + + return 0; +}