Merge branch 'akpm' (patches from Andrew)

Merge more updates from Andrew Morton:
 "The rest of MM"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (101 commits)
  mm, compaction: simplify contended compaction handling
  mm, compaction: introduce direct compaction priority
  mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations
  mm, page_alloc: make THP-specific decisions more generic
  mm, page_alloc: restructure direct compaction handling in slowpath
  mm, page_alloc: don't retry initial attempt in slowpath
  mm, page_alloc: set alloc_flags only once in slowpath
  lib/stackdepot.c: use __GFP_NOWARN for stack allocations
  mm, kasan: switch SLUB to stackdepot, enable memory quarantine for SLUB
  mm, kasan: account for object redzone in SLUB's nearest_obj()
  mm: fix use-after-free if memory allocation failed in vma_adjust()
  zsmalloc: Delete an unnecessary check before the function call "iput"
  mm/memblock.c: fix index adjustment error in __next_mem_range_rev()
  mem-hotplug: alloc new page from a nearest neighbor node when mem-offline
  mm: optimize copy_page_to/from_iter_iovec
  mm: add cond_resched() to generic_swapfile_activate()
  Revert "mm, mempool: only set __GFP_NOMEMALLOC if there are free elements"
  mm, compaction: don't isolate PageWriteback pages in MIGRATE_SYNC_LIGHT mode
  mm: hwpoison: remove incorrect comments
  make __section_nr() more efficient
  ...
Linus Torvalds 2016-07-28 16:36:48 -07:00
commit 1c88e19b0f
90 changed files with 2525 additions and 1986 deletions
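The bulk of this series moves the LRU lists and most reclaim-related vmstat counters from struct zone to the node (struct pglist_data). The snippet below is an illustrative sketch only, not part of the merge: it shows how a caller is expected to read the reworked counters and take the relocated LRU lock, using only identifiers that appear in the hunks that follow (global_node_page_state(), node_page_state(), zone_lru_lock()). It assumes kernel context, and the helper names show_node_dirty_stats() and lock_page_lru() are hypothetical.

#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/vmstat.h>
#include <linux/printk.h>
#include <linux/spinlock.h>

/* Hypothetical helper: node-wide counters are now read via the pgdat. */
static void show_node_dirty_stats(int nid)
{
        struct pglist_data *pgdat = NODE_DATA(nid);

        pr_info("node %d: dirty %lu, writeback %lu, global dirty %lu\n",
                nid,
                node_page_state(pgdat, NR_FILE_DIRTY),
                node_page_state(pgdat, NR_WRITEBACK),
                global_node_page_state(NR_FILE_DIRTY));
}

/* Hypothetical helper: the LRU lock now lives in the pgdat and is reached
 * through the zone_lru_lock() accessor added in the mmzone.h hunk below. */
static void lock_page_lru(struct page *page)
{
        spin_lock_irq(zone_lru_lock(page_zone(page)));
        /* ... manipulate the page on its node LRU ... */
        spin_unlock_irq(zone_lru_lock(page_zone(page)));
}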

View file

@@ -107,9 +107,9 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 8. LRU
 Each memcg has its own private LRU. Now, its handling is under global
-VM's control (means that it's handled under global zone->lru_lock).
+VM's control (means that it's handled under global zone_lru_lock).
 Almost all routines around memcg's LRU is called by global LRU's
-list management functions under zone->lru_lock().
+list management functions under zone_lru_lock().
 A special function is mem_cgroup_isolate_pages(). This scans
 memcg's private LRU and call __isolate_lru_page() to extract a page

View file

@@ -267,11 +267,11 @@ When oom event notifier is registered, event will be delivered.
 Other lock order is following:
 PG_locked.
 mm->page_table_lock
-zone->lru_lock
+zone_lru_lock
 lock_page_cgroup.
 In many cases, just lock_page_cgroup() is called.
 per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
-zone->lru_lock, it has no lock of its own.
+zone_lru_lock, it has no lock of its own.
 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)

View file

@@ -224,7 +224,7 @@ void __init arm64_memblock_init(void)
 * via the linear mapping.
 */
 if (memory_limit != (phys_addr_t)ULLONG_MAX) {
-memblock_enforce_memory_limit(memory_limit);
+memblock_mem_limit_remove_map(memory_limit);
 memblock_add(__pa(_text), (u64)(_end - _text));
 }

View file

@@ -102,7 +102,7 @@ static void appldata_get_mem_data(void *data)
 mem_data->totalhigh = P2K(val.totalhigh);
 mem_data->freehigh = P2K(val.freehigh);
 mem_data->bufferram = P2K(val.bufferram);
-mem_data->cached = P2K(global_page_state(NR_FILE_PAGES)
+mem_data->cached = P2K(global_node_page_state(NR_FILE_PAGES)
 - val.bufferram);
 si_swapinfo(&val);

View file

@@ -45,20 +45,20 @@ void show_mem(unsigned int filter)
 struct zone *zone;
 pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu pagecache:%lu swap:%lu\n",
-(global_page_state(NR_ACTIVE_ANON) +
-global_page_state(NR_ACTIVE_FILE)),
-(global_page_state(NR_INACTIVE_ANON) +
-global_page_state(NR_INACTIVE_FILE)),
-global_page_state(NR_FILE_DIRTY),
-global_page_state(NR_WRITEBACK),
-global_page_state(NR_UNSTABLE_NFS),
+(global_node_page_state(NR_ACTIVE_ANON) +
+global_node_page_state(NR_ACTIVE_FILE)),
+(global_node_page_state(NR_INACTIVE_ANON) +
+global_node_page_state(NR_INACTIVE_FILE)),
+global_node_page_state(NR_FILE_DIRTY),
+global_node_page_state(NR_WRITEBACK),
+global_node_page_state(NR_UNSTABLE_NFS),
 global_page_state(NR_FREE_PAGES),
 (global_page_state(NR_SLAB_RECLAIMABLE) +
 global_page_state(NR_SLAB_UNRECLAIMABLE)),
-global_page_state(NR_FILE_MAPPED),
+global_node_page_state(NR_FILE_MAPPED),
 global_page_state(NR_PAGETABLE),
 global_page_state(NR_BOUNCE),
-global_page_state(NR_FILE_PAGES),
+global_node_page_state(NR_FILE_PAGES),
 get_nr_swap_pages());
 for_each_zone(zone) {

View file

@@ -56,6 +56,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 {
 int n;
 int nid = dev->id;
+struct pglist_data *pgdat = NODE_DATA(nid);
 struct sysinfo i;
 si_meminfo_node(&i, nid);
@@ -74,16 +75,16 @@ static ssize_t node_read_meminfo(struct device *dev,
 nid, K(i.totalram),
 nid, K(i.freeram),
 nid, K(i.totalram - i.freeram),
-nid, K(node_page_state(nid, NR_ACTIVE_ANON) +
-node_page_state(nid, NR_ACTIVE_FILE)),
-nid, K(node_page_state(nid, NR_INACTIVE_ANON) +
-node_page_state(nid, NR_INACTIVE_FILE)),
-nid, K(node_page_state(nid, NR_ACTIVE_ANON)),
-nid, K(node_page_state(nid, NR_INACTIVE_ANON)),
-nid, K(node_page_state(nid, NR_ACTIVE_FILE)),
-nid, K(node_page_state(nid, NR_INACTIVE_FILE)),
-nid, K(node_page_state(nid, NR_UNEVICTABLE)),
-nid, K(node_page_state(nid, NR_MLOCK)));
+nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) +
+node_page_state(pgdat, NR_ACTIVE_FILE)),
+nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) +
+node_page_state(pgdat, NR_INACTIVE_FILE)),
+nid, K(node_page_state(pgdat, NR_ACTIVE_ANON)),
+nid, K(node_page_state(pgdat, NR_INACTIVE_ANON)),
+nid, K(node_page_state(pgdat, NR_ACTIVE_FILE)),
+nid, K(node_page_state(pgdat, NR_INACTIVE_FILE)),
+nid, K(node_page_state(pgdat, NR_UNEVICTABLE)),
+nid, K(sum_zone_node_page_state(nid, NR_MLOCK)));
 #ifdef CONFIG_HIGHMEM
 n += sprintf(buf + n,
@@ -117,31 +118,30 @@ static ssize_t node_read_meminfo(struct device *dev,
 "Node %d ShmemPmdMapped: %8lu kB\n"
 #endif
 ,
-nid, K(node_page_state(nid, NR_FILE_DIRTY)),
-nid, K(node_page_state(nid, NR_WRITEBACK)),
-nid, K(node_page_state(nid, NR_FILE_PAGES)),
-nid, K(node_page_state(nid, NR_FILE_MAPPED)),
-nid, K(node_page_state(nid, NR_ANON_PAGES)),
+nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
+nid, K(node_page_state(pgdat, NR_WRITEBACK)),
+nid, K(node_page_state(pgdat, NR_FILE_PAGES)),
+nid, K(node_page_state(pgdat, NR_FILE_MAPPED)),
+nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
 nid, K(i.sharedram),
-nid, node_page_state(nid, NR_KERNEL_STACK) *
-THREAD_SIZE / 1024,
-nid, K(node_page_state(nid, NR_PAGETABLE)),
-nid, K(node_page_state(nid, NR_UNSTABLE_NFS)),
-nid, K(node_page_state(nid, NR_BOUNCE)),
-nid, K(node_page_state(nid, NR_WRITEBACK_TEMP)),
-nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) +
-node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
-nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)),
+nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB),
+nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)),
+nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
+nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
+nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE) +
+sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
+nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE)),
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
-nid, K(node_page_state(nid, NR_ANON_THPS) *
+nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
+nid, K(node_page_state(pgdat, NR_ANON_THPS) *
 HPAGE_PMD_NR),
-nid, K(node_page_state(nid, NR_SHMEM_THPS) *
+nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
 HPAGE_PMD_NR),
-nid, K(node_page_state(nid, NR_SHMEM_PMDMAPPED) *
+nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
 HPAGE_PMD_NR));
 #else
-nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)));
+nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)));
 #endif
 n += hugetlb_report_node_meminfo(nid, buf + n);
 return n;
@@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev,
 "interleave_hit %lu\n"
 "local_node %lu\n"
 "other_node %lu\n",
-node_page_state(dev->id, NUMA_HIT),
-node_page_state(dev->id, NUMA_MISS),
-node_page_state(dev->id, NUMA_FOREIGN),
-node_page_state(dev->id, NUMA_INTERLEAVE_HIT),
-node_page_state(dev->id, NUMA_LOCAL),
-node_page_state(dev->id, NUMA_OTHER));
+sum_zone_node_page_state(dev->id, NUMA_HIT),
+sum_zone_node_page_state(dev->id, NUMA_MISS),
+sum_zone_node_page_state(dev->id, NUMA_FOREIGN),
+sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT),
+sum_zone_node_page_state(dev->id, NUMA_LOCAL),
+sum_zone_node_page_state(dev->id, NUMA_OTHER));
 }
 static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
@@ -173,12 +173,18 @@ static ssize_t node_read_vmstat(struct device *dev,
 struct device_attribute *attr, char *buf)
 {
 int nid = dev->id;
+struct pglist_data *pgdat = NODE_DATA(nid);
 int i;
 int n = 0;
 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
 n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
-node_page_state(nid, i));
+sum_zone_node_page_state(nid, i));
+for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+n += sprintf(buf+n, "%s %lu\n",
+vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+node_page_state(pgdat, i));
 return n;
 }

View file

@@ -72,10 +72,10 @@ static unsigned long lowmem_deathpending_timeout;
 static unsigned long lowmem_count(struct shrinker *s,
 struct shrink_control *sc)
 {
-return global_page_state(NR_ACTIVE_ANON) +
-global_page_state(NR_ACTIVE_FILE) +
-global_page_state(NR_INACTIVE_ANON) +
-global_page_state(NR_INACTIVE_FILE);
+return global_node_page_state(NR_ACTIVE_ANON) +
+global_node_page_state(NR_ACTIVE_FILE) +
+global_node_page_state(NR_INACTIVE_ANON) +
+global_node_page_state(NR_INACTIVE_FILE);
 }
 static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
@@ -91,8 +91,8 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
 short selected_oom_score_adj;
 int array_size = ARRAY_SIZE(lowmem_adj);
 int other_free = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
-int other_file = global_page_state(NR_FILE_PAGES) -
-global_page_state(NR_SHMEM) -
+int other_file = global_node_page_state(NR_FILE_PAGES) -
+global_node_page_state(NR_SHMEM) -
 total_swapcache_pages();
 if (lowmem_adj_size < array_size)

View file

@@ -1864,7 +1864,8 @@ void osc_dec_unstable_pages(struct ptlrpc_request *req)
 LASSERT(page_count >= 0);
 for (i = 0; i < page_count; i++)
-dec_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS);
+dec_node_page_state(desc->bd_iov[i].kiov_page,
+NR_UNSTABLE_NFS);
 atomic_sub(page_count, &cli->cl_cache->ccc_unstable_nr);
 LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0);
@@ -1898,7 +1899,8 @@ void osc_inc_unstable_pages(struct ptlrpc_request *req)
 LASSERT(page_count >= 0);
 for (i = 0; i < page_count; i++)
-inc_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS);
+inc_node_page_state(desc->bd_iov[i].kiov_page,
+NR_UNSTABLE_NFS);
 LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0);
 atomic_add(page_count, &cli->cl_cache->ccc_unstable_nr);

View file

@@ -1807,8 +1807,8 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
 */
 static unsigned long get_nr_dirty_pages(void)
 {
-return global_page_state(NR_FILE_DIRTY) +
-global_page_state(NR_UNSTABLE_NFS) +
+return global_node_page_state(NR_FILE_DIRTY) +
+global_node_page_state(NR_UNSTABLE_NFS) +
 get_nr_dirty_inodes();
 }

View file

@@ -1452,7 +1452,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 list_del(&req->writepages_entry);
 for (i = 0; i < req->num_pages; i++) {
 dec_wb_stat(&bdi->wb, WB_WRITEBACK);
-dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
+dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP);
 wb_writeout_inc(&bdi->wb);
 }
 wake_up(&fi->page_waitq);
@@ -1642,7 +1642,7 @@ static int fuse_writepage_locked(struct page *page)
 req->inode = inode;
 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
-inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
 spin_lock(&fc->lock);
 list_add(&req->writepages_entry, &fi->writepages);
@@ -1756,7 +1756,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
 spin_unlock(&fc->lock);
 dec_wb_stat(&bdi->wb, WB_WRITEBACK);
-dec_zone_page_state(page, NR_WRITEBACK_TEMP);
+dec_node_page_state(page, NR_WRITEBACK_TEMP);
 wb_writeout_inc(&bdi->wb);
 fuse_writepage_free(fc, new_req);
 fuse_request_free(new_req);
@@ -1855,7 +1855,7 @@ static int fuse_writepages_fill(struct page *page,
 req->page_descs[req->num_pages].length = PAGE_SIZE;
 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
-inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
 err = 0;
 if (is_writeback && fuse_writepage_in_flight(req, page)) {

View file

@@ -623,7 +623,7 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
 if (!cinfo->dreq) {
 struct inode *inode = page_file_mapping(page)->host;
-inc_zone_page_state(page, NR_UNSTABLE_NFS);
+inc_node_page_state(page, NR_UNSTABLE_NFS);
 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 }

View file

@@ -898,7 +898,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
 static void
 nfs_clear_page_commit(struct page *page)
 {
-dec_zone_page_state(page, NR_UNSTABLE_NFS);
+dec_node_page_state(page, NR_UNSTABLE_NFS);
 dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
 WB_RECLAIMABLE);
 }

View file

@@ -1024,23 +1024,107 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
 char buffer[PROC_NUMBUF];
 int oom_adj = OOM_ADJUST_MIN;
 size_t len;
-unsigned long flags;
 if (!task)
 return -ESRCH;
-if (lock_task_sighand(task, &flags)) {
-if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
-oom_adj = OOM_ADJUST_MAX;
-else
-oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
-OOM_SCORE_ADJ_MAX;
-unlock_task_sighand(task, &flags);
-}
+if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
+oom_adj = OOM_ADJUST_MAX;
+else
+oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
+OOM_SCORE_ADJ_MAX;
 put_task_struct(task);
 len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
 return simple_read_from_buffer(buf, count, ppos, buffer, len);
 }
+static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
+{
+static DEFINE_MUTEX(oom_adj_mutex);
+struct mm_struct *mm = NULL;
+struct task_struct *task;
+int err = 0;
+task = get_proc_task(file_inode(file));
+if (!task)
+return -ESRCH;
+mutex_lock(&oom_adj_mutex);
+if (legacy) {
+if (oom_adj < task->signal->oom_score_adj &&
+!capable(CAP_SYS_RESOURCE)) {
+err = -EACCES;
+goto err_unlock;
+}
+/*
+* /proc/pid/oom_adj is provided for legacy purposes, ask users to use
+* /proc/pid/oom_score_adj instead.
+*/
+pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+current->comm, task_pid_nr(current), task_pid_nr(task),
+task_pid_nr(task));
+} else {
+if ((short)oom_adj < task->signal->oom_score_adj_min &&
+!capable(CAP_SYS_RESOURCE)) {
+err = -EACCES;
+goto err_unlock;
+}
+}
+/*
+* Make sure we will check other processes sharing the mm if this is
+* not vfrok which wants its own oom_score_adj.
+* pin the mm so it doesn't go away and get reused after task_unlock
+*/
+if (!task->vfork_done) {
+struct task_struct *p = find_lock_task_mm(task);
+if (p) {
+if (atomic_read(&p->mm->mm_users) > 1) {
+mm = p->mm;
+atomic_inc(&mm->mm_count);
+}
+task_unlock(p);
+}
+}
+task->signal->oom_score_adj = oom_adj;
+if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
+task->signal->oom_score_adj_min = (short)oom_adj;
+trace_oom_score_adj_update(task);
+if (mm) {
+struct task_struct *p;
+rcu_read_lock();
+for_each_process(p) {
+if (same_thread_group(task, p))
+continue;
+/* do not touch kernel threads or the global init */
+if (p->flags & PF_KTHREAD || is_global_init(p))
+continue;
+task_lock(p);
+if (!p->vfork_done && process_shares_mm(p, mm)) {
+pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
+task_pid_nr(p), p->comm,
+p->signal->oom_score_adj, oom_adj,
+task_pid_nr(task), task->comm);
+p->signal->oom_score_adj = oom_adj;
+if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
+p->signal->oom_score_adj_min = (short)oom_adj;
+}
+task_unlock(p);
+}
+rcu_read_unlock();
+mmdrop(mm);
+}
+err_unlock:
+mutex_unlock(&oom_adj_mutex);
+put_task_struct(task);
+return err;
+}
 /*
 * /proc/pid/oom_adj exists solely for backwards compatibility with previous
 * kernels. The effective policy is defined by oom_score_adj, which has a
@@ -1054,10 +1138,8 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
 static ssize_t oom_adj_write(struct file *file, const char __user *buf,
 size_t count, loff_t *ppos)
 {
-struct task_struct *task;
 char buffer[PROC_NUMBUF];
 int oom_adj;
-unsigned long flags;
 int err;
 memset(buffer, 0, sizeof(buffer));
@@ -1077,23 +1159,6 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
 goto out;
 }
-task = get_proc_task(file_inode(file));
-if (!task) {
-err = -ESRCH;
-goto out;
-}
-task_lock(task);
-if (!task->mm) {
-err = -EINVAL;
-goto err_task_lock;
-}
-if (!lock_task_sighand(task, &flags)) {
-err = -ESRCH;
-goto err_task_lock;
-}
 /*
 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
 * value is always attainable.
@@ -1103,27 +1168,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
 else
 oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
-if (oom_adj < task->signal->oom_score_adj &&
-!capable(CAP_SYS_RESOURCE)) {
-err = -EACCES;
-goto err_sighand;
-}
-/*
-* /proc/pid/oom_adj is provided for legacy purposes, ask users to use
-* /proc/pid/oom_score_adj instead.
-*/
-pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
-current->comm, task_pid_nr(current), task_pid_nr(task),
-task_pid_nr(task));
-task->signal->oom_score_adj = oom_adj;
-trace_oom_score_adj_update(task);
-err_sighand:
-unlock_task_sighand(task, &flags);
-err_task_lock:
-task_unlock(task);
-put_task_struct(task);
+err = __set_oom_adj(file, oom_adj, true);
 out:
 return err < 0 ? err : count;
 }
@@ -1140,15 +1185,11 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 struct task_struct *task = get_proc_task(file_inode(file));
 char buffer[PROC_NUMBUF];
 short oom_score_adj = OOM_SCORE_ADJ_MIN;
-unsigned long flags;
 size_t len;
 if (!task)
 return -ESRCH;
-if (lock_task_sighand(task, &flags)) {
-oom_score_adj = task->signal->oom_score_adj;
-unlock_task_sighand(task, &flags);
-}
+oom_score_adj = task->signal->oom_score_adj;
 put_task_struct(task);
 len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
 return simple_read_from_buffer(buf, count, ppos, buffer, len);
@@ -1157,9 +1198,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 size_t count, loff_t *ppos)
 {
-struct task_struct *task;
 char buffer[PROC_NUMBUF];
-unsigned long flags;
 int oom_score_adj;
 int err;
@@ -1180,39 +1219,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 goto out;
 }
-task = get_proc_task(file_inode(file));
-if (!task) {
-err = -ESRCH;
-goto out;
-}
-task_lock(task);
-if (!task->mm) {
-err = -EINVAL;
-goto err_task_lock;
-}
-if (!lock_task_sighand(task, &flags)) {
-err = -ESRCH;
-goto err_task_lock;
-}
-if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
-!capable(CAP_SYS_RESOURCE)) {
-err = -EACCES;
-goto err_sighand;
-}
-task->signal->oom_score_adj = (short)oom_score_adj;
-if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
-task->signal->oom_score_adj_min = (short)oom_score_adj;
-trace_oom_score_adj_update(task);
-err_sighand:
-unlock_task_sighand(task, &flags);
-err_task_lock:
-task_unlock(task);
-put_task_struct(task);
+err = __set_oom_adj(file, oom_score_adj, false);
 out:
 return err < 0 ? err : count;
 }

View file

@@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 si_swapinfo(&i);
 committed = percpu_counter_read_positive(&vm_committed_as);
-cached = global_page_state(NR_FILE_PAGES) -
+cached = global_node_page_state(NR_FILE_PAGES) -
 total_swapcache_pages() - i.bufferram;
 if (cached < 0)
 cached = 0;
@@ -138,23 +138,23 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #endif
 K(i.totalswap),
 K(i.freeswap),
-K(global_page_state(NR_FILE_DIRTY)),
-K(global_page_state(NR_WRITEBACK)),
-K(global_page_state(NR_ANON_PAGES)),
-K(global_page_state(NR_FILE_MAPPED)),
+K(global_node_page_state(NR_FILE_DIRTY)),
+K(global_node_page_state(NR_WRITEBACK)),
+K(global_node_page_state(NR_ANON_MAPPED)),
+K(global_node_page_state(NR_FILE_MAPPED)),
 K(i.sharedram),
 K(global_page_state(NR_SLAB_RECLAIMABLE) +
 global_page_state(NR_SLAB_UNRECLAIMABLE)),
 K(global_page_state(NR_SLAB_RECLAIMABLE)),
 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
-global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
+global_page_state(NR_KERNEL_STACK_KB),
 K(global_page_state(NR_PAGETABLE)),
 #ifdef CONFIG_QUICKLIST
 K(quicklist_total_size()),
 #endif
-K(global_page_state(NR_UNSTABLE_NFS)),
+K(global_node_page_state(NR_UNSTABLE_NFS)),
 K(global_page_state(NR_BOUNCE)),
-K(global_page_state(NR_WRITEBACK_TEMP)),
+K(global_node_page_state(NR_WRITEBACK_TEMP)),
 K(vm_commit_limit()),
 K(committed),
 (unsigned long)VMALLOC_TOTAL >> 10,
@@ -164,9 +164,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-, K(global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR)
-, K(global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR)
-, K(global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)
+, K(global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR)
+, K(global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR)
+, K(global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)
 #endif
 #ifdef CONFIG_CMA
 , K(totalcma_pages)

View file

@@ -197,7 +197,7 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
 }
 long congestion_wait(int sync, long timeout);
-long wait_iff_congested(struct zone *zone, int sync, long timeout);
+long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout);
 int pdflush_proc_obsolete(struct ctl_table *table, int write,
 void __user *buffer, size_t *lenp, loff_t *ppos);

View file

@@ -1,6 +1,18 @@
 #ifndef _LINUX_COMPACTION_H
 #define _LINUX_COMPACTION_H
+/*
+* Determines how hard direct compaction should try to succeed.
+* Lower value means higher priority, analogically to reclaim priority.
+*/
+enum compact_priority {
+COMPACT_PRIO_SYNC_LIGHT,
+MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
+DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
+COMPACT_PRIO_ASYNC,
+INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC
+};
 /* Return values for compact_zone() and try_to_compact_pages() */
 /* When adding new states, please adjust include/trace/events/compaction.h */
 enum compact_result {
@@ -43,14 +55,6 @@ enum compact_result {
 COMPACT_PARTIAL,
 };
-/* Used to signal whether compaction detected need_sched() or lock contention */
-/* No contention detected */
-#define COMPACT_CONTENDED_NONE 0
-/* Either need_sched() was true or fatal signal pending */
-#define COMPACT_CONTENDED_SCHED 1
-/* Zone lock or lru_lock was contended in async compaction */
-#define COMPACT_CONTENDED_LOCK 2
 struct alloc_context; /* in mm/internal.h */
 #ifdef CONFIG_COMPACTION
@@ -64,9 +68,8 @@ extern int sysctl_compact_unevictable_allowed;
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
-unsigned int order,
-unsigned int alloc_flags, const struct alloc_context *ac,
-enum migrate_mode mode, int *contended);
+unsigned int order, unsigned int alloc_flags,
+const struct alloc_context *ac, enum compact_priority prio);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern enum compact_result compaction_suitable(struct zone *zone, int order,
@@ -151,14 +154,6 @@ extern void kcompactd_stop(int nid);
 extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
 #else
-static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask,
-unsigned int order, int alloc_flags,
-const struct alloc_context *ac,
-enum migrate_mode mode, int *contended)
-{
-return COMPACT_CONTINUE;
-}
 static inline void compact_pgdat(pg_data_t *pgdat, int order)
 {
 }

View file

@@ -237,9 +237,11 @@ struct vm_area_struct;
 * are expected to be movable via page reclaim or page migration. Typically,
 * pages on the LRU would also be allocated with GFP_HIGHUSER_MOVABLE.
 *
-* GFP_TRANSHUGE is used for THP allocations. They are compound allocations
-* that will fail quickly if memory is not available and will not wake
-* kswapd on failure.
+* GFP_TRANSHUGE and GFP_TRANSHUGE_LIGHT are used for THP allocations. They are
+* compound allocations that will generally fail quickly if memory is not
+* available and will not wake kswapd/kcompactd on failure. The _LIGHT
+* version does not attempt reclaim/compaction at all and is by default used
+* in page fault path, while the non-light is used by khugepaged.
 */
 #define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
 #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
@@ -254,9 +256,9 @@ struct vm_area_struct;
 #define GFP_DMA32 __GFP_DMA32
 #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM)
 #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE)
-#define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
-__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \
-~__GFP_RECLAIM)
+#define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
+__GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
+#define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
 /* Convert GFP flags to their corresponding migrate type */
 #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)

View file

@@ -11,7 +11,7 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 unsigned long addr,
 pmd_t *pmd,
 unsigned int flags);
-extern int madvise_free_huge_pmd(struct mmu_gather *tlb,
+extern bool madvise_free_huge_pmd(struct mmu_gather *tlb,
 struct vm_area_struct *vma,
 pmd_t *pmd, unsigned long addr, unsigned long next);
 extern int zap_huge_pmd(struct mmu_gather *tlb,

View file

@@ -77,6 +77,7 @@ void kasan_free_shadow(const struct vm_struct *vm);
 size_t ksize(const void *);
 static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); }
+size_t kasan_metadata_size(struct kmem_cache *cache);
 #else /* CONFIG_KASAN */
@@ -121,6 +122,7 @@ static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
 static inline void kasan_free_shadow(const struct vm_struct *vm) {}
 static inline void kasan_unpoison_slab(const void *ptr) { }
+static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
 #endif /* CONFIG_KASAN */

View file

@@ -177,7 +177,7 @@ extern int kdb_get_kbd_char(void);
 static inline
 int kdb_process_cpu(const struct task_struct *p)
 {
-unsigned int cpu = task_thread_info(p)->cpu;
+unsigned int cpu = task_cpu(p);
 if (cpu > num_possible_cpus())
 cpu = 0;
 return cpu;

View file

@@ -332,6 +332,7 @@ phys_addr_t memblock_mem_size(unsigned long limit_pfn);
 phys_addr_t memblock_start_of_DRAM(void);
 phys_addr_t memblock_end_of_DRAM(void);
 void memblock_enforce_memory_limit(phys_addr_t memory_limit);
+void memblock_mem_limit_remove_map(phys_addr_t limit);
 bool memblock_is_memory(phys_addr_t addr);
 int memblock_is_map_memory(phys_addr_t addr);
 int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);

View file

@@ -52,7 +52,7 @@ enum mem_cgroup_stat_index {
 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
 MEM_CGROUP_STAT_NSTATS,
 /* default hierarchy stats */
-MEMCG_KERNEL_STACK = MEM_CGROUP_STAT_NSTATS,
+MEMCG_KERNEL_STACK_KB = MEM_CGROUP_STAT_NSTATS,
 MEMCG_SLAB_RECLAIMABLE,
 MEMCG_SLAB_UNRECLAIMABLE,
 MEMCG_SOCK,
@@ -60,7 +60,7 @@ enum mem_cgroup_stat_index {
 };
 struct mem_cgroup_reclaim_cookie {
-struct zone *zone;
+pg_data_t *pgdat;
 int priority;
 unsigned int generation;
 };
@@ -118,7 +118,7 @@ struct mem_cgroup_reclaim_iter {
 /*
 * per-zone information in memory controller.
 */
-struct mem_cgroup_per_zone {
+struct mem_cgroup_per_node {
 struct lruvec lruvec;
 unsigned long lru_size[NR_LRU_LISTS];
@@ -132,10 +132,6 @@ struct mem_cgroup_per_zone {
 /* use container_of */
 };
-struct mem_cgroup_per_node {
-struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
-};
 struct mem_cgroup_threshold {
 struct eventfd_ctx *eventfd;
 unsigned long threshold;
@@ -314,8 +310,46 @@ void mem_cgroup_uncharge_list(struct list_head *page_list);
 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
-struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
-struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
+static struct mem_cgroup_per_node *
+mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
+{
+return memcg->nodeinfo[nid];
+}
+/**
+* mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone
+* @node: node of the wanted lruvec
+* @memcg: memcg of the wanted lruvec
+*
+* Returns the lru list vector holding pages for a given @node or a given
+* @memcg and @zone. This can be the node lruvec, if the memory controller
+* is disabled.
+*/
+static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
+struct mem_cgroup *memcg)
+{
+struct mem_cgroup_per_node *mz;
+struct lruvec *lruvec;
+if (mem_cgroup_disabled()) {
+lruvec = node_lruvec(pgdat);
+goto out;
+}
+mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+lruvec = &mz->lruvec;
+out:
+/*
+* Since a node can be onlined after the mem_cgroup was created,
+* we have to be prepared to initialize lruvec->pgdat here;
+* and if offlined then reonlined, we need to reinitialize it.
+*/
+if (unlikely(lruvec->pgdat != pgdat))
+lruvec->pgdat = pgdat;
+return lruvec;
+}
+struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
@@ -404,9 +438,9 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 static inline
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
-struct mem_cgroup_per_zone *mz;
-mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+struct mem_cgroup_per_node *mz;
+mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 return mz->lru_size[lru];
 }
@@ -477,7 +511,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 mem_cgroup_update_page_stat(page, idx, -1);
 }
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 gfp_t gfp_mask,
 unsigned long *total_scanned);
@@ -568,16 +602,16 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new)
 {
 }
-static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
+static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
 struct mem_cgroup *memcg)
 {
-return &zone->lruvec;
+return node_lruvec(pgdat);
 }
 static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
-struct zone *zone)
+struct pglist_data *pgdat)
 {
-return &zone->lruvec;
+return &pgdat->lruvec;
 }
 static inline bool mm_match_cgroup(struct mm_struct *mm,
@@ -681,7 +715,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 }
 static inline
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 gfp_t gfp_mask,
 unsigned long *total_scanned)
 {

View file

@@ -26,7 +26,7 @@ struct vmem_altmap {
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
 void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
-#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE)
+#ifdef CONFIG_ZONE_DEVICE
 struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start);
 #else
 static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)

View file

@@ -933,6 +933,11 @@ static inline struct zone *page_zone(const struct page *page)
 return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
 }
+static inline pg_data_t *page_pgdat(const struct page *page)
+{
+return NODE_DATA(page_to_nid(page));
+}
 #ifdef SECTION_IN_PAGE_FLAGS
 static inline void set_page_section(struct page *page, unsigned long section)
 {
@@ -973,11 +978,21 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
 {
 return page->mem_cgroup;
 }
+static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+{
+WARN_ON_ONCE(!rcu_read_lock_held());
+return READ_ONCE(page->mem_cgroup);
+}
 #else
 static inline struct mem_cgroup *page_memcg(struct page *page)
 {
 return NULL;
 }
+static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+{
+WARN_ON_ONCE(!rcu_read_lock_held());
+return NULL;
+}
 #endif
 /*
@@ -2284,6 +2299,8 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
 }
 #endif /* __HAVE_ARCH_GATE_AREA */
+extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
 #ifdef CONFIG_SYSCTL
 extern int sysctl_drop_caches;
 int drop_caches_sysctl_handler(struct ctl_table *, int,

View file

@@ -23,25 +23,30 @@ static inline int page_is_file_cache(struct page *page)
 }
 static __always_inline void __update_lru_size(struct lruvec *lruvec,
-enum lru_list lru, int nr_pages)
+enum lru_list lru, enum zone_type zid,
+int nr_pages)
 {
-__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
+struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+__mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages);
+__mod_zone_page_state(&pgdat->node_zones[zid],
+NR_ZONE_LRU_BASE + lru, nr_pages);
 }
 static __always_inline void update_lru_size(struct lruvec *lruvec,
-enum lru_list lru, int nr_pages)
+enum lru_list lru, enum zone_type zid,
+int nr_pages)
 {
+__update_lru_size(lruvec, lru, zid, nr_pages);
 #ifdef CONFIG_MEMCG
 mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
-#else
-__update_lru_size(lruvec, lru, nr_pages);
 #endif
 }
 static __always_inline void add_page_to_lru_list(struct page *page,
 struct lruvec *lruvec, enum lru_list lru)
 {
-update_lru_size(lruvec, lru, hpage_nr_pages(page));
+update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
 list_add(&page->lru, &lruvec->lists[lru]);
 }
@@ -49,7 +54,7 @@ static __always_inline void del_page_from_lru_list(struct page *page,
 struct lruvec *lruvec, enum lru_list lru)
 {
 list_del(&page->lru);
-update_lru_size(lruvec, lru, -hpage_nr_pages(page));
+update_lru_size(lruvec, lru, page_zonenum(page), -hpage_nr_pages(page));
 }
 /**

View file

@@ -118,7 +118,7 @@ struct page {
 */
 union {
 struct list_head lru; /* Pageout list, eg. active_list
-* protected by zone->lru_lock !
+* protected by zone_lru_lock !
 * Can be used as a generic list
 * by the page owner.
 */

View file

@@ -93,7 +93,7 @@ struct free_area {
 struct pglist_data;
 /*
-* zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
+* zone->lock and the zone lru_lock are two of the hottest locks in the kernel.
 * So add a wild amount of padding here to ensure that they fall into separate
 * cachelines. There are very few zone structures in the machine, so space
 * consumption is not a concern here.
@@ -110,36 +110,20 @@ struct zone_padding {
 enum zone_stat_item {
 /* First 128 byte cacheline (assuming 64 bit words) */
 NR_FREE_PAGES,
-NR_ALLOC_BATCH,
-NR_LRU_BASE,
-NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
-NR_ACTIVE_ANON, /* " " " " " */
-NR_INACTIVE_FILE, /* " " " " " */
-NR_ACTIVE_FILE, /* " " " " " */
-NR_UNEVICTABLE, /* " " " " " */
+NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
+NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
+NR_ZONE_ACTIVE_ANON,
+NR_ZONE_INACTIVE_FILE,
+NR_ZONE_ACTIVE_FILE,
+NR_ZONE_UNEVICTABLE,
+NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */
 NR_MLOCK, /* mlock()ed pages found and moved off LRU */
-NR_ANON_PAGES, /* Mapped anonymous pages */
-NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
-only modified from process context */
-NR_FILE_PAGES,
-NR_FILE_DIRTY,
-NR_WRITEBACK,
 NR_SLAB_RECLAIMABLE,
 NR_SLAB_UNRECLAIMABLE,
 NR_PAGETABLE, /* used for pagetables */
-NR_KERNEL_STACK,
+NR_KERNEL_STACK_KB, /* measured in KiB */
 /* Second 128 byte cacheline */
-NR_UNSTABLE_NFS, /* NFS unstable pages */
 NR_BOUNCE,
-NR_VMSCAN_WRITE,
-NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
-NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */
-NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
-NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
-NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */
-NR_DIRTIED, /* page dirtyings since bootup */
-NR_WRITTEN, /* page writings since bootup */
-NR_PAGES_SCANNED, /* pages scanned since last reclaim */
 #if IS_ENABLED(CONFIG_ZSMALLOC)
 NR_ZSPAGES, /* allocated in zsmalloc */
 #endif
@@ -151,14 +135,40 @@ enum zone_stat_item {
 NUMA_LOCAL, /* allocation from local node */
 NUMA_OTHER, /* allocation from other node */
 #endif
+NR_FREE_CMA_PAGES,
+NR_VM_ZONE_STAT_ITEMS };
+enum node_stat_item {
+NR_LRU_BASE,
+NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
+NR_ACTIVE_ANON, /* " " " " " */
+NR_INACTIVE_FILE, /* " " " " " */
+NR_ACTIVE_FILE, /* " " " " " */
+NR_UNEVICTABLE, /* " " " " " */
+NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
+NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
+NR_PAGES_SCANNED, /* pages scanned since last reclaim */
 WORKINGSET_REFAULT,
 WORKINGSET_ACTIVATE,
 WORKINGSET_NODERECLAIM,
-NR_ANON_THPS,
+NR_ANON_MAPPED, /* Mapped anonymous pages */
+NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
+only modified from process context */
+NR_FILE_PAGES,
+NR_FILE_DIRTY,
+NR_WRITEBACK,
+NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */
+NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */
 NR_SHMEM_THPS,
 NR_SHMEM_PMDMAPPED,
-NR_FREE_CMA_PAGES,
-NR_VM_ZONE_STAT_ITEMS };
+NR_ANON_THPS,
+NR_UNSTABLE_NFS, /* NFS unstable pages */
+NR_VMSCAN_WRITE,
+NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
+NR_DIRTIED, /* page dirtyings since bootup */
+NR_WRITTEN, /* page writings since bootup */
+NR_VM_NODE_STAT_ITEMS
+};
@@ -215,7 +225,7 @@ struct lruvec {
 /* Evictions & activations on the inactive file list */
 atomic_long_t inactive_age;
 #ifdef CONFIG_MEMCG
-struct zone *zone;
+struct pglist_data *pgdat;
 #endif
 };
@@ -267,6 +277,11 @@ struct per_cpu_pageset {
 #endif
 };
+struct per_cpu_nodestat {
+s8 stat_threshold;
+s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
+};
 #endif /* !__GENERATING_BOUNDS.H */
 enum zone_type {
@@ -348,22 +363,9 @@ struct zone {
 #ifdef CONFIG_NUMA
 int node;
 #endif
-/*
-* The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
-* this zone's LRU. Maintained by the pageout code.
-*/
-unsigned int inactive_ratio;
 struct pglist_data *zone_pgdat;
 struct per_cpu_pageset __percpu *pageset;
-/*
-* This is a per-zone reserve of pages that are not available
-* to userspace allocations.
-*/
-unsigned long totalreserve_pages;
 #ifndef CONFIG_SPARSEMEM
 /*
 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
@@ -372,14 +374,6 @@ struct zone {
 unsigned long *pageblock_flags;
 #endif /* CONFIG_SPARSEMEM */
-#ifdef CONFIG_NUMA
-/*
-* zone reclaim becomes active if more unmapped pages exist.
-*/
-unsigned long min_unmapped_pages;
-unsigned long min_slab_pages;
-#endif /* CONFIG_NUMA */
 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
 unsigned long zone_start_pfn;
@@ -472,24 +466,21 @@ struct zone {
 unsigned long wait_table_hash_nr_entries;
 unsigned long wait_table_bits;
+/* Write-intensive fields used from the page allocator */
 ZONE_PADDING(_pad1_)
 /* free areas of different sizes */
 struct free_area free_area[MAX_ORDER];
 /* zone flags, see below */
 unsigned long flags;
-/* Write-intensive fields used from the page allocator */
+/* Primarily protects free_area */
 spinlock_t lock;
+/* Write-intensive fields used by compaction and vmstats. */
 ZONE_PADDING(_pad2_)
-/* Write-intensive fields used by page reclaim */
-/* Fields commonly accessed by the page reclaim scanner */
-spinlock_t lru_lock;
-struct lruvec lruvec;
 /*
 * When free pages are below this point, additional steps are taken
 * when reading the number of free pages to avoid per-cpu counter
@@ -527,19 +518,18 @@ struct zone {
 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
 } ____cacheline_internodealigned_in_smp;
-enum zone_flags {
-ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */
-ZONE_CONGESTED, /* zone has many dirty pages backed by
+enum pgdat_flags {
+PGDAT_CONGESTED, /* pgdat has many dirty pages backed by
 * a congested BDI
 */
-ZONE_DIRTY, /* reclaim scanning has recently found
+PGDAT_DIRTY, /* reclaim scanning has recently found
 * many dirty file pages at the tail
 * of the LRU.
 */
-ZONE_WRITEBACK, /* reclaim scanning has recently found
+PGDAT_WRITEBACK, /* reclaim scanning has recently found
 * many pages under writeback
 */
-ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */
+PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */
 };
 static inline unsigned long zone_end_pfn(const struct zone *zone)
@@ -663,8 +653,9 @@ typedef struct pglist_data {
 wait_queue_head_t pfmemalloc_wait;
 struct task_struct *kswapd; /* Protected by
 mem_hotplug_begin/end() */
-int kswapd_max_order;
-enum zone_type classzone_idx;
+int kswapd_order;
+enum zone_type kswapd_classzone_idx;
 #ifdef CONFIG_COMPACTION
 int kcompactd_max_order;
 enum zone_type kcompactd_classzone_idx;
@@ -681,6 +672,23 @@ typedef struct pglist_data {
 /* Number of pages migrated during the rate limiting time interval */
 unsigned long numabalancing_migrate_nr_pages;
 #endif
+/*
+* This is a per-node reserve of pages that are not available
+* to userspace allocations.
+*/
+unsigned long totalreserve_pages;
+#ifdef CONFIG_NUMA
+/*
+* zone reclaim becomes active if more unmapped pages exist.
+*/
+unsigned long min_unmapped_pages;
+unsigned long min_slab_pages;
+#endif /* CONFIG_NUMA */
+/* Write-intensive fields used by page reclaim */
+ZONE_PADDING(_pad1_)
+spinlock_t lru_lock;
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 /*
@@ -695,6 +703,23 @@ typedef struct pglist_data {
 struct list_head split_queue;
 unsigned long split_queue_len;
 #endif
+/* Fields commonly accessed by the page reclaim scanner */
+struct lruvec lruvec;
+/*
+* The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+* this node's LRU. Maintained by the pageout code.
+*/
+unsigned int inactive_ratio;
+unsigned long flags;
+ZONE_PADDING(_pad2_)
+/* Per-node vmstats */
+struct per_cpu_nodestat __percpu *per_cpu_nodestats;
+atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
 } pg_data_t;
 #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
@@ -708,6 +733,15 @@ typedef struct pglist_data {
 #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
+static inline spinlock_t *zone_lru_lock(struct zone *zone)
+{
+return &zone->zone_pgdat->lru_lock;
+}
+static inline struct lruvec *node_lruvec(struct pglist_data *pgdat)
+{
+return &pgdat->lruvec;
+}
 static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
 {
@@ -760,12 +794,12 @@ extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
 extern void lruvec_init(struct lruvec *lruvec);
-static inline struct zone *lruvec_zone(struct lruvec *lruvec)
+static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
 {
 #ifdef CONFIG_MEMCG
-return lruvec->zone;
+return lruvec->pgdat;
 #else
-return container_of(lruvec, struct zone, lruvec);
+return container_of(lruvec, struct pglist_data, lruvec);
 #endif
 }

View file

@@ -73,9 +73,9 @@ static inline bool oom_task_origin(const struct task_struct *p)
 extern void mark_oom_victim(struct task_struct *tsk);
 #ifdef CONFIG_MMU
-extern void try_oom_reaper(struct task_struct *tsk);
+extern void wake_oom_reaper(struct task_struct *tsk);
 #else
-static inline void try_oom_reaper(struct task_struct *tsk)
+static inline void wake_oom_reaper(struct task_struct *tsk)
 {
 }
 #endif
@@ -107,27 +107,7 @@ extern void oom_killer_enable(void);
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
-static inline bool task_will_free_mem(struct task_struct *task)
-{
-struct signal_struct *sig = task->signal;
-/*
-* A coredumping process may sleep for an extended period in exit_mm(),
-* so the oom killer cannot assume that the process will promptly exit
-* and release memory.
-*/
-if (sig->flags & SIGNAL_GROUP_COREDUMP)
-return false;
-if (!(task->flags & PF_EXITING))
-return false;
-/* Make sure that the whole thread group is going down */
-if (!thread_group_empty(task) && !(sig->flags & SIGNAL_GROUP_EXIT))
-return false;
-return true;
-}
+bool task_will_free_mem(struct task_struct *task);
 /* sysctls */
 extern int sysctl_oom_dump_tasks;

View file

@ -523,6 +523,7 @@ static inline int get_dumpable(struct mm_struct *mm)
#define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_HAS_UPROBES 19 /* has uprobes */
#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
#define MMF_OOM_REAPED 21 /* mm has been already reaped */ #define MMF_OOM_REAPED 21 /* mm has been already reaped */
#define MMF_OOM_NOT_REAPABLE 22 /* mm couldn't be reaped */
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
@ -1949,6 +1950,32 @@ static inline int tsk_nr_cpus_allowed(struct task_struct *p)
#define TNF_FAULT_LOCAL 0x08 #define TNF_FAULT_LOCAL 0x08
#define TNF_MIGRATE_FAIL 0x10 #define TNF_MIGRATE_FAIL 0x10
static inline bool in_vfork(struct task_struct *tsk)
{
bool ret;
/*
* need RCU to access ->real_parent if CLONE_VM was used along with
* CLONE_PARENT.
*
* We check real_parent->mm == tsk->mm because CLONE_VFORK does not
* imply CLONE_VM
*
* CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus
* ->real_parent is not necessarily the task doing vfork(), so in
* theory we can't rely on task_lock() if we want to dereference it.
*
* And in this case we can't trust the real_parent->mm == tsk->mm
* check, it can be false negative. But we do not care, if init or
* another oom-unkillable task does this it should blame itself.
*/
rcu_read_lock();
ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm;
rcu_read_unlock();
return ret;
}
#ifdef CONFIG_NUMA_BALANCING #ifdef CONFIG_NUMA_BALANCING
extern void task_numa_fault(int last_node, int node, int pages, int flags); extern void task_numa_fault(int last_node, int node, int pages, int flags);
extern pid_t task_numa_group_id(struct task_struct *p); extern pid_t task_numa_group_id(struct task_struct *p);

View file

@ -88,7 +88,8 @@ struct kmem_cache {
}; };
static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
void *x) { void *x)
{
void *object = x - (x - page->s_mem) % cache->size; void *object = x - (x - page->s_mem) % cache->size;
void *last_object = page->s_mem + (cache->num - 1) * cache->size; void *last_object = page->s_mem + (cache->num - 1) * cache->size;

View file

@ -104,6 +104,10 @@ struct kmem_cache {
unsigned int *random_seq; unsigned int *random_seq;
#endif #endif
#ifdef CONFIG_KASAN
struct kasan_cache kasan_info;
#endif
struct kmem_cache_node *node[MAX_NUMNODES]; struct kmem_cache_node *node[MAX_NUMNODES];
}; };
@ -119,15 +123,17 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
void object_err(struct kmem_cache *s, struct page *page, void object_err(struct kmem_cache *s, struct page *page,
u8 *object, char *reason); u8 *object, char *reason);
void *fixup_red_left(struct kmem_cache *s, void *p);
static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
void *x) { void *x) {
void *object = x - (x - page_address(page)) % cache->size; void *object = x - (x - page_address(page)) % cache->size;
void *last_object = page_address(page) + void *last_object = page_address(page) +
(page->objects - 1) * cache->size; (page->objects - 1) * cache->size;
if (unlikely(object > last_object)) void *result = (unlikely(object > last_object)) ? last_object : object;
return last_object;
else result = fixup_red_left(cache, result);
return object; return result;
} }
#endif /* _LINUX_SLUB_DEF_H */ #endif /* _LINUX_SLUB_DEF_H */
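
Editor's note: the SLUB nearest_obj() change above clamps the rounded pointer instead of branching, then shifts it with the new fixup_red_left() hook, which matters once debugging/KASAN places a red zone to the left of every object. Below is a rough, self-contained model of that arithmetic; the slab base, object size and red-zone width are made-up constants, not the real kmem_cache fields, and the left shift is applied unconditionally only to keep the example short.

    #include <stdio.h>
    #include <stdint.h>

    #define SLAB_BASE   0x1000u   /* hypothetical start of the slab page   */
    #define OBJ_SIZE    64u       /* stride of each object incl. metadata  */
    #define NR_OBJS     8u
    #define RED_LEFT    16u       /* bytes of red zone before each payload */

    /* Round an address inside the slab down to its object slot, clamp to
     * the last slot, then skip the left red zone (fixup_red_left()).      */
    static uintptr_t nearest_obj(uintptr_t x)
    {
        uintptr_t object = x - (x - SLAB_BASE) % OBJ_SIZE;
        uintptr_t last   = SLAB_BASE + (NR_OBJS - 1) * OBJ_SIZE;

        if (object > last)            /* clamp pointers past the last slot */
            object = last;
        return object + RED_LEFT;     /* report the usable payload start   */
    }

    int main(void)
    {
        /* An access 5 bytes into the third object's payload. */
        uintptr_t addr = SLAB_BASE + 2 * OBJ_SIZE + RED_LEFT + 5;
        printf("object at %#lx\n", (unsigned long)nearest_obj(addr));
        return 0;
    }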

View file

@ -157,15 +157,6 @@ enum {
#define SWAP_CLUSTER_MAX 32UL #define SWAP_CLUSTER_MAX 32UL
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
/*
* Ratio between zone->managed_pages and the "gap" that above the per-zone
* "high_wmark". While balancing nodes, We allow kswapd to shrink zones that
* do not meet the (high_wmark + gap) watermark, even which already met the
* high_wmark, in order to provide better per-zone lru behavior. We are ok to
* spend not more than 1% of the memory for this zone balancing "gap".
*/
#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100
#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */ #define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */
#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */ #define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */
#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ #define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */
@ -317,6 +308,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
/* linux/mm/vmscan.c */ /* linux/mm/vmscan.c */
extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask); gfp_t gfp_mask, nodemask_t *mask);
extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
@ -324,9 +316,9 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages, unsigned long nr_pages,
gfp_t gfp_mask, gfp_t gfp_mask,
bool may_swap); bool may_swap);
extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap, gfp_t gfp_mask, bool noswap,
struct zone *zone, pg_data_t *pgdat,
unsigned long *nr_scanned); unsigned long *nr_scanned);
extern unsigned long shrink_all_memory(unsigned long nr_pages); extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness; extern int vm_swappiness;
@ -334,13 +326,14 @@ extern int remove_mapping(struct address_space *mapping, struct page *page);
extern unsigned long vm_total_pages; extern unsigned long vm_total_pages;
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
extern int zone_reclaim_mode; extern int node_reclaim_mode;
extern int sysctl_min_unmapped_ratio; extern int sysctl_min_unmapped_ratio;
extern int sysctl_min_slab_ratio; extern int sysctl_min_slab_ratio;
extern int zone_reclaim(struct zone *, gfp_t, unsigned int); extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
#else #else
#define zone_reclaim_mode 0 #define node_reclaim_mode 0
static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
unsigned int order)
{ {
return 0; return 0;
} }
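
Editor's note: zone_reclaim()/zone_reclaim_mode become node_reclaim()/node_reclaim_mode above, keeping the usual header idiom of a real prototype under CONFIG_NUMA and a do-nothing static inline stub otherwise, so callers stay free of #ifdefs. A compilable toy version of that idiom, with HAVE_NUMA as an invented stand-in for CONFIG_NUMA:

    #include <stdio.h>

    struct pglist_data;               /* opaque in this sketch */
    typedef unsigned int gfp_t;

    #ifdef HAVE_NUMA
    extern int node_reclaim_mode;
    int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order);
    #else
    #define node_reclaim_mode 0
    static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
                                   unsigned int order)
    {
        (void)pgdat; (void)mask; (void)order;
        return 0;                     /* nothing reclaimed */
    }
    #endif

    int main(void)
    {
        printf("reclaimed: %d (mode=%d)\n",
               node_reclaim(NULL, 0, 0), node_reclaim_mode);
        return 0;
    }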

View file

@ -54,7 +54,7 @@ int arch_update_cpu_topology(void);
/* /*
* If the distance between nodes in a system is larger than RECLAIM_DISTANCE * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
* (in whatever arch specific measurement units returned by node_distance()) * (in whatever arch specific measurement units returned by node_distance())
* and zone_reclaim_mode is enabled then the VM will only call zone_reclaim() * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
* on nodes within this distance. * on nodes within this distance.
*/ */
#define RECLAIM_DISTANCE 30 #define RECLAIM_DISTANCE 30

View file

@ -23,21 +23,23 @@
enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
FOR_ALL_ZONES(PGALLOC), FOR_ALL_ZONES(PGALLOC),
FOR_ALL_ZONES(ALLOCSTALL),
FOR_ALL_ZONES(PGSCAN_SKIP),
PGFREE, PGACTIVATE, PGDEACTIVATE, PGFREE, PGACTIVATE, PGDEACTIVATE,
PGFAULT, PGMAJFAULT, PGFAULT, PGMAJFAULT,
PGLAZYFREED, PGLAZYFREED,
FOR_ALL_ZONES(PGREFILL), PGREFILL,
FOR_ALL_ZONES(PGSTEAL_KSWAPD), PGSTEAL_KSWAPD,
FOR_ALL_ZONES(PGSTEAL_DIRECT), PGSTEAL_DIRECT,
FOR_ALL_ZONES(PGSCAN_KSWAPD), PGSCAN_KSWAPD,
FOR_ALL_ZONES(PGSCAN_DIRECT), PGSCAN_DIRECT,
PGSCAN_DIRECT_THROTTLE, PGSCAN_DIRECT_THROTTLE,
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
PGSCAN_ZONE_RECLAIM_FAILED, PGSCAN_ZONE_RECLAIM_FAILED,
#endif #endif
PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL,
KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
PAGEOUTRUN, ALLOCSTALL, PGROTATED, PAGEOUTRUN, PGROTATED,
DROP_PAGECACHE, DROP_SLAB, DROP_PAGECACHE, DROP_SLAB,
#ifdef CONFIG_NUMA_BALANCING #ifdef CONFIG_NUMA_BALANCING
NUMA_PTE_UPDATES, NUMA_PTE_UPDATES,
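
Editor's note: the event list above drops the per-zone variants of the reclaim counters (PGREFILL, PGSTEAL_*, PGSCAN_*) while keeping FOR_ALL_ZONES() for allocation events, and the next file adds a __count_zid_vm_events() helper that indexes those by zone id. The offset trick it relies on is easy to model in isolation; the enums below are an illustrative subset, not the real lists.

    #include <stdio.h>

    enum zone_type { ZONE_DMA, ZONE_NORMAL, ZONE_MOVABLE, MAX_NR_ZONES };

    enum vm_event_item {
        PGALLOC_DMA, PGALLOC_NORMAL, PGALLOC_MOVABLE,   /* FOR_ALL_ZONES() */
        PGREFILL,                                       /* now node-wide   */
        NR_VM_EVENT_ITEMS
    };

    static unsigned long vm_events[NR_VM_EVENT_ITEMS];

    /* Consecutive enum slots per zone let a zone id be added to the
     * item##_NORMAL slot - the same trick __count_zid_vm_events() uses. */
    #define count_zid_vm_events(item, zid, delta) \
        (vm_events[(item##_NORMAL) - ZONE_NORMAL + (zid)] += (delta))

    int main(void)
    {
        count_zid_vm_events(PGALLOC, ZONE_MOVABLE, 1);
        vm_events[PGREFILL]++;                 /* no zone index any more */
        printf("movable allocs: %lu\n", vm_events[PGALLOC_MOVABLE]);
        return 0;
    }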

View file

@ -101,25 +101,42 @@ static inline void vm_events_fold_cpu(int cpu)
#define count_vm_vmacache_event(x) do {} while (0) #define count_vm_vmacache_event(x) do {} while (0)
#endif #endif
#define __count_zone_vm_events(item, zone, delta) \ #define __count_zid_vm_events(item, zid, delta) \
__count_vm_events(item##_NORMAL - ZONE_NORMAL + \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)
zone_idx(zone), delta)
/* /*
* Zone based page accounting with per cpu differentials. * Zone and node-based page accounting with per cpu differentials.
*/ */
extern atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];
static inline void zone_page_state_add(long x, struct zone *zone, static inline void zone_page_state_add(long x, struct zone *zone,
enum zone_stat_item item) enum zone_stat_item item)
{ {
atomic_long_add(x, &zone->vm_stat[item]); atomic_long_add(x, &zone->vm_stat[item]);
atomic_long_add(x, &vm_stat[item]); atomic_long_add(x, &vm_zone_stat[item]);
}
static inline void node_page_state_add(long x, struct pglist_data *pgdat,
enum node_stat_item item)
{
atomic_long_add(x, &pgdat->vm_stat[item]);
atomic_long_add(x, &vm_node_stat[item]);
} }
static inline unsigned long global_page_state(enum zone_stat_item item) static inline unsigned long global_page_state(enum zone_stat_item item)
{ {
long x = atomic_long_read(&vm_stat[item]); long x = atomic_long_read(&vm_zone_stat[item]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
#endif
return x;
}
static inline unsigned long global_node_page_state(enum node_stat_item item)
{
long x = atomic_long_read(&vm_node_stat[item]);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
if (x < 0) if (x < 0)
x = 0; x = 0;
@ -160,32 +177,61 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
return x; return x;
} }
static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat,
enum node_stat_item item)
{
long x = atomic_long_read(&pgdat->vm_stat[item]);
#ifdef CONFIG_SMP
int cpu;
for_each_online_cpu(cpu)
x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item];
if (x < 0)
x = 0;
#endif
return x;
}
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
extern unsigned long sum_zone_node_page_state(int node,
extern unsigned long node_page_state(int node, enum zone_stat_item item); enum zone_stat_item item);
extern unsigned long node_page_state(struct pglist_data *pgdat,
enum node_stat_item item);
#else #else
#define sum_zone_node_page_state(node, item) global_page_state(item)
#define node_page_state(node, item) global_page_state(item) #define node_page_state(node, item) global_node_page_state(item)
#endif /* CONFIG_NUMA */ #endif /* CONFIG_NUMA */
#define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d) #define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d)
#define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d)) #define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d))
#define add_node_page_state(__p, __i, __d) mod_node_page_state(__p, __i, __d)
#define sub_node_page_state(__p, __i, __d) mod_node_page_state(__p, __i, -(__d))
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long); void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long);
void __inc_zone_page_state(struct page *, enum zone_stat_item); void __inc_zone_page_state(struct page *, enum zone_stat_item);
void __dec_zone_page_state(struct page *, enum zone_stat_item); void __dec_zone_page_state(struct page *, enum zone_stat_item);
void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long);
void __inc_node_page_state(struct page *, enum node_stat_item);
void __dec_node_page_state(struct page *, enum node_stat_item);
void mod_zone_page_state(struct zone *, enum zone_stat_item, long); void mod_zone_page_state(struct zone *, enum zone_stat_item, long);
void inc_zone_page_state(struct page *, enum zone_stat_item); void inc_zone_page_state(struct page *, enum zone_stat_item);
void dec_zone_page_state(struct page *, enum zone_stat_item); void dec_zone_page_state(struct page *, enum zone_stat_item);
extern void inc_zone_state(struct zone *, enum zone_stat_item); void mod_node_page_state(struct pglist_data *, enum node_stat_item, long);
void inc_node_page_state(struct page *, enum node_stat_item);
void dec_node_page_state(struct page *, enum node_stat_item);
extern void inc_node_state(struct pglist_data *, enum node_stat_item);
extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void __inc_zone_state(struct zone *, enum zone_stat_item);
extern void __inc_node_state(struct pglist_data *, enum node_stat_item);
extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_node_state(struct pglist_data *, enum node_stat_item);
void quiet_vmstat(void); void quiet_vmstat(void);
void cpu_vm_stats_fold(int cpu); void cpu_vm_stats_fold(int cpu);
@ -213,16 +259,34 @@ static inline void __mod_zone_page_state(struct zone *zone,
zone_page_state_add(delta, zone, item); zone_page_state_add(delta, zone, item);
} }
static inline void __mod_node_page_state(struct pglist_data *pgdat,
enum node_stat_item item, int delta)
{
node_page_state_add(delta, pgdat, item);
}
static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item) static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{ {
atomic_long_inc(&zone->vm_stat[item]); atomic_long_inc(&zone->vm_stat[item]);
atomic_long_inc(&vm_stat[item]); atomic_long_inc(&vm_zone_stat[item]);
}
static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
atomic_long_inc(&pgdat->vm_stat[item]);
atomic_long_inc(&vm_node_stat[item]);
} }
static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item) static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{ {
atomic_long_dec(&zone->vm_stat[item]); atomic_long_dec(&zone->vm_stat[item]);
atomic_long_dec(&vm_stat[item]); atomic_long_dec(&vm_zone_stat[item]);
}
static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
atomic_long_dec(&pgdat->vm_stat[item]);
atomic_long_dec(&vm_node_stat[item]);
} }
static inline void __inc_zone_page_state(struct page *page, static inline void __inc_zone_page_state(struct page *page,
@ -231,12 +295,26 @@ static inline void __inc_zone_page_state(struct page *page,
__inc_zone_state(page_zone(page), item); __inc_zone_state(page_zone(page), item);
} }
static inline void __inc_node_page_state(struct page *page,
enum node_stat_item item)
{
__inc_node_state(page_pgdat(page), item);
}
static inline void __dec_zone_page_state(struct page *page, static inline void __dec_zone_page_state(struct page *page,
enum zone_stat_item item) enum zone_stat_item item)
{ {
__dec_zone_state(page_zone(page), item); __dec_zone_state(page_zone(page), item);
} }
static inline void __dec_node_page_state(struct page *page,
enum node_stat_item item)
{
__dec_node_state(page_pgdat(page), item);
}
/* /*
* We only use atomic operations to update counters. So there is no need to * We only use atomic operations to update counters. So there is no need to
* disable interrupts. * disable interrupts.
@ -245,7 +323,12 @@ static inline void __dec_zone_page_state(struct page *page,
#define dec_zone_page_state __dec_zone_page_state #define dec_zone_page_state __dec_zone_page_state
#define mod_zone_page_state __mod_zone_page_state #define mod_zone_page_state __mod_zone_page_state
#define inc_node_page_state __inc_node_page_state
#define dec_node_page_state __dec_node_page_state
#define mod_node_page_state __mod_node_page_state
#define inc_zone_state __inc_zone_state #define inc_zone_state __inc_zone_state
#define inc_node_state __inc_node_state
#define dec_zone_state __dec_zone_state #define dec_zone_state __dec_zone_state
#define set_pgdat_percpu_threshold(pgdat, callback) { } #define set_pgdat_percpu_threshold(pgdat, callback) { }
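
Editor's note: the vmstat.h hunks split the counters into vm_zone_stat[] plus a new per-node vm_node_stat[], each with per-cpu differentials and a *_snapshot() reader that folds those diffs in and clamps a transiently negative sum to zero. The sketch below models just the node side with plain C11 atomics; the array sizes and the per-cpu layout are simplified stand-ins, not the kernel's percpu machinery.

    #include <stdatomic.h>
    #include <stdio.h>

    #define NR_CPUS   4
    #define NR_ITEMS  2            /* e.g. "NR_FILE_PAGES", "NR_WRITEBACK" */

    static atomic_long vm_node_stat[NR_ITEMS];         /* global totals    */
    static long nodestat_diff[NR_CPUS][NR_ITEMS];      /* unflushed deltas */

    static void node_page_state_add(long x, int item)
    {
        atomic_fetch_add(&vm_node_stat[item], x);
    }

    /* Cheap read: may briefly go negative under SMP, so clamp to zero. */
    static long node_page_state(int item)
    {
        long x = atomic_load(&vm_node_stat[item]);
        return x < 0 ? 0 : x;
    }

    /* Accurate read: fold in every cpu's unflushed differential first. */
    static long node_page_state_snapshot(int item)
    {
        long x = atomic_load(&vm_node_stat[item]);
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            x += nodestat_diff[cpu][item];
        return x < 0 ? 0 : x;
    }

    int main(void)
    {
        node_page_state_add(100, 0);
        nodestat_diff[1][0] = -3;          /* cpu 1 has not flushed yet */
        printf("fast=%ld exact=%ld\n",
               node_page_state(0), node_page_state_snapshot(0));
        return 0;
    }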

View file

@ -320,7 +320,7 @@ void laptop_mode_timer_fn(unsigned long data);
static inline void laptop_sync_completion(void) { } static inline void laptop_sync_completion(void) { }
#endif #endif
void throttle_vm_writeout(gfp_t gfp_mask); void throttle_vm_writeout(gfp_t gfp_mask);
bool zone_dirty_ok(struct zone *zone); bool node_dirty_ok(struct pglist_data *pgdat);
int wb_domain_init(struct wb_domain *dom, gfp_t gfp); int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
#ifdef CONFIG_CGROUP_WRITEBACK #ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom); void wb_domain_exit(struct wb_domain *dom);

View file

@ -226,26 +226,26 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages,
TP_PROTO( TP_PROTO(
int order, int order,
gfp_t gfp_mask, gfp_t gfp_mask,
enum migrate_mode mode), int prio),
TP_ARGS(order, gfp_mask, mode), TP_ARGS(order, gfp_mask, prio),
TP_STRUCT__entry( TP_STRUCT__entry(
__field(int, order) __field(int, order)
__field(gfp_t, gfp_mask) __field(gfp_t, gfp_mask)
__field(enum migrate_mode, mode) __field(int, prio)
), ),
TP_fast_assign( TP_fast_assign(
__entry->order = order; __entry->order = order;
__entry->gfp_mask = gfp_mask; __entry->gfp_mask = gfp_mask;
__entry->mode = mode; __entry->prio = prio;
), ),
TP_printk("order=%d gfp_mask=0x%x mode=%d", TP_printk("order=%d gfp_mask=0x%x priority=%d",
__entry->order, __entry->order,
__entry->gfp_mask, __entry->gfp_mask,
(int)__entry->mode) __entry->prio)
); );
DECLARE_EVENT_CLASS(mm_compaction_suitable_template, DECLARE_EVENT_CLASS(mm_compaction_suitable_template,

View file

@ -11,6 +11,7 @@
#define __def_gfpflag_names \ #define __def_gfpflag_names \
{(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \ {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \
{(unsigned long)GFP_TRANSHUGE_LIGHT, "GFP_TRANSHUGE_LIGHT"}, \
{(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\ {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\
{(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \ {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \
{(unsigned long)GFP_USER, "GFP_USER"}, \ {(unsigned long)GFP_USER, "GFP_USER"}, \

View file

@ -55,21 +55,23 @@ TRACE_EVENT(mm_vmscan_kswapd_sleep,
TRACE_EVENT(mm_vmscan_kswapd_wake, TRACE_EVENT(mm_vmscan_kswapd_wake,
TP_PROTO(int nid, int order), TP_PROTO(int nid, int zid, int order),
TP_ARGS(nid, order), TP_ARGS(nid, zid, order),
TP_STRUCT__entry( TP_STRUCT__entry(
__field( int, nid ) __field( int, nid )
__field( int, zid )
__field( int, order ) __field( int, order )
), ),
TP_fast_assign( TP_fast_assign(
__entry->nid = nid; __entry->nid = nid;
__entry->zid = zid;
__entry->order = order; __entry->order = order;
), ),
TP_printk("nid=%d order=%d", __entry->nid, __entry->order) TP_printk("nid=%d zid=%d order=%d", __entry->nid, __entry->zid, __entry->order)
); );
TRACE_EVENT(mm_vmscan_wakeup_kswapd, TRACE_EVENT(mm_vmscan_wakeup_kswapd,
@ -98,47 +100,50 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd,
DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,
TP_PROTO(int order, int may_writepage, gfp_t gfp_flags), TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
TP_ARGS(order, may_writepage, gfp_flags), TP_ARGS(order, may_writepage, gfp_flags, classzone_idx),
TP_STRUCT__entry( TP_STRUCT__entry(
__field( int, order ) __field( int, order )
__field( int, may_writepage ) __field( int, may_writepage )
__field( gfp_t, gfp_flags ) __field( gfp_t, gfp_flags )
__field( int, classzone_idx )
), ),
TP_fast_assign( TP_fast_assign(
__entry->order = order; __entry->order = order;
__entry->may_writepage = may_writepage; __entry->may_writepage = may_writepage;
__entry->gfp_flags = gfp_flags; __entry->gfp_flags = gfp_flags;
__entry->classzone_idx = classzone_idx;
), ),
TP_printk("order=%d may_writepage=%d gfp_flags=%s", TP_printk("order=%d may_writepage=%d gfp_flags=%s classzone_idx=%d",
__entry->order, __entry->order,
__entry->may_writepage, __entry->may_writepage,
show_gfp_flags(__entry->gfp_flags)) show_gfp_flags(__entry->gfp_flags),
__entry->classzone_idx)
); );
DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin,
TP_PROTO(int order, int may_writepage, gfp_t gfp_flags), TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
TP_ARGS(order, may_writepage, gfp_flags) TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
); );
DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin, DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin,
TP_PROTO(int order, int may_writepage, gfp_t gfp_flags), TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
TP_ARGS(order, may_writepage, gfp_flags) TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
); );
DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin, DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin,
TP_PROTO(int order, int may_writepage, gfp_t gfp_flags), TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
TP_ARGS(order, may_writepage, gfp_flags) TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
); );
DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template, DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template,
@ -266,16 +271,18 @@ TRACE_EVENT(mm_shrink_slab_end,
DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
TP_PROTO(int order, TP_PROTO(int classzone_idx,
int order,
unsigned long nr_requested, unsigned long nr_requested,
unsigned long nr_scanned, unsigned long nr_scanned,
unsigned long nr_taken, unsigned long nr_taken,
isolate_mode_t isolate_mode, isolate_mode_t isolate_mode,
int file), int file),
TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file), TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file),
TP_STRUCT__entry( TP_STRUCT__entry(
__field(int, classzone_idx)
__field(int, order) __field(int, order)
__field(unsigned long, nr_requested) __field(unsigned long, nr_requested)
__field(unsigned long, nr_scanned) __field(unsigned long, nr_scanned)
@ -285,6 +292,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
), ),
TP_fast_assign( TP_fast_assign(
__entry->classzone_idx = classzone_idx;
__entry->order = order; __entry->order = order;
__entry->nr_requested = nr_requested; __entry->nr_requested = nr_requested;
__entry->nr_scanned = nr_scanned; __entry->nr_scanned = nr_scanned;
@ -293,8 +301,9 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
__entry->file = file; __entry->file = file;
), ),
TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d", TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d",
__entry->isolate_mode, __entry->isolate_mode,
__entry->classzone_idx,
__entry->order, __entry->order,
__entry->nr_requested, __entry->nr_requested,
__entry->nr_scanned, __entry->nr_scanned,
@ -304,27 +313,29 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate,
TP_PROTO(int order, TP_PROTO(int classzone_idx,
int order,
unsigned long nr_requested, unsigned long nr_requested,
unsigned long nr_scanned, unsigned long nr_scanned,
unsigned long nr_taken, unsigned long nr_taken,
isolate_mode_t isolate_mode, isolate_mode_t isolate_mode,
int file), int file),
TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
); );
DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate, DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate,
TP_PROTO(int order, TP_PROTO(int classzone_idx,
int order,
unsigned long nr_requested, unsigned long nr_requested,
unsigned long nr_scanned, unsigned long nr_scanned,
unsigned long nr_taken, unsigned long nr_taken,
isolate_mode_t isolate_mode, isolate_mode_t isolate_mode,
int file), int file),
TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
); );
@ -352,15 +363,14 @@ TRACE_EVENT(mm_vmscan_writepage,
TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
TP_PROTO(struct zone *zone, TP_PROTO(int nid,
unsigned long nr_scanned, unsigned long nr_reclaimed, unsigned long nr_scanned, unsigned long nr_reclaimed,
int priority, int file), int priority, int file),
TP_ARGS(zone, nr_scanned, nr_reclaimed, priority, file), TP_ARGS(nid, nr_scanned, nr_reclaimed, priority, file),
TP_STRUCT__entry( TP_STRUCT__entry(
__field(int, nid) __field(int, nid)
__field(int, zid)
__field(unsigned long, nr_scanned) __field(unsigned long, nr_scanned)
__field(unsigned long, nr_reclaimed) __field(unsigned long, nr_reclaimed)
__field(int, priority) __field(int, priority)
@ -368,16 +378,15 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
), ),
TP_fast_assign( TP_fast_assign(
__entry->nid = zone_to_nid(zone); __entry->nid = nid;
__entry->zid = zone_idx(zone);
__entry->nr_scanned = nr_scanned; __entry->nr_scanned = nr_scanned;
__entry->nr_reclaimed = nr_reclaimed; __entry->nr_reclaimed = nr_reclaimed;
__entry->priority = priority; __entry->priority = priority;
__entry->reclaim_flags = trace_shrink_flags(file); __entry->reclaim_flags = trace_shrink_flags(file);
), ),
TP_printk("nid=%d zid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s", TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s",
__entry->nid, __entry->zid, __entry->nid,
__entry->nr_scanned, __entry->nr_reclaimed, __entry->nr_scanned, __entry->nr_reclaimed,
__entry->priority, __entry->priority,
show_reclaim_flags(__entry->reclaim_flags)) show_reclaim_flags(__entry->reclaim_flags))

View file

@ -412,11 +412,11 @@ TRACE_EVENT(global_dirty_state,
), ),
TP_fast_assign( TP_fast_assign(
__entry->nr_dirty = global_page_state(NR_FILE_DIRTY); __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY);
__entry->nr_writeback = global_page_state(NR_WRITEBACK); __entry->nr_writeback = global_node_page_state(NR_WRITEBACK);
__entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS); __entry->nr_unstable = global_node_page_state(NR_UNSTABLE_NFS);
__entry->nr_dirtied = global_page_state(NR_DIRTIED); __entry->nr_dirtied = global_node_page_state(NR_DIRTIED);
__entry->nr_written = global_page_state(NR_WRITTEN); __entry->nr_written = global_node_page_state(NR_WRITTEN);
__entry->background_thresh = background_thresh; __entry->background_thresh = background_thresh;
__entry->dirty_thresh = dirty_thresh; __entry->dirty_thresh = dirty_thresh;
__entry->dirty_limit = global_wb_domain.dirty_limit; __entry->dirty_limit = global_wb_domain.dirty_limit;

View file

@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
{ {
bool need_loop; bool need_loop;
/*
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
*/
if (unlikely(test_thread_flag(TIF_MEMDIE)))
return;
if (current->flags & PF_EXITING) /* Let dying task have memory */
return;
task_lock(tsk); task_lock(tsk);
/* /*
* Determine if a loop is necessary if another thread is doing * Determine if a loop is necessary if another thread is doing

View file

@ -165,20 +165,12 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
struct page *page = alloc_pages_node(node, THREADINFO_GFP, struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER); THREAD_SIZE_ORDER);
if (page)
memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
1 << THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL; return page ? page_address(page) : NULL;
} }
static inline void free_thread_stack(unsigned long *stack) static inline void free_thread_stack(unsigned long *stack)
{ {
struct page *page = virt_to_page(stack); __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
-(1 << THREAD_SIZE_ORDER));
__free_pages(page, THREAD_SIZE_ORDER);
} }
# else # else
static struct kmem_cache *thread_stack_cache; static struct kmem_cache *thread_stack_cache;
@ -223,9 +215,15 @@ static struct kmem_cache *mm_cachep;
static void account_kernel_stack(unsigned long *stack, int account) static void account_kernel_stack(unsigned long *stack, int account)
{ {
struct zone *zone = page_zone(virt_to_page(stack)); /* All stack pages are in the same zone and belong to the same memcg. */
struct page *first_page = virt_to_page(stack);
mod_zone_page_state(zone, NR_KERNEL_STACK, account); mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
THREAD_SIZE / 1024 * account);
memcg_kmem_update_page_stat(
first_page, MEMCG_KERNEL_STACK_KB,
account * (THREAD_SIZE / 1024));
} }
void free_task(struct task_struct *tsk) void free_task(struct task_struct *tsk)
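
Editor's note: account_kernel_stack() above switches from counting whole stacks (NR_KERNEL_STACK) to counting kilobytes (NR_KERNEL_STACK_KB, plus the memcg-side MEMCG_KERNEL_STACK_KB), so stacks that are not an exact number of pages still sum correctly. A trivial standalone model of the bookkeeping, with THREAD_SIZE assumed to be 16 KiB rather than taken from any real architecture:

    #include <stdio.h>

    #define THREAD_SIZE (16 * 1024)        /* assumed stack size in bytes */

    static long nr_kernel_stack_kb;        /* models NR_KERNEL_STACK_KB   */

    static void account_kernel_stack(int account)   /* account = +1 / -1 */
    {
        nr_kernel_stack_kb += account * (THREAD_SIZE / 1024);
    }

    int main(void)
    {
        account_kernel_stack(+1);          /* fork                        */
        account_kernel_stack(+1);          /* fork                        */
        account_kernel_stack(-1);          /* exit                        */
        printf("KernelStack: %ld kB\n", nr_kernel_stack_kb);
        return 0;
    }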

View file

@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p)
if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
return false; return false;
if (test_thread_flag(TIF_MEMDIE)) if (test_tsk_thread_flag(p, TIF_MEMDIE))
return false; return false;
if (pm_nosig_freezing || cgroup_freezing(p)) if (pm_nosig_freezing || cgroup_freezing(p))

View file

@ -308,12 +308,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (is_ram == REGION_INTERSECTS) if (is_ram == REGION_INTERSECTS)
return __va(res->start); return __va(res->start);
if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
__func__);
return ERR_PTR(-ENXIO);
}
if (!ref) if (!ref)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
@ -401,7 +395,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
altmap->alloc -= nr_pfns; altmap->alloc -= nr_pfns;
} }
#ifdef CONFIG_SPARSEMEM_VMEMMAP
struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
{ {
/* /*
@ -427,5 +420,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
return pgmap ? pgmap->altmap : NULL; return pgmap ? pgmap->altmap : NULL;
} }
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
#endif /* CONFIG_ZONE_DEVICE */ #endif /* CONFIG_ZONE_DEVICE */

View file

@ -1627,11 +1627,11 @@ static unsigned long minimum_image_size(unsigned long saveable)
unsigned long size; unsigned long size;
size = global_page_state(NR_SLAB_RECLAIMABLE) size = global_page_state(NR_SLAB_RECLAIMABLE)
+ global_page_state(NR_ACTIVE_ANON) + global_node_page_state(NR_ACTIVE_ANON)
+ global_page_state(NR_INACTIVE_ANON) + global_node_page_state(NR_INACTIVE_ANON)
+ global_page_state(NR_ACTIVE_FILE) + global_node_page_state(NR_ACTIVE_FILE)
+ global_page_state(NR_INACTIVE_FILE) + global_node_page_state(NR_INACTIVE_FILE)
- global_page_state(NR_FILE_MAPPED); - global_node_page_state(NR_FILE_MAPPED);
return saveable <= size ? 0 : saveable - size; return saveable <= size ? 0 : saveable - size;
} }

View file

@ -3177,9 +3177,8 @@ void show_regs_print_info(const char *log_lvl)
{ {
dump_stack_print_info(log_lvl); dump_stack_print_info(log_lvl);
printk("%stask: %p ti: %p task.ti: %p\n", printk("%stask: %p task.stack: %p\n",
log_lvl, current, current_thread_info(), log_lvl, current, task_stack_page(current));
task_thread_info(current));
} }
#endif #endif

View file

@ -1508,8 +1508,8 @@ static struct ctl_table vm_table[] = {
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
{ {
.procname = "zone_reclaim_mode", .procname = "zone_reclaim_mode",
.data = &zone_reclaim_mode, .data = &node_reclaim_mode,
.maxlen = sizeof(zone_reclaim_mode), .maxlen = sizeof(node_reclaim_mode),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec, .proc_handler = proc_dointvec,
.extra1 = &zero, .extra1 = &zero,

View file

@ -5,9 +5,9 @@ if HAVE_ARCH_KASAN
config KASAN config KASAN
bool "KASan: runtime memory debugger" bool "KASan: runtime memory debugger"
depends on SLUB_DEBUG || (SLAB && !DEBUG_SLAB) depends on SLUB || (SLAB && !DEBUG_SLAB)
select CONSTRUCTORS select CONSTRUCTORS
select STACKDEPOT if SLAB select STACKDEPOT
help help
Enables kernel address sanitizer - runtime memory debugger, Enables kernel address sanitizer - runtime memory debugger,
designed to find out-of-bounds accesses and use-after-free bugs. designed to find out-of-bounds accesses and use-after-free bugs.

View file

@ -144,7 +144,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
buf = iov->iov_base + skip; buf = iov->iov_base + skip;
copy = min(bytes, iov->iov_len - skip); copy = min(bytes, iov->iov_len - skip);
if (!fault_in_pages_writeable(buf, copy)) { if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
kaddr = kmap_atomic(page); kaddr = kmap_atomic(page);
from = kaddr + offset; from = kaddr + offset;
@ -175,6 +175,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
copy = min(bytes, iov->iov_len - skip); copy = min(bytes, iov->iov_len - skip);
} }
/* Too bad - revert to non-atomic kmap */ /* Too bad - revert to non-atomic kmap */
kaddr = kmap(page); kaddr = kmap(page);
from = kaddr + offset; from = kaddr + offset;
left = __copy_to_user(buf, from, copy); left = __copy_to_user(buf, from, copy);
@ -193,6 +194,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
bytes -= copy; bytes -= copy;
} }
kunmap(page); kunmap(page);
done: done:
if (skip == iov->iov_len) { if (skip == iov->iov_len) {
iov++; iov++;
@ -225,7 +227,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
buf = iov->iov_base + skip; buf = iov->iov_base + skip;
copy = min(bytes, iov->iov_len - skip); copy = min(bytes, iov->iov_len - skip);
if (!fault_in_pages_readable(buf, copy)) { if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
kaddr = kmap_atomic(page); kaddr = kmap_atomic(page);
to = kaddr + offset; to = kaddr + offset;
@ -256,6 +258,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
copy = min(bytes, iov->iov_len - skip); copy = min(bytes, iov->iov_len - skip);
} }
/* Too bad - revert to non-atomic kmap */ /* Too bad - revert to non-atomic kmap */
kaddr = kmap(page); kaddr = kmap(page);
to = kaddr + offset; to = kaddr + offset;
left = __copy_from_user(to, buf, copy); left = __copy_from_user(to, buf, copy);
@ -274,6 +277,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
bytes -= copy; bytes -= copy;
} }
kunmap(page); kunmap(page);
done: done:
if (skip == iov->iov_len) { if (skip == iov->iov_len) {
iov++; iov++;
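
Editor's note: the iov_iter hunks gate the kmap_atomic fast path on IS_ENABLED(CONFIG_HIGHMEM); without highmem, kmap_atomic() and the fault-in probe buy nothing, so the copy goes straight to the plain path. A loose userspace model of that compile-time gating, with CONFIG_HIGHMEM faked as a macro and the fast path reduced to a comment:

    #include <stdio.h>
    #include <string.h>

    #define CONFIG_HIGHMEM 0   /* flip to 1 to model a 32-bit highmem box */

    static int fault_in_ok(const char *buf, size_t len)
    {
        (void)buf; (void)len;
        return 1;              /* pretend the writability probe succeeds */
    }

    static size_t copy_to_user_model(char *dst, const char *src, size_t len)
    {
        if (CONFIG_HIGHMEM && fault_in_ok(dst, len)) {
            /* with highmem, an atomic-kmap fast path would attempt the
             * copy here and skip the fallback on success */
        }
        /* fallback: plain copy, always correct, no probe needed */
        memcpy(dst, src, len);
        return len;
    }

    int main(void)
    {
        char dst[8];
        printf("copied %zu bytes\n", copy_to_user_model(dst, "hi", 3));
        return 0;
    }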

View file

@ -242,6 +242,7 @@ depot_stack_handle_t depot_save_stack(struct stack_trace *trace,
*/ */
alloc_flags &= ~GFP_ZONEMASK; alloc_flags &= ~GFP_ZONEMASK;
alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
alloc_flags |= __GFP_NOWARN;
page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER); page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER);
if (page) if (page)
prealloc = page_address(page); prealloc = page_address(page);

View file

@ -681,7 +681,7 @@ config IDLE_PAGE_TRACKING
See Documentation/vm/idle_page_tracking.txt for more details. See Documentation/vm/idle_page_tracking.txt for more details.
config ZONE_DEVICE config ZONE_DEVICE
bool "Device memory (pmem, etc...) hotplug support" if EXPERT bool "Device memory (pmem, etc...) hotplug support"
depends on MEMORY_HOTPLUG depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP depends on SPARSEMEM_VMEMMAP

View file

@ -947,24 +947,24 @@ long congestion_wait(int sync, long timeout)
EXPORT_SYMBOL(congestion_wait); EXPORT_SYMBOL(congestion_wait);
/** /**
* wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
* @zone: A zone to check if it is heavily congested * @pgdat: A pgdat to check if it is heavily congested
* @sync: SYNC or ASYNC IO * @sync: SYNC or ASYNC IO
* @timeout: timeout in jiffies * @timeout: timeout in jiffies
* *
* In the event of a congested backing_dev (any backing_dev) and the given * In the event of a congested backing_dev (any backing_dev) and the given
* @zone has experienced recent congestion, this waits for up to @timeout * @pgdat has experienced recent congestion, this waits for up to @timeout
* jiffies for either a BDI to exit congestion of the given @sync queue * jiffies for either a BDI to exit congestion of the given @sync queue
* or a write to complete. * or a write to complete.
* *
* In the absence of zone congestion, cond_resched() is called to yield * In the absence of pgdat congestion, cond_resched() is called to yield
* the processor if necessary but otherwise does not sleep. * the processor if necessary but otherwise does not sleep.
* *
* The return value is 0 if the sleep is for the full timeout. Otherwise, * The return value is 0 if the sleep is for the full timeout. Otherwise,
* it is the number of jiffies that were still remaining when the function * it is the number of jiffies that were still remaining when the function
* returned. return_value == timeout implies the function did not sleep. * returned. return_value == timeout implies the function did not sleep.
*/ */
long wait_iff_congested(struct zone *zone, int sync, long timeout) long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout)
{ {
long ret; long ret;
unsigned long start = jiffies; unsigned long start = jiffies;
@ -973,12 +973,13 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
/* /*
* If there is no congestion, or heavy congestion is not being * If there is no congestion, or heavy congestion is not being
* encountered in the current zone, yield if necessary instead * encountered in the current pgdat, yield if necessary instead
* of sleeping on the congestion queue * of sleeping on the congestion queue
*/ */
if (atomic_read(&nr_wb_congested[sync]) == 0 || if (atomic_read(&nr_wb_congested[sync]) == 0 ||
!test_bit(ZONE_CONGESTED, &zone->flags)) { !test_bit(PGDAT_CONGESTED, &pgdat->flags)) {
cond_resched(); cond_resched();
/* In case we scheduled, work out time remaining */ /* In case we scheduled, work out time remaining */
ret = timeout - (jiffies - start); ret = timeout - (jiffies - start);
if (ret < 0) if (ret < 0)
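
Editor's note: wait_iff_congested() now keys off the per-node PGDAT_CONGESTED bit rather than a zone flag, but its contract is unchanged: only yield when nothing is congested, and report how much of the timeout remains. A toy model of that return-value contract, with jiffies faked as a plain counter:

    #include <stdbool.h>
    #include <stdio.h>

    static unsigned long jiffies;                  /* fake clock          */

    static long wait_iff_congested_model(bool pgdat_congested, long timeout)
    {
        unsigned long start = jiffies;
        long ret;

        if (!pgdat_congested) {
            jiffies += 1;                          /* cond_resched() cost */
            ret = timeout - (long)(jiffies - start);
            return ret < 0 ? 0 : ret;              /* time still left     */
        }
        jiffies += timeout;                        /* slept the full time */
        return 0;
    }

    int main(void)
    {
        printf("uncongested: %ld left\n", wait_iff_congested_model(false, 10));
        printf("congested:   %ld left\n", wait_iff_congested_model(true, 10));
        return 0;
    }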

View file

@ -331,7 +331,7 @@ static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
{ {
if (cc->mode == MIGRATE_ASYNC) { if (cc->mode == MIGRATE_ASYNC) {
if (!spin_trylock_irqsave(lock, *flags)) { if (!spin_trylock_irqsave(lock, *flags)) {
cc->contended = COMPACT_CONTENDED_LOCK; cc->contended = true;
return false; return false;
} }
} else { } else {
@ -365,13 +365,13 @@ static bool compact_unlock_should_abort(spinlock_t *lock,
} }
if (fatal_signal_pending(current)) { if (fatal_signal_pending(current)) {
cc->contended = COMPACT_CONTENDED_SCHED; cc->contended = true;
return true; return true;
} }
if (need_resched()) { if (need_resched()) {
if (cc->mode == MIGRATE_ASYNC) { if (cc->mode == MIGRATE_ASYNC) {
cc->contended = COMPACT_CONTENDED_SCHED; cc->contended = true;
return true; return true;
} }
cond_resched(); cond_resched();
@ -394,7 +394,7 @@ static inline bool compact_should_abort(struct compact_control *cc)
/* async compaction aborts if contended */ /* async compaction aborts if contended */
if (need_resched()) { if (need_resched()) {
if (cc->mode == MIGRATE_ASYNC) { if (cc->mode == MIGRATE_ASYNC) {
cc->contended = COMPACT_CONTENDED_SCHED; cc->contended = true;
return true; return true;
} }
@ -646,8 +646,8 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
list_for_each_entry(page, &cc->migratepages, lru) list_for_each_entry(page, &cc->migratepages, lru)
count[!!page_is_file_cache(page)]++; count[!!page_is_file_cache(page)]++;
mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]);
mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]);
} }
/* Similar to reclaim, but different enough that they don't share logic */ /* Similar to reclaim, but different enough that they don't share logic */
@ -655,12 +655,12 @@ static bool too_many_isolated(struct zone *zone)
{ {
unsigned long active, inactive, isolated; unsigned long active, inactive, isolated;
inactive = zone_page_state(zone, NR_INACTIVE_FILE) + inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
zone_page_state(zone, NR_INACTIVE_ANON); node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
active = zone_page_state(zone, NR_ACTIVE_FILE) + active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) +
zone_page_state(zone, NR_ACTIVE_ANON); node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON);
isolated = zone_page_state(zone, NR_ISOLATED_FILE) + isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) +
zone_page_state(zone, NR_ISOLATED_ANON); node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON);
return isolated > (inactive + active) / 2; return isolated > (inactive + active) / 2;
} }
@ -752,7 +752,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* if contended. * if contended.
*/ */
if (!(low_pfn % SWAP_CLUSTER_MAX) if (!(low_pfn % SWAP_CLUSTER_MAX)
&& compact_unlock_should_abort(&zone->lru_lock, flags, && compact_unlock_should_abort(zone_lru_lock(zone), flags,
&locked, cc)) &locked, cc))
break; break;
@ -813,7 +813,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (unlikely(__PageMovable(page)) && if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) { !PageIsolated(page)) {
if (locked) { if (locked) {
spin_unlock_irqrestore(&zone->lru_lock, spin_unlock_irqrestore(zone_lru_lock(zone),
flags); flags);
locked = false; locked = false;
} }
@ -836,7 +836,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* If we already hold the lock, we can skip some rechecking */ /* If we already hold the lock, we can skip some rechecking */
if (!locked) { if (!locked) {
locked = compact_trylock_irqsave(&zone->lru_lock, locked = compact_trylock_irqsave(zone_lru_lock(zone),
&flags, cc); &flags, cc);
if (!locked) if (!locked)
break; break;
@ -856,7 +856,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
} }
} }
lruvec = mem_cgroup_page_lruvec(page, zone); lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
/* Try isolate the page */ /* Try isolate the page */
if (__isolate_lru_page(page, isolate_mode) != 0) if (__isolate_lru_page(page, isolate_mode) != 0)
@ -899,7 +899,7 @@ isolate_fail:
*/ */
if (nr_isolated) { if (nr_isolated) {
if (locked) { if (locked) {
spin_unlock_irqrestore(&zone->lru_lock, flags); spin_unlock_irqrestore(zone_lru_lock(zone), flags);
locked = false; locked = false;
} }
acct_isolated(zone, cc); acct_isolated(zone, cc);
@ -927,7 +927,7 @@ isolate_fail:
low_pfn = end_pfn; low_pfn = end_pfn;
if (locked) if (locked)
spin_unlock_irqrestore(&zone->lru_lock, flags); spin_unlock_irqrestore(zone_lru_lock(zone), flags);
/* /*
* Update the pageblock-skip information and cached scanner pfn, * Update the pageblock-skip information and cached scanner pfn,
@ -1200,7 +1200,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct page *page; struct page *page;
const isolate_mode_t isolate_mode = const isolate_mode_t isolate_mode =
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
(cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
/* /*
* Start at where we last stopped, or beginning of the zone as * Start at where we last stopped, or beginning of the zone as
@ -1619,14 +1619,11 @@ out:
trace_mm_compaction_end(start_pfn, cc->migrate_pfn, trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync, ret); cc->free_pfn, end_pfn, sync, ret);
if (ret == COMPACT_CONTENDED)
ret = COMPACT_PARTIAL;
return ret; return ret;
} }
static enum compact_result compact_zone_order(struct zone *zone, int order, static enum compact_result compact_zone_order(struct zone *zone, int order,
gfp_t gfp_mask, enum migrate_mode mode, int *contended, gfp_t gfp_mask, enum compact_priority prio,
unsigned int alloc_flags, int classzone_idx) unsigned int alloc_flags, int classzone_idx)
{ {
enum compact_result ret; enum compact_result ret;
@ -1636,7 +1633,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.order = order, .order = order,
.gfp_mask = gfp_mask, .gfp_mask = gfp_mask,
.zone = zone, .zone = zone,
.mode = mode, .mode = (prio == COMPACT_PRIO_ASYNC) ?
MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
.alloc_flags = alloc_flags, .alloc_flags = alloc_flags,
.classzone_idx = classzone_idx, .classzone_idx = classzone_idx,
.direct_compaction = true, .direct_compaction = true,
@ -1649,7 +1647,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages)); VM_BUG_ON(!list_empty(&cc.migratepages));
*contended = cc.contended;
return ret; return ret;
} }
@ -1662,50 +1659,38 @@ int sysctl_extfrag_threshold = 500;
* @alloc_flags: The allocation flags of the current allocation * @alloc_flags: The allocation flags of the current allocation
* @ac: The context of current allocation * @ac: The context of current allocation
* @mode: The migration mode for async, sync light, or sync migration * @mode: The migration mode for async, sync light, or sync migration
* @contended: Return value that determines if compaction was aborted due to
* need_resched() or lock contention
* *
* This is the main entry point for direct page compaction. * This is the main entry point for direct page compaction.
*/ */
enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac, unsigned int alloc_flags, const struct alloc_context *ac,
enum migrate_mode mode, int *contended) enum compact_priority prio)
{ {
int may_enter_fs = gfp_mask & __GFP_FS; int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO; int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z; struct zoneref *z;
struct zone *zone; struct zone *zone;
enum compact_result rc = COMPACT_SKIPPED; enum compact_result rc = COMPACT_SKIPPED;
int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
*contended = COMPACT_CONTENDED_NONE;
/* Check if the GFP flags allow compaction */ /* Check if the GFP flags allow compaction */
if (!order || !may_enter_fs || !may_perform_io) if (!may_enter_fs || !may_perform_io)
return COMPACT_SKIPPED; return COMPACT_SKIPPED;
trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
/* Compact each zone in the list */ /* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) { ac->nodemask) {
enum compact_result status; enum compact_result status;
int zone_contended;
if (compaction_deferred(zone, order)) { if (compaction_deferred(zone, order)) {
rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
continue; continue;
} }
status = compact_zone_order(zone, order, gfp_mask, mode, status = compact_zone_order(zone, order, gfp_mask, prio,
&zone_contended, alloc_flags, alloc_flags, ac_classzone_idx(ac));
ac_classzone_idx(ac));
rc = max(status, rc); rc = max(status, rc);
/*
* It takes at least one zone that wasn't lock contended
* to clear all_zones_contended.
*/
all_zones_contended &= zone_contended;
/* If a normal allocation would succeed, stop compacting */ /* If a normal allocation would succeed, stop compacting */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone), if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
@ -1717,59 +1702,29 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
* succeeds in this zone. * succeeds in this zone.
*/ */
compaction_defer_reset(zone, order, false); compaction_defer_reset(zone, order, false);
/*
* It is possible that async compaction aborted due to
* need_resched() and the watermarks were ok thanks to
* somebody else freeing memory. The allocation can
* however still fail so we better signal the
* need_resched() contention anyway (this will not
* prevent the allocation attempt).
*/
if (zone_contended == COMPACT_CONTENDED_SCHED)
*contended = COMPACT_CONTENDED_SCHED;
goto break_loop; break;
} }
if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE || if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE ||
status == COMPACT_PARTIAL_SKIPPED)) { status == COMPACT_PARTIAL_SKIPPED))
/* /*
* We think that allocation won't succeed in this zone * We think that allocation won't succeed in this zone
* so we defer compaction there. If it ends up * so we defer compaction there. If it ends up
* succeeding after all, it will be reset. * succeeding after all, it will be reset.
*/ */
defer_compaction(zone, order); defer_compaction(zone, order);
}
/* /*
* We might have stopped compacting due to need_resched() in * We might have stopped compacting due to need_resched() in
* async compaction, or due to a fatal signal detected. In that * async compaction, or due to a fatal signal detected. In that
* case do not try further zones and signal need_resched() * case do not try further zones
* contention.
*/ */
if ((zone_contended == COMPACT_CONTENDED_SCHED) if ((prio == COMPACT_PRIO_ASYNC && need_resched())
|| fatal_signal_pending(current)) { || fatal_signal_pending(current))
*contended = COMPACT_CONTENDED_SCHED; break;
goto break_loop;
}
continue;
break_loop:
/*
* We might not have tried all the zones, so be conservative
* and assume they are not all lock contended.
*/
all_zones_contended = 0;
break;
} }
/*
* If at least one zone wasn't deferred or skipped, we report if all
* zones that were tried were lock contended.
*/
if (rc > COMPACT_INACTIVE && all_zones_contended)
*contended = COMPACT_CONTENDED_LOCK;
return rc; return rc;
} }
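
Editor's note: the compaction rework above replaces the COMPACT_CONTENDED_{NONE,LOCK,SCHED} plumbing with a single bool and derives the migration mode from a compaction priority passed down by the page allocator. A small sketch of just that control-flow shape; the enums are illustrative stand-ins for the ones in compaction.h and migrate_mode.h:

    #include <stdbool.h>
    #include <stdio.h>

    enum compact_priority { COMPACT_PRIO_SYNC_LIGHT, COMPACT_PRIO_ASYNC };
    enum migrate_mode     { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC };

    struct compact_control {
        enum migrate_mode mode;
        bool contended;            /* was: an int with three distinct values */
    };

    static void compact_zone_order_model(struct compact_control *cc,
                                         enum compact_priority prio)
    {
        cc->mode = (prio == COMPACT_PRIO_ASYNC) ? MIGRATE_ASYNC
                                                : MIGRATE_SYNC_LIGHT;
        cc->contended = false;
        /* ... the scanners simply set cc->contended = true on any
         * contention, whether a failed trylock or need_resched() ... */
    }

    int main(void)
    {
        struct compact_control cc;
        compact_zone_order_model(&cc, COMPACT_PRIO_ASYNC);
        printf("mode=%d contended=%d\n", cc.mode, cc.contended);
        return 0;
    }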

View file

@ -95,8 +95,8 @@
* ->swap_lock (try_to_unmap_one) * ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one)
* ->tree_lock (try_to_unmap_one) * ->tree_lock (try_to_unmap_one)
* ->zone.lru_lock (follow_page->mark_page_accessed) * ->zone_lru_lock(zone) (follow_page->mark_page_accessed)
* ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty) * ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
@ -218,11 +218,11 @@ void __delete_from_page_cache(struct page *page, void *shadow)
/* hugetlb pages do not participate in page cache accounting. */ /* hugetlb pages do not participate in page cache accounting. */
if (!PageHuge(page)) if (!PageHuge(page))
__mod_zone_page_state(page_zone(page), NR_FILE_PAGES, -nr); __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
if (PageSwapBacked(page)) { if (PageSwapBacked(page)) {
__mod_zone_page_state(page_zone(page), NR_SHMEM, -nr); __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
if (PageTransHuge(page)) if (PageTransHuge(page))
__dec_zone_page_state(page, NR_SHMEM_THPS); __dec_node_page_state(page, NR_SHMEM_THPS);
} else { } else {
VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page); VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page);
} }
@ -568,9 +568,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
* hugetlb pages do not participate in page cache accounting. * hugetlb pages do not participate in page cache accounting.
*/ */
if (!PageHuge(new)) if (!PageHuge(new))
__inc_zone_page_state(new, NR_FILE_PAGES); __inc_node_page_state(new, NR_FILE_PAGES);
if (PageSwapBacked(new)) if (PageSwapBacked(new))
__inc_zone_page_state(new, NR_SHMEM); __inc_node_page_state(new, NR_SHMEM);
spin_unlock_irqrestore(&mapping->tree_lock, flags); spin_unlock_irqrestore(&mapping->tree_lock, flags);
mem_cgroup_migrate(old, new); mem_cgroup_migrate(old, new);
radix_tree_preload_end(); radix_tree_preload_end();
@ -677,7 +677,7 @@ static int __add_to_page_cache_locked(struct page *page,
/* hugetlb pages do not participate in page cache accounting. */ /* hugetlb pages do not participate in page cache accounting. */
if (!huge) if (!huge)
__inc_zone_page_state(page, NR_FILE_PAGES); __inc_node_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock); spin_unlock_irq(&mapping->tree_lock);
if (!huge) if (!huge)
mem_cgroup_commit_charge(page, memcg, false, false); mem_cgroup_commit_charge(page, memcg, false, false);

View file

@ -539,23 +539,26 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
} }
/* /*
* If THP is set to always then directly reclaim/compact as necessary * If THP defrag is set to always then directly reclaim/compact as necessary
* If set to defer then do no reclaim and defer to khugepaged * If set to defer then do only background reclaim/compact and defer to khugepaged
* If set to madvise and the VMA is flagged then directly reclaim/compact * If set to madvise and the VMA is flagged then directly reclaim/compact
* When direct reclaim/compact is allowed, don't retry except for flagged VMA's
*/ */
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
{ {
gfp_t reclaim_flags = 0; bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) && if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
(vma->vm_flags & VM_HUGEPAGE)) &transparent_hugepage_flags) && vma_madvised)
reclaim_flags = __GFP_DIRECT_RECLAIM; return GFP_TRANSHUGE;
else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
reclaim_flags = __GFP_KSWAPD_RECLAIM; &transparent_hugepage_flags))
else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
reclaim_flags = __GFP_DIRECT_RECLAIM; else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
&transparent_hugepage_flags))
return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
return GFP_TRANSHUGE | reclaim_flags; return GFP_TRANSHUGE_LIGHT;
} }
/* Caller must hold page table lock. */ /* Caller must hold page table lock. */
@ -1249,25 +1252,26 @@ out:
return 0; return 0;
} }
int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, /*
* Return true if we do MADV_FREE successfully on entire pmd page.
* Otherwise, return false.
*/
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr, unsigned long next) pmd_t *pmd, unsigned long addr, unsigned long next)
{ {
spinlock_t *ptl; spinlock_t *ptl;
pmd_t orig_pmd; pmd_t orig_pmd;
struct page *page; struct page *page;
struct mm_struct *mm = tlb->mm; struct mm_struct *mm = tlb->mm;
int ret = 0; bool ret = false;
ptl = pmd_trans_huge_lock(pmd, vma); ptl = pmd_trans_huge_lock(pmd, vma);
if (!ptl) if (!ptl)
goto out_unlocked; goto out_unlocked;
orig_pmd = *pmd; orig_pmd = *pmd;
if (is_huge_zero_pmd(orig_pmd)) { if (is_huge_zero_pmd(orig_pmd))
ret = 1;
goto out; goto out;
}
page = pmd_page(orig_pmd); page = pmd_page(orig_pmd);
/* /*
@ -1309,7 +1313,7 @@ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
set_pmd_at(mm, addr, pmd, orig_pmd); set_pmd_at(mm, addr, pmd, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr); tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
} }
ret = 1; ret = true;
out: out:
spin_unlock(ptl); spin_unlock(ptl);
out_unlocked: out_unlocked:
@ -1586,7 +1590,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
/* Last compound_mapcount is gone. */ /* Last compound_mapcount is gone. */
__dec_zone_page_state(page, NR_ANON_THPS); __dec_node_page_state(page, NR_ANON_THPS);
if (TestClearPageDoubleMap(page)) { if (TestClearPageDoubleMap(page)) {
/* No need in mapcount reference anymore */ /* No need in mapcount reference anymore */
for (i = 0; i < HPAGE_PMD_NR; i++) for (i = 0; i < HPAGE_PMD_NR; i++)
@ -1818,7 +1822,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
pgoff_t end = -1; pgoff_t end = -1;
int i; int i;
lruvec = mem_cgroup_page_lruvec(head, zone); lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
/* complete memcg works before add pages to LRU */ /* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head); mem_cgroup_split_huge_fixup(head);
@ -1848,7 +1852,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
spin_unlock(&head->mapping->tree_lock); spin_unlock(&head->mapping->tree_lock);
} }
spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags); spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
unfreeze_page(head); unfreeze_page(head);
@ -2034,7 +2038,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
lru_add_drain(); lru_add_drain();
/* prevent PageLRU to go away from under us, and freeze lru stats */ /* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irqsave(&page_zone(head)->lru_lock, flags); spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
if (mapping) { if (mapping) {
void **pslot; void **pslot;
@ -2061,7 +2065,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
list_del(page_deferred_list(head)); list_del(page_deferred_list(head));
} }
if (mapping) if (mapping)
__dec_zone_page_state(page, NR_SHMEM_THPS); __dec_node_page_state(page, NR_SHMEM_THPS);
spin_unlock(&pgdata->split_queue_lock); spin_unlock(&pgdata->split_queue_lock);
__split_huge_page(page, list, flags); __split_huge_page(page, list, flags);
ret = 0; ret = 0;
@ -2077,7 +2081,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
spin_unlock(&pgdata->split_queue_lock); spin_unlock(&pgdata->split_queue_lock);
fail: if (mapping) fail: if (mapping)
spin_unlock(&mapping->tree_lock); spin_unlock(&mapping->tree_lock);
spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags); spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
unfreeze_page(head); unfreeze_page(head);
ret = -EBUSY; ret = -EBUSY;
} }
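As a quick sanity check on the reworked alloc_hugepage_direct_gfpmask() above, here is a standalone userspace model of the same decision table: the THP defrag setting plus the VM_HUGEPAGE madvise state mapped to a gfp mask. The flag values below are arbitrary placeholders rather than the real gfp.h bits, and the enum names are invented for the example; only the branch structure mirrors the hunk.

/*
 * Userspace model of the defrag-setting -> gfp mask mapping shown above.
 * Flag values are placeholders, not the real gfp.h definitions.
 */
#include <stdbool.h>
#include <stdio.h>

#define GFP_TRANSHUGE        0x1u  /* placeholder: THP alloc with direct reclaim */
#define GFP_TRANSHUGE_LIGHT  0x2u  /* placeholder: THP alloc, no reclaim at all */
#define __GFP_KSWAPD_RECLAIM 0x4u  /* placeholder */
#define __GFP_NORETRY        0x8u  /* placeholder */

enum thp_defrag { DEFRAG_ALWAYS, DEFRAG_DEFER, DEFRAG_MADVISE, DEFRAG_NEVER };

static unsigned int thp_gfpmask(enum thp_defrag setting, bool vma_madvised)
{
	switch (setting) {
	case DEFRAG_MADVISE:
		/* only madvised VMAs get direct reclaim/compaction */
		return vma_madvised ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
	case DEFRAG_DEFER:
		/* background reclaim/compaction only, defer to khugepaged */
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
	case DEFRAG_ALWAYS:
		/* direct reclaim/compaction, but don't retry unless madvised */
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
	default:
		return GFP_TRANSHUGE_LIGHT;
	}
}

int main(void)
{
	printf("always, not madvised: %#x\n", thp_gfpmask(DEFRAG_ALWAYS, false));
	printf("defer:                %#x\n", thp_gfpmask(DEFRAG_DEFER, false));
	printf("madvise + madvised:   %#x\n", thp_gfpmask(DEFRAG_MADVISE, true));
	return 0;
}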

View file

@ -4391,7 +4391,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
/* /*
* This function is called from memory failure code. * This function is called from memory failure code.
* Assume the caller holds page lock of the head page.
*/ */
int dequeue_hwpoisoned_huge_page(struct page *hpage) int dequeue_hwpoisoned_huge_page(struct page *hpage)
{ {

View file

@ -78,7 +78,7 @@ extern unsigned long highest_memmap_pfn;
*/ */
extern int isolate_lru_page(struct page *page); extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page); extern void putback_lru_page(struct page *page);
extern bool zone_reclaimable(struct zone *zone); extern bool pgdat_reclaimable(struct pglist_data *pgdat);
/* /*
* in mm/rmap.c: * in mm/rmap.c:
@ -185,10 +185,7 @@ struct compact_control {
const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */
const int classzone_idx; /* zone index of a direct compactor */ const int classzone_idx; /* zone index of a direct compactor */
struct zone *zone; struct zone *zone;
int contended; /* Signal need_sched() or lock bool contended; /* Signal lock or sched contention */
* contention detected during
* compaction
*/
}; };
unsigned long unsigned long
@ -433,10 +430,10 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
} }
#endif /* CONFIG_SPARSEMEM */ #endif /* CONFIG_SPARSEMEM */
#define ZONE_RECLAIM_NOSCAN -2 #define NODE_RECLAIM_NOSCAN -2
#define ZONE_RECLAIM_FULL -1 #define NODE_RECLAIM_FULL -1
#define ZONE_RECLAIM_SOME 0 #define NODE_RECLAIM_SOME 0
#define ZONE_RECLAIM_SUCCESS 1 #define NODE_RECLAIM_SUCCESS 1
extern int hwpoison_filter(struct page *p); extern int hwpoison_filter(struct page *p);
@ -467,7 +464,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
#define ALLOC_FAIR 0x100 /* fair zone allocation */
enum ttu_flags; enum ttu_flags;
struct tlbflush_unmap_batch; struct tlbflush_unmap_batch;

View file

@ -7,5 +7,4 @@ CFLAGS_REMOVE_kasan.o = -pg
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
obj-y := kasan.o report.o kasan_init.o obj-y := kasan.o report.o kasan_init.o quarantine.o
obj-$(CONFIG_SLAB) += quarantine.o

View file

@ -351,7 +351,6 @@ void kasan_free_pages(struct page *page, unsigned int order)
KASAN_FREE_PAGE); KASAN_FREE_PAGE);
} }
#ifdef CONFIG_SLAB
/* /*
* Adaptive redzone policy taken from the userspace AddressSanitizer runtime. * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
* For larger allocations larger redzones are used. * For larger allocations larger redzones are used.
@ -373,16 +372,8 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
unsigned long *flags) unsigned long *flags)
{ {
int redzone_adjust; int redzone_adjust;
/* Make sure the adjusted size is still less than int orig_size = *size;
* KMALLOC_MAX_CACHE_SIZE.
* TODO: this check is only useful for SLAB, but not SLUB. We'll need
* to skip it for SLUB when it starts using kasan_cache_create().
*/
if (*size > KMALLOC_MAX_CACHE_SIZE -
sizeof(struct kasan_alloc_meta) -
sizeof(struct kasan_free_meta))
return;
*flags |= SLAB_KASAN;
/* Add alloc meta. */ /* Add alloc meta. */
cache->kasan_info.alloc_meta_offset = *size; cache->kasan_info.alloc_meta_offset = *size;
*size += sizeof(struct kasan_alloc_meta); *size += sizeof(struct kasan_alloc_meta);
@ -395,14 +386,26 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
} }
redzone_adjust = optimal_redzone(cache->object_size) - redzone_adjust = optimal_redzone(cache->object_size) -
(*size - cache->object_size); (*size - cache->object_size);
if (redzone_adjust > 0) if (redzone_adjust > 0)
*size += redzone_adjust; *size += redzone_adjust;
*size = min(KMALLOC_MAX_CACHE_SIZE,
max(*size, *size = min(KMALLOC_MAX_SIZE, max(*size, cache->object_size +
cache->object_size + optimal_redzone(cache->object_size)));
optimal_redzone(cache->object_size)));
/*
* If the metadata doesn't fit, don't enable KASAN at all.
*/
if (*size <= cache->kasan_info.alloc_meta_offset ||
*size <= cache->kasan_info.free_meta_offset) {
cache->kasan_info.alloc_meta_offset = 0;
cache->kasan_info.free_meta_offset = 0;
*size = orig_size;
return;
}
*flags |= SLAB_KASAN;
} }
#endif
void kasan_cache_shrink(struct kmem_cache *cache) void kasan_cache_shrink(struct kmem_cache *cache)
{ {
@ -414,6 +417,14 @@ void kasan_cache_destroy(struct kmem_cache *cache)
quarantine_remove_cache(cache); quarantine_remove_cache(cache);
} }
size_t kasan_metadata_size(struct kmem_cache *cache)
{
return (cache->kasan_info.alloc_meta_offset ?
sizeof(struct kasan_alloc_meta) : 0) +
(cache->kasan_info.free_meta_offset ?
sizeof(struct kasan_free_meta) : 0);
}
void kasan_poison_slab(struct page *page) void kasan_poison_slab(struct page *page)
{ {
kasan_poison_shadow(page_address(page), kasan_poison_shadow(page_address(page),
@ -431,16 +442,13 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
kasan_poison_shadow(object, kasan_poison_shadow(object,
round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
KASAN_KMALLOC_REDZONE); KASAN_KMALLOC_REDZONE);
#ifdef CONFIG_SLAB
if (cache->flags & SLAB_KASAN) { if (cache->flags & SLAB_KASAN) {
struct kasan_alloc_meta *alloc_info = struct kasan_alloc_meta *alloc_info =
get_alloc_info(cache, object); get_alloc_info(cache, object);
alloc_info->state = KASAN_STATE_INIT; alloc_info->state = KASAN_STATE_INIT;
} }
#endif
} }
#ifdef CONFIG_SLAB
static inline int in_irqentry_text(unsigned long ptr) static inline int in_irqentry_text(unsigned long ptr)
{ {
return (ptr >= (unsigned long)&__irqentry_text_start && return (ptr >= (unsigned long)&__irqentry_text_start &&
@ -501,7 +509,6 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
return (void *)object + cache->kasan_info.free_meta_offset; return (void *)object + cache->kasan_info.free_meta_offset;
} }
#endif
void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
{ {
@ -522,16 +529,16 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object)
bool kasan_slab_free(struct kmem_cache *cache, void *object) bool kasan_slab_free(struct kmem_cache *cache, void *object)
{ {
#ifdef CONFIG_SLAB
/* RCU slabs could be legally used after free within the RCU period */ /* RCU slabs could be legally used after free within the RCU period */
if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
return false; return false;
if (likely(cache->flags & SLAB_KASAN)) { if (likely(cache->flags & SLAB_KASAN)) {
struct kasan_alloc_meta *alloc_info = struct kasan_alloc_meta *alloc_info;
get_alloc_info(cache, object); struct kasan_free_meta *free_info;
struct kasan_free_meta *free_info =
get_free_info(cache, object); alloc_info = get_alloc_info(cache, object);
free_info = get_free_info(cache, object);
switch (alloc_info->state) { switch (alloc_info->state) {
case KASAN_STATE_ALLOC: case KASAN_STATE_ALLOC:
@ -550,10 +557,6 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object)
} }
} }
return false; return false;
#else
kasan_poison_slab_free(cache, object);
return false;
#endif
} }
void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
@ -576,7 +579,6 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
kasan_unpoison_shadow(object, size); kasan_unpoison_shadow(object, size);
kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
KASAN_KMALLOC_REDZONE); KASAN_KMALLOC_REDZONE);
#ifdef CONFIG_SLAB
if (cache->flags & SLAB_KASAN) { if (cache->flags & SLAB_KASAN) {
struct kasan_alloc_meta *alloc_info = struct kasan_alloc_meta *alloc_info =
get_alloc_info(cache, object); get_alloc_info(cache, object);
@ -585,7 +587,6 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
alloc_info->alloc_size = size; alloc_info->alloc_size = size;
set_track(&alloc_info->track, flags); set_track(&alloc_info->track, flags);
} }
#endif
} }
EXPORT_SYMBOL(kasan_kmalloc); EXPORT_SYMBOL(kasan_kmalloc);
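The kasan_cache_create() changes above drop the SLAB-only size check and instead revert the cache to its original layout when the appended metadata would no longer fit after clamping. Below is a rough userspace model of that sizing logic; the metadata sizes, the KMALLOC_MAX_SIZE value, and the redzone heuristic are all stand-ins, and the unconditional free-meta placement is a simplification of the real code.

/*
 * Sketch of the new KASAN slab sizing: append alloc/free metadata, grow the
 * redzone, clamp, and bail out (disabling KASAN for the cache) if the
 * metadata no longer fits. Sizes below are assumed, not the kernel's.
 */
#include <stddef.h>
#include <stdio.h>

#define KMALLOC_MAX_SIZE (1u << 22)   /* assumed for illustration */
#define ALLOC_META_SIZE  32u          /* stand-in for sizeof(struct kasan_alloc_meta) */
#define FREE_META_SIZE   32u          /* stand-in for sizeof(struct kasan_free_meta) */

struct cache_model {
	size_t object_size;
	size_t size;              /* slab object size including metadata */
	size_t alloc_meta_offset;
	size_t free_meta_offset;
	int kasan_enabled;
};

/* stand-in for optimal_redzone(): larger objects get larger redzones */
static size_t redzone_for(size_t object_size)
{
	return object_size <= 64 ? 16 : object_size <= 512 ? 64 : 256;
}

static void cache_create(struct cache_model *c)
{
	size_t orig_size = c->size;

	c->alloc_meta_offset = c->size;
	c->size += ALLOC_META_SIZE;
	c->free_meta_offset = c->size;        /* simplified: always appended */
	c->size += FREE_META_SIZE;

	/* grow the redzone up to the heuristic, then clamp the total size */
	if (redzone_for(c->object_size) > c->size - c->object_size)
		c->size = c->object_size + redzone_for(c->object_size);
	if (c->size > KMALLOC_MAX_SIZE)
		c->size = KMALLOC_MAX_SIZE;

	/* mirror the new "metadata doesn't fit" bail-out */
	if (c->size <= c->alloc_meta_offset || c->size <= c->free_meta_offset) {
		c->alloc_meta_offset = 0;
		c->free_meta_offset = 0;
		c->size = orig_size;
		return;
	}
	c->kasan_enabled = 1;
}

int main(void)
{
	struct cache_model c = { .object_size = 128, .size = 128 };

	cache_create(&c);
	printf("size=%zu alloc_meta@%zu free_meta@%zu kasan=%d\n",
	       c.size, c.alloc_meta_offset, c.free_meta_offset, c.kasan_enabled);
	return 0;
}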

View file

@ -95,7 +95,6 @@ struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
struct kasan_free_meta *get_free_info(struct kmem_cache *cache, struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
const void *object); const void *object);
static inline const void *kasan_shadow_to_mem(const void *shadow_addr) static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
{ {
return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
@ -110,7 +109,7 @@ static inline bool kasan_report_enabled(void)
void kasan_report(unsigned long addr, size_t size, void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip); bool is_write, unsigned long ip);
#ifdef CONFIG_SLAB #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB)
void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
void quarantine_reduce(void); void quarantine_reduce(void);
void quarantine_remove_cache(struct kmem_cache *cache); void quarantine_remove_cache(struct kmem_cache *cache);

View file

@ -116,7 +116,6 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack)); sizeof(init_thread_union.stack));
} }
#ifdef CONFIG_SLAB
static void print_track(struct kasan_track *track) static void print_track(struct kasan_track *track)
{ {
pr_err("PID = %u\n", track->pid); pr_err("PID = %u\n", track->pid);
@ -130,8 +129,8 @@ static void print_track(struct kasan_track *track)
} }
} }
static void object_err(struct kmem_cache *cache, struct page *page, static void kasan_object_err(struct kmem_cache *cache, struct page *page,
void *object, char *unused_reason) void *object, char *unused_reason)
{ {
struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
struct kasan_free_meta *free_info; struct kasan_free_meta *free_info;
@ -162,7 +161,6 @@ static void object_err(struct kmem_cache *cache, struct page *page,
break; break;
} }
} }
#endif
static void print_address_description(struct kasan_access_info *info) static void print_address_description(struct kasan_access_info *info)
{ {
@ -177,7 +175,7 @@ static void print_address_description(struct kasan_access_info *info)
struct kmem_cache *cache = page->slab_cache; struct kmem_cache *cache = page->slab_cache;
object = nearest_obj(cache, page, object = nearest_obj(cache, page,
(void *)info->access_addr); (void *)info->access_addr);
object_err(cache, page, object, kasan_object_err(cache, page, object,
"kasan: bad access detected"); "kasan: bad access detected");
return; return;
} }

View file

@ -480,7 +480,7 @@ void __khugepaged_exit(struct mm_struct *mm)
static void release_pte_page(struct page *page) static void release_pte_page(struct page *page)
{ {
/* 0 stands for page_is_file_cache(page) == false */ /* 0 stands for page_is_file_cache(page) == false */
dec_zone_page_state(page, NR_ISOLATED_ANON + 0); dec_node_page_state(page, NR_ISOLATED_ANON + 0);
unlock_page(page); unlock_page(page);
putback_lru_page(page); putback_lru_page(page);
} }
@ -576,7 +576,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out; goto out;
} }
/* 0 stands for page_is_file_cache(page) == false */ /* 0 stands for page_is_file_cache(page) == false */
inc_zone_page_state(page, NR_ISOLATED_ANON + 0); inc_node_page_state(page, NR_ISOLATED_ANON + 0);
VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(PageLRU(page), page);
@ -672,10 +672,10 @@ static bool khugepaged_scan_abort(int nid)
int i; int i;
/* /*
* If zone_reclaim_mode is disabled, then no extra effort is made to * If node_reclaim_mode is disabled, then no extra effort is made to
* allocate memory locally. * allocate memory locally.
*/ */
if (!zone_reclaim_mode) if (!node_reclaim_mode)
return false; return false;
/* If there is a count for this node already, it must be acceptable */ /* If there is a count for this node already, it must be acceptable */
@ -694,7 +694,7 @@ static bool khugepaged_scan_abort(int nid)
/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{ {
return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0); return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
} }
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
@ -1483,10 +1483,10 @@ tree_unlocked:
} }
local_irq_save(flags); local_irq_save(flags);
__inc_zone_page_state(new_page, NR_SHMEM_THPS); __inc_node_page_state(new_page, NR_SHMEM_THPS);
if (nr_none) { if (nr_none) {
__mod_zone_page_state(zone, NR_FILE_PAGES, nr_none); __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
__mod_zone_page_state(zone, NR_SHMEM, nr_none); __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
} }
local_irq_restore(flags); local_irq_restore(flags);

View file

@ -1485,8 +1485,10 @@ static int kmemleak_scan_thread(void *arg)
* Wait before the first scan to allow the system to fully initialize. * Wait before the first scan to allow the system to fully initialize.
*/ */
if (first_run) { if (first_run) {
signed long timeout = msecs_to_jiffies(SECS_FIRST_SCAN * 1000);
first_run = 0; first_run = 0;
ssleep(SECS_FIRST_SCAN); while (timeout && !kthread_should_stop())
timeout = schedule_timeout_interruptible(timeout);
} }
while (!kthread_should_stop()) { while (!kthread_should_stop()) {

View file

@ -20,7 +20,7 @@
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/memblock.h> #include <linux/memblock.h>
#include <asm-generic/sections.h> #include <asm/sections.h>
#include <linux/io.h> #include <linux/io.h>
#include "internal.h" #include "internal.h"
@ -1027,7 +1027,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
*out_end = m_end; *out_end = m_end;
if (out_nid) if (out_nid)
*out_nid = m_nid; *out_nid = m_nid;
idx_a++; idx_a--;
*idx = (u32)idx_a | (u64)idx_b << 32; *idx = (u32)idx_a | (u64)idx_b << 32;
return; return;
} }
@ -1465,15 +1465,16 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)
return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
} }
void __init memblock_enforce_memory_limit(phys_addr_t limit) static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
{ {
phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
struct memblock_region *r; struct memblock_region *r;
if (!limit) /*
return; * translate the memory @limit size into the max address within one of
* the memory memblock regions, if the @limit exceeds the total size
/* find out max address */ * of those regions, max_addr will keep original value ULLONG_MAX
*/
for_each_memblock(memory, r) { for_each_memblock(memory, r) {
if (limit <= r->size) { if (limit <= r->size) {
max_addr = r->base + limit; max_addr = r->base + limit;
@ -1482,6 +1483,22 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
limit -= r->size; limit -= r->size;
} }
return max_addr;
}
void __init memblock_enforce_memory_limit(phys_addr_t limit)
{
phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
if (!limit)
return;
max_addr = __find_max_addr(limit);
/* @limit exceeds the total size of the memory, do nothing */
if (max_addr == (phys_addr_t)ULLONG_MAX)
return;
/* truncate both memory and reserved regions */ /* truncate both memory and reserved regions */
memblock_remove_range(&memblock.memory, max_addr, memblock_remove_range(&memblock.memory, max_addr,
(phys_addr_t)ULLONG_MAX); (phys_addr_t)ULLONG_MAX);
@ -1489,6 +1506,36 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
(phys_addr_t)ULLONG_MAX); (phys_addr_t)ULLONG_MAX);
} }
void __init memblock_mem_limit_remove_map(phys_addr_t limit)
{
struct memblock_type *type = &memblock.memory;
phys_addr_t max_addr;
int i, ret, start_rgn, end_rgn;
if (!limit)
return;
max_addr = __find_max_addr(limit);
/* @limit exceeds the total size of the memory, do nothing */
if (max_addr == (phys_addr_t)ULLONG_MAX)
return;
ret = memblock_isolate_range(type, max_addr, (phys_addr_t)ULLONG_MAX,
&start_rgn, &end_rgn);
if (ret)
return;
/* remove all the MAP regions above the limit */
for (i = end_rgn - 1; i >= start_rgn; i--) {
if (!memblock_is_nomap(&type->regions[i]))
memblock_remove_region(type, i);
}
/* truncate the reserved regions */
memblock_remove_range(&memblock.reserved, max_addr,
(phys_addr_t)ULLONG_MAX);
}
static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
{ {
unsigned int left = 0, right = type->cnt; unsigned int left = 0, right = type->cnt;
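The new __find_max_addr() helper factored out above translates a memory-size limit into the highest usable physical address, so that memblock_enforce_memory_limit() and the new memblock_mem_limit_remove_map() can share it. A self-contained model of that walk, using an invented two-bank region table, looks like this:

/*
 * Standalone model of __find_max_addr(): walk the memory regions in order,
 * subtracting their sizes from the limit, and return the address where the
 * limit is reached; ULLONG_MAX means the limit exceeds total memory.
 * The region table is made-up example data.
 */
#include <limits.h>
#include <stdio.h>

struct region { unsigned long long base, size; };

static unsigned long long find_max_addr(const struct region *r, int cnt,
					unsigned long long limit)
{
	unsigned long long max_addr = ULLONG_MAX;
	int i;

	for (i = 0; i < cnt; i++) {
		if (limit <= r[i].size) {
			max_addr = r[i].base + limit;
			break;
		}
		limit -= r[i].size;
	}
	return max_addr;   /* ULLONG_MAX if @limit exceeds total memory */
}

int main(void)
{
	const struct region mem[] = {
		{ 0x00000000ULL, 0x40000000ULL },   /* 1 GiB */
		{ 0x80000000ULL, 0x40000000ULL },   /* 1 GiB, hole below it */
	};

	/* a 1.5 GiB limit lands half-way into the second bank */
	printf("max_addr = %#llx\n", find_max_addr(mem, 2, 0x60000000ULL));
	return 0;
}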

View file

@ -132,15 +132,11 @@ static const char * const mem_cgroup_lru_names[] = {
* their hierarchy representation * their hierarchy representation
*/ */
struct mem_cgroup_tree_per_zone { struct mem_cgroup_tree_per_node {
struct rb_root rb_root; struct rb_root rb_root;
spinlock_t lock; spinlock_t lock;
}; };
struct mem_cgroup_tree_per_node {
struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};
struct mem_cgroup_tree { struct mem_cgroup_tree {
struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
}; };
@ -323,15 +319,6 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
#endif /* !CONFIG_SLOB */ #endif /* !CONFIG_SLOB */
static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
{
int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
return &memcg->nodeinfo[nid]->zoneinfo[zid];
}
/** /**
* mem_cgroup_css_from_page - css of the memcg associated with a page * mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest * @page: page of interest
@ -383,37 +370,35 @@ ino_t page_cgroup_ino(struct page *page)
return ino; return ino;
} }
static struct mem_cgroup_per_zone * static struct mem_cgroup_per_node *
mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
{ {
int nid = page_to_nid(page); int nid = page_to_nid(page);
int zid = page_zonenum(page);
return &memcg->nodeinfo[nid]->zoneinfo[zid]; return memcg->nodeinfo[nid];
} }
static struct mem_cgroup_tree_per_zone * static struct mem_cgroup_tree_per_node *
soft_limit_tree_node_zone(int nid, int zid) soft_limit_tree_node(int nid)
{ {
return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; return soft_limit_tree.rb_tree_per_node[nid];
} }
static struct mem_cgroup_tree_per_zone * static struct mem_cgroup_tree_per_node *
soft_limit_tree_from_page(struct page *page) soft_limit_tree_from_page(struct page *page)
{ {
int nid = page_to_nid(page); int nid = page_to_nid(page);
int zid = page_zonenum(page);
return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; return soft_limit_tree.rb_tree_per_node[nid];
} }
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
struct mem_cgroup_tree_per_zone *mctz, struct mem_cgroup_tree_per_node *mctz,
unsigned long new_usage_in_excess) unsigned long new_usage_in_excess)
{ {
struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL; struct rb_node *parent = NULL;
struct mem_cgroup_per_zone *mz_node; struct mem_cgroup_per_node *mz_node;
if (mz->on_tree) if (mz->on_tree)
return; return;
@ -423,7 +408,7 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
return; return;
while (*p) { while (*p) {
parent = *p; parent = *p;
mz_node = rb_entry(parent, struct mem_cgroup_per_zone, mz_node = rb_entry(parent, struct mem_cgroup_per_node,
tree_node); tree_node);
if (mz->usage_in_excess < mz_node->usage_in_excess) if (mz->usage_in_excess < mz_node->usage_in_excess)
p = &(*p)->rb_left; p = &(*p)->rb_left;
@ -439,8 +424,8 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
mz->on_tree = true; mz->on_tree = true;
} }
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
struct mem_cgroup_tree_per_zone *mctz) struct mem_cgroup_tree_per_node *mctz)
{ {
if (!mz->on_tree) if (!mz->on_tree)
return; return;
@ -448,8 +433,8 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
mz->on_tree = false; mz->on_tree = false;
} }
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
struct mem_cgroup_tree_per_zone *mctz) struct mem_cgroup_tree_per_node *mctz)
{ {
unsigned long flags; unsigned long flags;
@ -473,8 +458,8 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{ {
unsigned long excess; unsigned long excess;
struct mem_cgroup_per_zone *mz; struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_zone *mctz; struct mem_cgroup_tree_per_node *mctz;
mctz = soft_limit_tree_from_page(page); mctz = soft_limit_tree_from_page(page);
/* /*
@ -482,7 +467,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
* because their event counter is not touched. * because their event counter is not touched.
*/ */
for (; memcg; memcg = parent_mem_cgroup(memcg)) { for (; memcg; memcg = parent_mem_cgroup(memcg)) {
mz = mem_cgroup_page_zoneinfo(memcg, page); mz = mem_cgroup_page_nodeinfo(memcg, page);
excess = soft_limit_excess(memcg); excess = soft_limit_excess(memcg);
/* /*
* We have to update the tree if mz is on RB-tree or * We have to update the tree if mz is on RB-tree or
@ -507,24 +492,22 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{ {
struct mem_cgroup_tree_per_zone *mctz; struct mem_cgroup_tree_per_node *mctz;
struct mem_cgroup_per_zone *mz; struct mem_cgroup_per_node *mz;
int nid, zid; int nid;
for_each_node(nid) { for_each_node(nid) {
for (zid = 0; zid < MAX_NR_ZONES; zid++) { mz = mem_cgroup_nodeinfo(memcg, nid);
mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; mctz = soft_limit_tree_node(nid);
mctz = soft_limit_tree_node_zone(nid, zid); mem_cgroup_remove_exceeded(mz, mctz);
mem_cgroup_remove_exceeded(mz, mctz);
}
} }
} }
static struct mem_cgroup_per_zone * static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{ {
struct rb_node *rightmost = NULL; struct rb_node *rightmost = NULL;
struct mem_cgroup_per_zone *mz; struct mem_cgroup_per_node *mz;
retry: retry:
mz = NULL; mz = NULL;
@ -532,7 +515,7 @@ retry:
if (!rightmost) if (!rightmost)
goto done; /* Nothing to reclaim from */ goto done; /* Nothing to reclaim from */
mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node);
/* /*
* Remove the node now but someone else can add it back, * Remove the node now but someone else can add it back,
* we will to add it back at the end of reclaim to its correct * we will to add it back at the end of reclaim to its correct
@ -546,10 +529,10 @@ done:
return mz; return mz;
} }
static struct mem_cgroup_per_zone * static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{ {
struct mem_cgroup_per_zone *mz; struct mem_cgroup_per_node *mz;
spin_lock_irq(&mctz->lock); spin_lock_irq(&mctz->lock);
mz = __mem_cgroup_largest_soft_limit_node(mctz); mz = __mem_cgroup_largest_soft_limit_node(mctz);
@ -643,20 +626,16 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask) int nid, unsigned int lru_mask)
{ {
unsigned long nr = 0; unsigned long nr = 0;
int zid; struct mem_cgroup_per_node *mz;
enum lru_list lru;
VM_BUG_ON((unsigned)nid >= nr_node_ids); VM_BUG_ON((unsigned)nid >= nr_node_ids);
for (zid = 0; zid < MAX_NR_ZONES; zid++) { for_each_lru(lru) {
struct mem_cgroup_per_zone *mz; if (!(BIT(lru) & lru_mask))
enum lru_list lru; continue;
mz = mem_cgroup_nodeinfo(memcg, nid);
for_each_lru(lru) { nr += mz->lru_size[lru];
if (!(BIT(lru) & lru_mask))
continue;
mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
nr += mz->lru_size[lru];
}
} }
return nr; return nr;
} }
@ -809,9 +788,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
rcu_read_lock(); rcu_read_lock();
if (reclaim) { if (reclaim) {
struct mem_cgroup_per_zone *mz; struct mem_cgroup_per_node *mz;
mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
iter = &mz->iter[reclaim->priority]; iter = &mz->iter[reclaim->priority];
if (prev && reclaim->generation != iter->generation) if (prev && reclaim->generation != iter->generation)
@ -910,19 +889,17 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{ {
struct mem_cgroup *memcg = dead_memcg; struct mem_cgroup *memcg = dead_memcg;
struct mem_cgroup_reclaim_iter *iter; struct mem_cgroup_reclaim_iter *iter;
struct mem_cgroup_per_zone *mz; struct mem_cgroup_per_node *mz;
int nid, zid; int nid;
int i; int i;
while ((memcg = parent_mem_cgroup(memcg))) { while ((memcg = parent_mem_cgroup(memcg))) {
for_each_node(nid) { for_each_node(nid) {
for (zid = 0; zid < MAX_NR_ZONES; zid++) { mz = mem_cgroup_nodeinfo(memcg, nid);
mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; for (i = 0; i <= DEF_PRIORITY; i++) {
for (i = 0; i <= DEF_PRIORITY; i++) { iter = &mz->iter[i];
iter = &mz->iter[i]; cmpxchg(&iter->position,
cmpxchg(&iter->position, dead_memcg, NULL);
dead_memcg, NULL);
}
} }
} }
} }
@ -943,39 +920,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
iter != NULL; \ iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL)) iter = mem_cgroup_iter(NULL, iter, NULL))
/**
* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
* @zone: zone of the wanted lruvec
* @memcg: memcg of the wanted lruvec
*
* Returns the lru list vector holding pages for the given @zone and
* @mem. This can be the global zone lruvec, if the memory controller
* is disabled.
*/
struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
struct mem_cgroup *memcg)
{
struct mem_cgroup_per_zone *mz;
struct lruvec *lruvec;
if (mem_cgroup_disabled()) {
lruvec = &zone->lruvec;
goto out;
}
mz = mem_cgroup_zone_zoneinfo(memcg, zone);
lruvec = &mz->lruvec;
out:
/*
* Since a node can be onlined after the mem_cgroup was created,
* we have to be prepared to initialize lruvec->zone here;
* and if offlined then reonlined, we need to reinitialize it.
*/
if (unlikely(lruvec->zone != zone))
lruvec->zone = zone;
return lruvec;
}
/** /**
* mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
* @page: the page * @page: the page
@ -985,14 +929,14 @@ out:
* and putback protocol: the LRU lock must be held, and the page must * and putback protocol: the LRU lock must be held, and the page must
* either be PageLRU() or the caller must have isolated/allocated it. * either be PageLRU() or the caller must have isolated/allocated it.
*/ */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
{ {
struct mem_cgroup_per_zone *mz; struct mem_cgroup_per_node *mz;
struct mem_cgroup *memcg; struct mem_cgroup *memcg;
struct lruvec *lruvec; struct lruvec *lruvec;
if (mem_cgroup_disabled()) { if (mem_cgroup_disabled()) {
lruvec = &zone->lruvec; lruvec = &pgdat->lruvec;
goto out; goto out;
} }
@ -1004,7 +948,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
if (!memcg) if (!memcg)
memcg = root_mem_cgroup; memcg = root_mem_cgroup;
mz = mem_cgroup_page_zoneinfo(memcg, page); mz = mem_cgroup_page_nodeinfo(memcg, page);
lruvec = &mz->lruvec; lruvec = &mz->lruvec;
out: out:
/* /*
@ -1012,8 +956,8 @@ out:
* we have to be prepared to initialize lruvec->zone here; * we have to be prepared to initialize lruvec->zone here;
* and if offlined then reonlined, we need to reinitialize it. * and if offlined then reonlined, we need to reinitialize it.
*/ */
if (unlikely(lruvec->zone != zone)) if (unlikely(lruvec->pgdat != pgdat))
lruvec->zone = zone; lruvec->pgdat = pgdat;
return lruvec; return lruvec;
} }
@ -1030,17 +974,15 @@ out:
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int nr_pages) int nr_pages)
{ {
struct mem_cgroup_per_zone *mz; struct mem_cgroup_per_node *mz;
unsigned long *lru_size; unsigned long *lru_size;
long size; long size;
bool empty; bool empty;
__update_lru_size(lruvec, lru, nr_pages);
if (mem_cgroup_disabled()) if (mem_cgroup_disabled())
return; return;
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
lru_size = mz->lru_size + lru; lru_size = mz->lru_size + lru;
empty = list_empty(lruvec->lists + lru); empty = list_empty(lruvec->lists + lru);
@ -1276,9 +1218,9 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* select it. The goal is to allow it to allocate so that it may * select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory. * quickly exit and free its memory.
*/ */
if (fatal_signal_pending(current) || task_will_free_mem(current)) { if (task_will_free_mem(current)) {
mark_oom_victim(current); mark_oom_victim(current);
try_oom_reaper(current); wake_oom_reaper(current);
goto unlock; goto unlock;
} }
@ -1433,7 +1375,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
#endif #endif
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
struct zone *zone, pg_data_t *pgdat,
gfp_t gfp_mask, gfp_t gfp_mask,
unsigned long *total_scanned) unsigned long *total_scanned)
{ {
@ -1443,7 +1385,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
unsigned long excess; unsigned long excess;
unsigned long nr_scanned; unsigned long nr_scanned;
struct mem_cgroup_reclaim_cookie reclaim = { struct mem_cgroup_reclaim_cookie reclaim = {
.zone = zone, .pgdat = pgdat,
.priority = 0, .priority = 0,
}; };
@ -1473,8 +1415,8 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
} }
continue; continue;
} }
total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, total += mem_cgroup_shrink_node(victim, gfp_mask, false,
zone, &nr_scanned); pgdat, &nr_scanned);
*total_scanned += nr_scanned; *total_scanned += nr_scanned;
if (!soft_limit_excess(root_memcg)) if (!soft_limit_excess(root_memcg))
break; break;
@ -2107,11 +2049,11 @@ static void lock_page_lru(struct page *page, int *isolated)
{ {
struct zone *zone = page_zone(page); struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock); spin_lock_irq(zone_lru_lock(zone));
if (PageLRU(page)) { if (PageLRU(page)) {
struct lruvec *lruvec; struct lruvec *lruvec;
lruvec = mem_cgroup_page_lruvec(page, zone); lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
ClearPageLRU(page); ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_lru(page)); del_page_from_lru_list(page, lruvec, page_lru(page));
*isolated = 1; *isolated = 1;
@ -2126,12 +2068,12 @@ static void unlock_page_lru(struct page *page, int isolated)
if (isolated) { if (isolated) {
struct lruvec *lruvec; struct lruvec *lruvec;
lruvec = mem_cgroup_page_lruvec(page, zone); lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page); SetPageLRU(page);
add_page_to_lru_list(page, lruvec, page_lru(page)); add_page_to_lru_list(page, lruvec, page_lru(page));
} }
spin_unlock_irq(&zone->lru_lock); spin_unlock_irq(zone_lru_lock(zone));
} }
static void commit_charge(struct page *page, struct mem_cgroup *memcg, static void commit_charge(struct page *page, struct mem_cgroup *memcg,
@ -2431,7 +2373,7 @@ void memcg_kmem_uncharge(struct page *page, int order)
/* /*
* Because tail pages are not marked as "used", set it. We're under * Because tail pages are not marked as "used", set it. We're under
* zone->lru_lock and migration entries setup in all page mappings. * zone_lru_lock and migration entries setup in all page mappings.
*/ */
void mem_cgroup_split_huge_fixup(struct page *head) void mem_cgroup_split_huge_fixup(struct page *head)
{ {
@ -2601,22 +2543,22 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
return ret; return ret;
} }
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask, gfp_t gfp_mask,
unsigned long *total_scanned) unsigned long *total_scanned)
{ {
unsigned long nr_reclaimed = 0; unsigned long nr_reclaimed = 0;
struct mem_cgroup_per_zone *mz, *next_mz = NULL; struct mem_cgroup_per_node *mz, *next_mz = NULL;
unsigned long reclaimed; unsigned long reclaimed;
int loop = 0; int loop = 0;
struct mem_cgroup_tree_per_zone *mctz; struct mem_cgroup_tree_per_node *mctz;
unsigned long excess; unsigned long excess;
unsigned long nr_scanned; unsigned long nr_scanned;
if (order > 0) if (order > 0)
return 0; return 0;
mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); mctz = soft_limit_tree_node(pgdat->node_id);
/* /*
* This loop can run a while, specially if mem_cgroup's continuously * This loop can run a while, specially if mem_cgroup's continuously
* keep exceeding their soft limit and putting the system under * keep exceeding their soft limit and putting the system under
@ -2631,7 +2573,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
break; break;
nr_scanned = 0; nr_scanned = 0;
reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
gfp_mask, &nr_scanned); gfp_mask, &nr_scanned);
nr_reclaimed += reclaimed; nr_reclaimed += reclaimed;
*total_scanned += nr_scanned; *total_scanned += nr_scanned;
@ -3252,22 +3194,21 @@ static int memcg_stat_show(struct seq_file *m, void *v)
#ifdef CONFIG_DEBUG_VM #ifdef CONFIG_DEBUG_VM
{ {
int nid, zid; pg_data_t *pgdat;
struct mem_cgroup_per_zone *mz; struct mem_cgroup_per_node *mz;
struct zone_reclaim_stat *rstat; struct zone_reclaim_stat *rstat;
unsigned long recent_rotated[2] = {0, 0}; unsigned long recent_rotated[2] = {0, 0};
unsigned long recent_scanned[2] = {0, 0}; unsigned long recent_scanned[2] = {0, 0};
for_each_online_node(nid) for_each_online_pgdat(pgdat) {
for (zid = 0; zid < MAX_NR_ZONES; zid++) { mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; rstat = &mz->lruvec.reclaim_stat;
rstat = &mz->lruvec.reclaim_stat;
recent_rotated[0] += rstat->recent_rotated[0]; recent_rotated[0] += rstat->recent_rotated[0];
recent_rotated[1] += rstat->recent_rotated[1]; recent_rotated[1] += rstat->recent_rotated[1];
recent_scanned[0] += rstat->recent_scanned[0]; recent_scanned[0] += rstat->recent_scanned[0];
recent_scanned[1] += rstat->recent_scanned[1]; recent_scanned[1] += rstat->recent_scanned[1];
} }
seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
@ -4147,11 +4088,10 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
return idr_find(&mem_cgroup_idr, id); return idr_find(&mem_cgroup_idr, id);
} }
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{ {
struct mem_cgroup_per_node *pn; struct mem_cgroup_per_node *pn;
struct mem_cgroup_per_zone *mz; int tmp = node;
int zone, tmp = node;
/* /*
* This routine is called against possible nodes. * This routine is called against possible nodes.
* But it's BUG to call kmalloc() against offline node. * But it's BUG to call kmalloc() against offline node.
@ -4166,18 +4106,16 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
if (!pn) if (!pn)
return 1; return 1;
for (zone = 0; zone < MAX_NR_ZONES; zone++) { lruvec_init(&pn->lruvec);
mz = &pn->zoneinfo[zone]; pn->usage_in_excess = 0;
lruvec_init(&mz->lruvec); pn->on_tree = false;
mz->usage_in_excess = 0; pn->memcg = memcg;
mz->on_tree = false;
mz->memcg = memcg;
}
memcg->nodeinfo[node] = pn; memcg->nodeinfo[node] = pn;
return 0; return 0;
} }
static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{ {
kfree(memcg->nodeinfo[node]); kfree(memcg->nodeinfo[node]);
} }
@ -4188,7 +4126,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
memcg_wb_domain_exit(memcg); memcg_wb_domain_exit(memcg);
for_each_node(node) for_each_node(node)
free_mem_cgroup_per_zone_info(memcg, node); free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->stat); free_percpu(memcg->stat);
kfree(memcg); kfree(memcg);
} }
@ -4217,7 +4155,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
goto fail; goto fail;
for_each_node(node) for_each_node(node)
if (alloc_mem_cgroup_per_zone_info(memcg, node)) if (alloc_mem_cgroup_per_node_info(memcg, node))
goto fail; goto fail;
if (memcg_wb_domain_init(memcg, GFP_KERNEL)) if (memcg_wb_domain_init(memcg, GFP_KERNEL))
@ -5233,7 +5171,7 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "file %llu\n", seq_printf(m, "file %llu\n",
(u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
seq_printf(m, "kernel_stack %llu\n", seq_printf(m, "kernel_stack %llu\n",
(u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE); (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
seq_printf(m, "slab %llu\n", seq_printf(m, "slab %llu\n",
(u64)(stat[MEMCG_SLAB_RECLAIMABLE] + (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
@ -5820,18 +5758,12 @@ static int __init mem_cgroup_init(void)
for_each_node(node) { for_each_node(node) {
struct mem_cgroup_tree_per_node *rtpn; struct mem_cgroup_tree_per_node *rtpn;
int zone;
rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
node_online(node) ? node : NUMA_NO_NODE); node_online(node) ? node : NUMA_NO_NODE);
for (zone = 0; zone < MAX_NR_ZONES; zone++) { rtpn->rb_root = RB_ROOT;
struct mem_cgroup_tree_per_zone *rtpz; spin_lock_init(&rtpn->lock);
rtpz = &rtpn->rb_tree_per_zone[zone];
rtpz->rb_root = RB_ROOT;
spin_lock_init(&rtpz->lock);
}
soft_limit_tree.rb_tree_per_node[node] = rtpn; soft_limit_tree.rb_tree_per_node[node] = rtpn;
} }
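With the soft-limit machinery switched from per-zone to per-node trees above, reclaim still orders candidates by usage_in_excess and picks the rightmost, i.e. most over-limit, entry. A toy model of that selection, with a flat array standing in for the rbtree and made-up memcg data, is sketched below.

/*
 * Toy model of the per-node soft-limit selection: entries are keyed by
 * usage_in_excess and reclaim targets the largest excess first. An array
 * replaces the rbtree purely for illustration.
 */
#include <stdio.h>

struct mz { int memcg_id; unsigned long usage_in_excess; };

/* equivalent of __mem_cgroup_largest_soft_limit_node(): rightmost == max excess */
static const struct mz *largest_excess(const struct mz *tree, int cnt)
{
	const struct mz *best = NULL;
	int i;

	for (i = 0; i < cnt; i++)
		if (!best || tree[i].usage_in_excess > best->usage_in_excess)
			best = &tree[i];
	return best;
}

int main(void)
{
	const struct mz node_tree[] = {
		{ .memcg_id = 1, .usage_in_excess = 4096 },
		{ .memcg_id = 2, .usage_in_excess = 65536 },
		{ .memcg_id = 3, .usage_in_excess = 512 },
	};
	const struct mz *victim = largest_excess(node_tree, 3);

	printf("reclaim from memcg %d (usage_in_excess=%lu)\n",
	       victim->memcg_id, victim->usage_in_excess);
	return 0;
}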

View file

@ -741,8 +741,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
* page->lru because it can be used in other hugepage operations, * page->lru because it can be used in other hugepage operations,
* such as __unmap_hugepage_range() and gather_surplus_pages(). * such as __unmap_hugepage_range() and gather_surplus_pages().
* So instead we use page_mapping() and PageAnon(). * So instead we use page_mapping() and PageAnon().
* We assume that this function is called with page lock held,
* so there is no race between isolation and mapping/unmapping.
*/ */
if (!(page_mapping(hpage) || PageAnon(hpage))) { if (!(page_mapping(hpage) || PageAnon(hpage))) {
res = dequeue_hwpoisoned_huge_page(hpage); res = dequeue_hwpoisoned_huge_page(hpage);
@ -1663,7 +1661,7 @@ static int __soft_offline_page(struct page *page, int flags)
put_hwpoison_page(page); put_hwpoison_page(page);
if (!ret) { if (!ret) {
LIST_HEAD(pagelist); LIST_HEAD(pagelist);
inc_zone_page_state(page, NR_ISOLATED_ANON + inc_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page)); page_is_file_cache(page));
list_add(&page->lru, &pagelist); list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
@ -1671,7 +1669,7 @@ static int __soft_offline_page(struct page *page, int flags)
if (ret) { if (ret) {
if (!list_empty(&pagelist)) { if (!list_empty(&pagelist)) {
list_del(&page->lru); list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON + dec_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page)); page_is_file_cache(page));
putback_lru_page(page); putback_lru_page(page);
} }

View file

@ -1209,9 +1209,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
arch_refresh_nodedata(nid, pgdat); arch_refresh_nodedata(nid, pgdat);
} else { } else {
/* Reset the nr_zones and classzone_idx to 0 before reuse */ /* Reset the nr_zones, order and classzone_idx before reuse */
pgdat->nr_zones = 0; pgdat->nr_zones = 0;
pgdat->classzone_idx = 0; pgdat->kswapd_order = 0;
pgdat->kswapd_classzone_idx = 0;
} }
/* we can use NODE_DATA(nid) from here */ /* we can use NODE_DATA(nid) from here */
@ -1547,6 +1548,37 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
return 0; return 0;
} }
static struct page *new_node_page(struct page *page, unsigned long private,
int **result)
{
gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
int nid = page_to_nid(page);
nodemask_t nmask = node_online_map;
struct page *new_page;
/*
* TODO: allocate a destination hugepage from a nearest neighbor node,
* accordance with memory policy of the user process if possible. For
* now as a simple work-around, we use the next node for destination.
*/
if (PageHuge(page))
return alloc_huge_page_node(page_hstate(compound_head(page)),
next_node_in(nid, nmask));
node_clear(nid, nmask);
if (PageHighMem(page)
|| (zone_idx(page_zone(page)) == ZONE_MOVABLE))
gfp_mask |= __GFP_HIGHMEM;
new_page = __alloc_pages_nodemask(gfp_mask, 0,
node_zonelist(nid, gfp_mask), &nmask);
if (!new_page)
new_page = __alloc_pages(gfp_mask, 0,
node_zonelist(nid, gfp_mask));
return new_page;
}
#define NR_OFFLINE_AT_ONCE_PAGES (256) #define NR_OFFLINE_AT_ONCE_PAGES (256)
static int static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
@ -1586,7 +1618,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
put_page(page); put_page(page);
list_add_tail(&page->lru, &source); list_add_tail(&page->lru, &source);
move_pages--; move_pages--;
inc_zone_page_state(page, NR_ISOLATED_ANON + inc_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page)); page_is_file_cache(page));
} else { } else {
@ -1610,11 +1642,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
goto out; goto out;
} }
/* /* Allocate a new page from the nearest neighbor node */
* alloc_migrate_target should be improooooved!! ret = migrate_pages(&source, new_node_page, NULL, 0,
* migrate_pages returns # of failed pages.
*/
ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
MIGRATE_SYNC, MR_MEMORY_HOTPLUG); MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
if (ret) if (ret)
putback_movable_pages(&source); putback_movable_pages(&source);
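The new new_node_page() callback above prefers an online node other than the one being offlined and only falls back to an unrestricted allocation when that fails. A tiny userspace sketch of that nodemask policy, with a plain bitmask in place of nodemask_t and a simulated allocator, follows.

/*
 * Sketch of the node-selection policy used when migrating pages off an
 * offlining node: drop the source node from the allowed mask first, then
 * fall back to any online node. The "allocator" is simulated.
 */
#include <stdio.h>

#define MAX_NODES 8

/* pretend the allocator returns the first allowed node that has memory */
static int alloc_from_mask(unsigned int nmask)
{
	int nid;

	for (nid = 0; nid < MAX_NODES; nid++)
		if (nmask & (1u << nid))
			return nid;
	return -1;
}

static int new_page_node(int src_nid, unsigned int online_mask)
{
	unsigned int nmask = online_mask & ~(1u << src_nid);
	int nid = alloc_from_mask(nmask);

	if (nid < 0)                   /* no other node worked out */
		nid = alloc_from_mask(online_mask);
	return nid;
}

int main(void)
{
	/* nodes 0 and 1 online, node 1 is being offlined */
	printf("destination node: %d\n", new_page_node(1, 0x3));
	return 0;
}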

View file

@ -962,7 +962,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
if (!isolate_lru_page(page)) { if (!isolate_lru_page(page)) {
list_add_tail(&page->lru, pagelist); list_add_tail(&page->lru, pagelist);
inc_zone_page_state(page, NR_ISOLATED_ANON + inc_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page)); page_is_file_cache(page));
} }
} }

View file

@ -306,7 +306,7 @@ EXPORT_SYMBOL(mempool_resize);
* returns NULL. Note that due to preallocation, this function * returns NULL. Note that due to preallocation, this function
* *never* fails when called from process contexts. (it might * *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.) * fail if called from an IRQ context.)
* Note: neither __GFP_NOMEMALLOC nor __GFP_ZERO are supported. * Note: using __GFP_ZERO is not supported.
*/ */
void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{ {
@ -315,27 +315,16 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
wait_queue_t wait; wait_queue_t wait;
gfp_t gfp_temp; gfp_t gfp_temp;
/* If oom killed, memory reserves are essential to prevent livelock */
VM_WARN_ON_ONCE(gfp_mask & __GFP_NOMEMALLOC);
/* No element size to zero on allocation */
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
gfp_mask |= __GFP_NOWARN; /* failures are OK */ gfp_mask |= __GFP_NOWARN; /* failures are OK */
gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
repeat_alloc: repeat_alloc:
if (likely(pool->curr_nr)) {
/*
* Don't allocate from emergency reserves if there are
* elements available. This check is racy, but it will
* be rechecked each loop.
*/
gfp_temp |= __GFP_NOMEMALLOC;
}
element = pool->alloc(gfp_temp, pool->pool_data); element = pool->alloc(gfp_temp, pool->pool_data);
if (likely(element != NULL)) if (likely(element != NULL))
@ -359,12 +348,11 @@ repeat_alloc:
* We use gfp mask w/o direct reclaim or IO for the first round. If * We use gfp mask w/o direct reclaim or IO for the first round. If
* alloc failed with that and @pool was empty, retry immediately. * alloc failed with that and @pool was empty, retry immediately.
*/ */
if ((gfp_temp & ~__GFP_NOMEMALLOC) != gfp_mask) { if (gfp_temp != gfp_mask) {
spin_unlock_irqrestore(&pool->lock, flags); spin_unlock_irqrestore(&pool->lock, flags);
gfp_temp = gfp_mask; gfp_temp = gfp_mask;
goto repeat_alloc; goto repeat_alloc;
} }
gfp_temp = gfp_mask;
/* We must not sleep if !__GFP_DIRECT_RECLAIM */ /* We must not sleep if !__GFP_DIRECT_RECLAIM */
if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
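For reference, a minimal model of the mempool_alloc() attempt order that remains after the revert above: a lightweight attempt without __GFP_DIRECT_RECLAIM/__GFP_IO, then a preallocated element if the pool has one, then a retry with the caller's full mask before waiting. The flag bits are placeholders and the backing allocator is simulated.

/*
 * Model of the mempool_alloc() retry policy. Flag values are stand-ins,
 * locking and the wait path are omitted.
 */
#include <stdbool.h>
#include <stdio.h>

#define __GFP_DIRECT_RECLAIM 0x1u  /* placeholder bits, not the real gfp.h values */
#define __GFP_IO             0x2u
#define __GFP_NORETRY        0x4u
#define __GFP_NOWARN         0x8u

/* pretend the page allocator only succeeds when it may reclaim directly */
static bool backing_alloc(unsigned int gfp)
{
	return gfp & __GFP_DIRECT_RECLAIM;
}

static bool pool_alloc(unsigned int gfp_mask, int curr_nr)
{
	unsigned int gfp_temp;

	gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;
	gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);

	if (backing_alloc(gfp_temp))
		return true;                    /* lightweight attempt worked */
	if (curr_nr > 0)
		return true;                    /* take a preallocated element */
	if (gfp_temp != gfp_mask && backing_alloc(gfp_mask))
		return true;                    /* retry with the full mask */
	return false;                           /* would sleep or fail here */
}

int main(void)
{
	printf("empty pool, GFP_KERNEL-like mask: %s\n",
	       pool_alloc(__GFP_DIRECT_RECLAIM | __GFP_IO, 0) ? "ok" : "fail");
	printf("empty pool, atomic-like mask:     %s\n",
	       pool_alloc(0, 0) ? "ok" : "fail");
	return 0;
}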

View file

@ -168,7 +168,7 @@ void putback_movable_pages(struct list_head *l)
continue; continue;
} }
list_del(&page->lru); list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON + dec_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page)); page_is_file_cache(page));
/* /*
* We isolated non-lru movable page so here we can use * We isolated non-lru movable page so here we can use
@ -501,19 +501,21 @@ int migrate_page_move_mapping(struct address_space *mapping,
* new page and drop references to the old page. * new page and drop references to the old page.
* *
* Note that anonymous pages are accounted for * Note that anonymous pages are accounted for
* via NR_FILE_PAGES and NR_ANON_PAGES if they * via NR_FILE_PAGES and NR_ANON_MAPPED if they
* are mapped to swap space. * are mapped to swap space.
*/ */
if (newzone != oldzone) { if (newzone != oldzone) {
__dec_zone_state(oldzone, NR_FILE_PAGES); __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
__inc_zone_state(newzone, NR_FILE_PAGES); __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
if (PageSwapBacked(page) && !PageSwapCache(page)) { if (PageSwapBacked(page) && !PageSwapCache(page)) {
__dec_zone_state(oldzone, NR_SHMEM); __dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
__inc_zone_state(newzone, NR_SHMEM); __inc_node_state(newzone->zone_pgdat, NR_SHMEM);
} }
if (dirty && mapping_cap_account_dirty(mapping)) { if (dirty && mapping_cap_account_dirty(mapping)) {
__dec_zone_state(oldzone, NR_FILE_DIRTY); __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
__inc_zone_state(newzone, NR_FILE_DIRTY); __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
__inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
__inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
} }
} }
local_irq_enable(); local_irq_enable();
@ -1119,7 +1121,7 @@ out:
* restored. * restored.
*/ */
list_del(&page->lru); list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON + dec_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page)); page_is_file_cache(page));
} }
@ -1460,7 +1462,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
err = isolate_lru_page(page); err = isolate_lru_page(page);
if (!err) { if (!err) {
list_add_tail(&page->lru, &pagelist); list_add_tail(&page->lru, &pagelist);
inc_zone_page_state(page, NR_ISOLATED_ANON + inc_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page)); page_is_file_cache(page));
} }
put_and_set: put_and_set:
@ -1726,15 +1728,16 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
unsigned long nr_migrate_pages) unsigned long nr_migrate_pages)
{ {
int z; int z;
if (!pgdat_reclaimable(pgdat))
return false;
for (z = pgdat->nr_zones - 1; z >= 0; z--) { for (z = pgdat->nr_zones - 1; z >= 0; z--) {
struct zone *zone = pgdat->node_zones + z; struct zone *zone = pgdat->node_zones + z;
if (!populated_zone(zone)) if (!populated_zone(zone))
continue; continue;
if (!zone_reclaimable(zone))
continue;
/* Avoid waking kswapd by allocating pages_to_migrate pages. */ /* Avoid waking kswapd by allocating pages_to_migrate pages. */
if (!zone_watermark_ok(zone, 0, if (!zone_watermark_ok(zone, 0,
high_wmark_pages(zone) + high_wmark_pages(zone) +
@ -1828,7 +1831,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
} }
page_lru = page_is_file_cache(page); page_lru = page_is_file_cache(page);
mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
hpage_nr_pages(page)); hpage_nr_pages(page));
/* /*
@ -1886,7 +1889,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
if (nr_remaining) { if (nr_remaining) {
if (!list_empty(&migratepages)) { if (!list_empty(&migratepages)) {
list_del(&page->lru); list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON + dec_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page)); page_is_file_cache(page));
putback_lru_page(page); putback_lru_page(page);
} }
@ -1931,7 +1934,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
goto out_dropref; goto out_dropref;
new_page = alloc_pages_node(node, new_page = alloc_pages_node(node,
(GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
HPAGE_PMD_ORDER); HPAGE_PMD_ORDER);
if (!new_page) if (!new_page)
goto out_fail; goto out_fail;
@ -1979,7 +1982,7 @@ fail_putback:
/* Retake the callers reference and putback on LRU */ /* Retake the callers reference and putback on LRU */
get_page(page); get_page(page);
putback_lru_page(page); putback_lru_page(page);
mod_zone_page_state(page_zone(page), mod_node_page_state(page_pgdat(page),
NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
goto out_unlock; goto out_unlock;
@ -2030,7 +2033,7 @@ fail_putback:
count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
mod_zone_page_state(page_zone(page), mod_node_page_state(page_pgdat(page),
NR_ISOLATED_ANON + page_lru, NR_ISOLATED_ANON + page_lru,
-HPAGE_PMD_NR); -HPAGE_PMD_NR);
return isolated; return isolated;
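
As context for the migrate_balanced_pgdat() change above, a minimal userspace model of the balance test it now applies per populated zone; the struct, the plain free-pages comparison standing in for zone_watermark_ok(), and the numbers are all invented for illustration.

#include <stdbool.h>
#include <stdio.h>

struct fake_zone { unsigned long free_pages, high_wmark; };

/* A node is considered able to absorb the migration if any of its zones
 * would still sit above the high watermark after taking the pages. */
static bool node_can_absorb(const struct fake_zone *zones, int nr_zones,
                            unsigned long nr_migrate_pages)
{
        for (int z = nr_zones - 1; z >= 0; z--) {
                if (zones[z].free_pages >
                    zones[z].high_wmark + nr_migrate_pages)
                        return true;
        }
        return false;
}

int main(void)
{
        struct fake_zone zones[] = {
                { .free_pages = 2048, .high_wmark = 512 },
                { .free_pages = 128,  .high_wmark = 256 },
        };

        printf("can absorb 512 pages: %d\n",
               node_can_absorb(zones, 2, 512));
        return 0;
}
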

View file

@ -103,7 +103,7 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
if (PageLRU(page)) { if (PageLRU(page)) {
struct lruvec *lruvec; struct lruvec *lruvec;
lruvec = mem_cgroup_page_lruvec(page, page_zone(page)); lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
if (getpage) if (getpage)
get_page(page); get_page(page);
ClearPageLRU(page); ClearPageLRU(page);
@ -188,7 +188,7 @@ unsigned int munlock_vma_page(struct page *page)
* might otherwise copy PageMlocked to part of the tail pages before * might otherwise copy PageMlocked to part of the tail pages before
* we clear it in the head page. It also stabilizes hpage_nr_pages(). * we clear it in the head page. It also stabilizes hpage_nr_pages().
*/ */
spin_lock_irq(&zone->lru_lock); spin_lock_irq(zone_lru_lock(zone));
nr_pages = hpage_nr_pages(page); nr_pages = hpage_nr_pages(page);
if (!TestClearPageMlocked(page)) if (!TestClearPageMlocked(page))
@ -197,14 +197,14 @@ unsigned int munlock_vma_page(struct page *page)
__mod_zone_page_state(zone, NR_MLOCK, -nr_pages); __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
if (__munlock_isolate_lru_page(page, true)) { if (__munlock_isolate_lru_page(page, true)) {
spin_unlock_irq(&zone->lru_lock); spin_unlock_irq(zone_lru_lock(zone));
__munlock_isolated_page(page); __munlock_isolated_page(page);
goto out; goto out;
} }
__munlock_isolation_failed(page); __munlock_isolation_failed(page);
unlock_out: unlock_out:
spin_unlock_irq(&zone->lru_lock); spin_unlock_irq(zone_lru_lock(zone));
out: out:
return nr_pages - 1; return nr_pages - 1;
@ -289,7 +289,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
pagevec_init(&pvec_putback, 0); pagevec_init(&pvec_putback, 0);
/* Phase 1: page isolation */ /* Phase 1: page isolation */
spin_lock_irq(&zone->lru_lock); spin_lock_irq(zone_lru_lock(zone));
for (i = 0; i < nr; i++) { for (i = 0; i < nr; i++) {
struct page *page = pvec->pages[i]; struct page *page = pvec->pages[i];
@ -315,7 +315,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
} }
delta_munlocked = -nr + pagevec_count(&pvec_putback); delta_munlocked = -nr + pagevec_count(&pvec_putback);
__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
spin_unlock_irq(&zone->lru_lock); spin_unlock_irq(zone_lru_lock(zone));
/* Now we can release pins of pages that we are not munlocking */ /* Now we can release pins of pages that we are not munlocking */
pagevec_release(&pvec_putback); pagevec_release(&pvec_putback);
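
The zone_lru_lock(zone) conversions above read naturally if the helper is taken to be a thin wrapper that hands back the node-level LRU lock; a compilable userspace sketch of that reading, with pthread mutexes and stripped-down struct names standing in for the kernel types:

#include <pthread.h>

struct pglist_data_sketch { pthread_mutex_t lru_lock; };
struct zone_sketch { struct pglist_data_sketch *zone_pgdat; };

/* one LRU lock per node, shared by all zones of that node */
static pthread_mutex_t *zone_lru_lock_sketch(struct zone_sketch *zone)
{
        return &zone->zone_pgdat->lru_lock;
}

int main(void)
{
        struct pglist_data_sketch node = { PTHREAD_MUTEX_INITIALIZER };
        struct zone_sketch dma = { &node }, normal = { &node };

        /* both zones of the node resolve to the same lock */
        return zone_lru_lock_sketch(&dma) == zone_lru_lock_sketch(&normal) ? 0 : 1;
}
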

View file

@ -621,7 +621,6 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
{ {
struct mm_struct *mm = vma->vm_mm; struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next = vma->vm_next; struct vm_area_struct *next = vma->vm_next;
struct vm_area_struct *importer = NULL;
struct address_space *mapping = NULL; struct address_space *mapping = NULL;
struct rb_root *root = NULL; struct rb_root *root = NULL;
struct anon_vma *anon_vma = NULL; struct anon_vma *anon_vma = NULL;
@ -631,17 +630,25 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
int remove_next = 0; int remove_next = 0;
if (next && !insert) { if (next && !insert) {
struct vm_area_struct *exporter = NULL; struct vm_area_struct *exporter = NULL, *importer = NULL;
if (end >= next->vm_end) { if (end >= next->vm_end) {
/* /*
* vma expands, overlapping all the next, and * vma expands, overlapping all the next, and
* perhaps the one after too (mprotect case 6). * perhaps the one after too (mprotect case 6).
*/ */
again: remove_next = 1 + (end > next->vm_end); remove_next = 1 + (end > next->vm_end);
end = next->vm_end; end = next->vm_end;
exporter = next; exporter = next;
importer = vma; importer = vma;
/*
* If next doesn't have anon_vma, import from vma after
* next, if the vma overlaps with it.
*/
if (remove_next == 2 && next && !next->anon_vma)
exporter = next->vm_next;
} else if (end > next->vm_start) { } else if (end > next->vm_start) {
/* /*
* vma expands, overlapping part of the next: * vma expands, overlapping part of the next:
@ -675,7 +682,7 @@ again: remove_next = 1 + (end > next->vm_end);
return error; return error;
} }
} }
again:
vma_adjust_trans_huge(vma, start, end, adjust_next); vma_adjust_trans_huge(vma, start, end, adjust_next);
if (file) { if (file) {
@ -796,8 +803,11 @@ again: remove_next = 1 + (end > next->vm_end);
* up the code too much to do both in one go. * up the code too much to do both in one go.
*/ */
next = vma->vm_next; next = vma->vm_next;
if (remove_next == 2) if (remove_next == 2) {
remove_next = 1;
end = next->vm_end;
goto again; goto again;
}
else if (next) else if (next)
vma_gap_update(next); vma_gap_update(next);
else else

View file

@ -176,11 +176,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
/* /*
* Do not even consider tasks which are explicitly marked oom * Do not even consider tasks which are explicitly marked oom
* unkillable or have been already oom reaped. * unkillable or have been already oom reaped or they are in
* the middle of vfork
*/ */
adj = (long)p->signal->oom_score_adj; adj = (long)p->signal->oom_score_adj;
if (adj == OOM_SCORE_ADJ_MIN || if (adj == OOM_SCORE_ADJ_MIN ||
test_bit(MMF_OOM_REAPED, &p->mm->flags)) { test_bit(MMF_OOM_REAPED, &p->mm->flags) ||
in_vfork(p)) {
task_unlock(p); task_unlock(p);
return 0; return 0;
} }
@ -281,10 +283,22 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
/* /*
* This task already has access to memory reserves and is being killed. * This task already has access to memory reserves and is being killed.
* Don't allow any other task to have access to the reserves. * Don't allow any other task to have access to the reserves unless
* the task has MMF_OOM_REAPED because the chances that it would release
* any memory are quite low.
*/ */
if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) {
return OOM_SCAN_ABORT; struct task_struct *p = find_lock_task_mm(task);
enum oom_scan_t ret = OOM_SCAN_ABORT;
if (p) {
if (test_bit(MMF_OOM_REAPED, &p->mm->flags))
ret = OOM_SCAN_CONTINUE;
task_unlock(p);
}
return ret;
}
/* /*
* If task is allocating a lot of memory and has been marked to be * If task is allocating a lot of memory and has been marked to be
@ -415,7 +429,7 @@ bool oom_killer_disabled __read_mostly;
* task's threads: if one of those is using this mm then this task was also * task's threads: if one of those is using this mm then this task was also
* using it. * using it.
*/ */
static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
{ {
struct task_struct *t; struct task_struct *t;
@ -554,8 +568,27 @@ static void oom_reap_task(struct task_struct *tsk)
schedule_timeout_idle(HZ/10); schedule_timeout_idle(HZ/10);
if (attempts > MAX_OOM_REAP_RETRIES) { if (attempts > MAX_OOM_REAP_RETRIES) {
struct task_struct *p;
pr_info("oom_reaper: unable to reap pid:%d (%s)\n", pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
task_pid_nr(tsk), tsk->comm); task_pid_nr(tsk), tsk->comm);
/*
* If we've already tried to reap this task in the past and
* failed, it probably doesn't make much sense to try yet again,
* so hide the mm from the oom killer so that it can move on
* to another task with a different mm struct.
*/
p = find_lock_task_mm(tsk);
if (p) {
if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) {
pr_info("oom_reaper: giving up pid:%d (%s)\n",
task_pid_nr(tsk), tsk->comm);
set_bit(MMF_OOM_REAPED, &p->mm->flags);
}
task_unlock(p);
}
debug_show_all_locks(); debug_show_all_locks();
} }
@ -594,7 +627,7 @@ static int oom_reaper(void *unused)
return 0; return 0;
} }
static void wake_oom_reaper(struct task_struct *tsk) void wake_oom_reaper(struct task_struct *tsk)
{ {
if (!oom_reaper_th) if (!oom_reaper_th)
return; return;
@ -612,46 +645,6 @@ static void wake_oom_reaper(struct task_struct *tsk)
wake_up(&oom_reaper_wait); wake_up(&oom_reaper_wait);
} }
/* Check if we can reap the given task. This has to be called with stable
* tsk->mm
*/
void try_oom_reaper(struct task_struct *tsk)
{
struct mm_struct *mm = tsk->mm;
struct task_struct *p;
if (!mm)
return;
/*
* There might be other threads/processes which are either not
* dying or even not killable.
*/
if (atomic_read(&mm->mm_users) > 1) {
rcu_read_lock();
for_each_process(p) {
if (!process_shares_mm(p, mm))
continue;
if (fatal_signal_pending(p))
continue;
/*
* If the task is exiting make sure the whole thread group
* is exiting and cannot access mm anymore.
*/
if (signal_group_exit(p->signal))
continue;
/* Give up */
rcu_read_unlock();
return;
}
rcu_read_unlock();
}
wake_oom_reaper(tsk);
}
static int __init oom_init(void) static int __init oom_init(void)
{ {
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@ -663,10 +656,6 @@ static int __init oom_init(void)
return 0; return 0;
} }
subsys_initcall(oom_init) subsys_initcall(oom_init)
#else
static void wake_oom_reaper(struct task_struct *tsk)
{
}
#endif #endif
/** /**
@ -743,6 +732,80 @@ void oom_killer_enable(void)
oom_killer_disabled = false; oom_killer_disabled = false;
} }
static inline bool __task_will_free_mem(struct task_struct *task)
{
struct signal_struct *sig = task->signal;
/*
* A coredumping process may sleep for an extended period in exit_mm(),
* so the oom killer cannot assume that the process will promptly exit
* and release memory.
*/
if (sig->flags & SIGNAL_GROUP_COREDUMP)
return false;
if (sig->flags & SIGNAL_GROUP_EXIT)
return true;
if (thread_group_empty(task) && (task->flags & PF_EXITING))
return true;
return false;
}
/*
* Checks whether the given task is dying or exiting and likely to
* release its address space. This means that all threads and processes
* sharing the same mm have to be killed or exiting.
* The caller has to make sure that task->mm is stable (hold task_lock or
* operate on current).
*/
bool task_will_free_mem(struct task_struct *task)
{
struct mm_struct *mm = task->mm;
struct task_struct *p;
bool ret;
/*
* Skip tasks without mm because they might have passed their exit_mm and
* exit_oom_victim. oom_reaper could have rescued that but do not rely
* on that for now. We can consider find_lock_task_mm in the future.
*/
if (!mm)
return false;
if (!__task_will_free_mem(task))
return false;
/*
* This task has already been drained by the oom reaper so there is
* only a small chance that it will free any more memory.
*/
if (test_bit(MMF_OOM_REAPED, &mm->flags))
return false;
if (atomic_read(&mm->mm_users) <= 1)
return true;
/*
* This is really pessimistic but we do not have any reliable way
* to check whether external processes share our mm.
*/
rcu_read_lock();
for_each_process(p) {
if (!process_shares_mm(p, mm))
continue;
if (same_thread_group(task, p))
continue;
ret = __task_will_free_mem(p);
if (!ret)
break;
}
rcu_read_unlock();
return ret;
}
/* /*
* Must be called while holding a reference to p, which will be released upon * Must be called while holding a reference to p, which will be released upon
* returning. * returning.
@ -765,9 +828,9 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
* its children or threads, just set TIF_MEMDIE so it can die quickly * its children or threads, just set TIF_MEMDIE so it can die quickly
*/ */
task_lock(p); task_lock(p);
if (p->mm && task_will_free_mem(p)) { if (task_will_free_mem(p)) {
mark_oom_victim(p); mark_oom_victim(p);
try_oom_reaper(p); wake_oom_reaper(p);
task_unlock(p); task_unlock(p);
put_task_struct(p); put_task_struct(p);
return; return;
@ -850,14 +913,18 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
continue; continue;
if (same_thread_group(p, victim)) if (same_thread_group(p, victim))
continue; continue;
if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) || if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) {
p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
/* /*
* We cannot use oom_reaper for the mm shared by this * We cannot use oom_reaper for the mm shared by this
* process because it wouldn't get killed and so the * process because it wouldn't get killed and so the
* memory might be still used. * memory might be still used. Hide the mm from the oom
* killer to guarantee OOM forward progress.
*/ */
can_oom_reap = false; can_oom_reap = false;
set_bit(MMF_OOM_REAPED, &mm->flags);
pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
task_pid_nr(victim), victim->comm,
task_pid_nr(p), p->comm);
continue; continue;
} }
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
@ -939,14 +1006,10 @@ bool out_of_memory(struct oom_control *oc)
* If current has a pending SIGKILL or is exiting, then automatically * If current has a pending SIGKILL or is exiting, then automatically
* select it. The goal is to allow it to allocate so that it may * select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory. * quickly exit and free its memory.
*
* But don't select if current has already released its mm and cleared
* TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
*/ */
if (current->mm && if (task_will_free_mem(current)) {
(fatal_signal_pending(current) || task_will_free_mem(current))) {
mark_oom_victim(current); mark_oom_victim(current);
try_oom_reaper(current); wake_oom_reaper(current);
return true; return true;
} }
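
A small userspace model of the two-strike give-up path added to oom_reap_task() above: the first failed reaping round only marks the mm, the second escalates to MMF_OOM_REAPED so the OOM killer can move on. The flag names are reused for readability, but the storage here is a plain bitmask rather than mm->flags.

#include <stdio.h>

enum { OOM_NOT_REAPABLE = 1 << 0, OOM_REAPED = 1 << 1 };

/* mirrors test_and_set_bit(): set NOT_REAPABLE, and if it was already
 * set (second failure) escalate to REAPED */
static void reap_failed(unsigned int *flags)
{
        if (*flags & OOM_NOT_REAPABLE)
                *flags |= OOM_REAPED;
        else
                *flags |= OOM_NOT_REAPABLE;
}

int main(void)
{
        unsigned int flags = 0;

        reap_failed(&flags);
        reap_failed(&flags);
        printf("gave up after second failure: %s\n",
               (flags & OOM_REAPED) ? "yes" : "no");
        return 0;
}
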

View file

@ -267,26 +267,35 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
*/ */
/** /**
* zone_dirtyable_memory - number of dirtyable pages in a zone * node_dirtyable_memory - number of dirtyable pages in a node
* @zone: the zone * @pgdat: the node
* *
* Returns the zone's number of pages potentially available for dirty * Returns the node's number of pages potentially available for dirty
* page cache. This is the base value for the per-zone dirty limits. * page cache. This is the base value for the per-node dirty limits.
*/ */
static unsigned long zone_dirtyable_memory(struct zone *zone) static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
{ {
unsigned long nr_pages; unsigned long nr_pages = 0;
int z;
for (z = 0; z < MAX_NR_ZONES; z++) {
struct zone *zone = pgdat->node_zones + z;
if (!populated_zone(zone))
continue;
nr_pages += zone_page_state(zone, NR_FREE_PAGES);
}
nr_pages = zone_page_state(zone, NR_FREE_PAGES);
/* /*
* Pages reserved for the kernel should not be considered * Pages reserved for the kernel should not be considered
* dirtyable, to prevent a situation where reclaim has to * dirtyable, to prevent a situation where reclaim has to
* clean pages in order to balance the zones. * clean pages in order to balance the zones.
*/ */
nr_pages -= min(nr_pages, zone->totalreserve_pages); nr_pages -= min(nr_pages, pgdat->totalreserve_pages);
nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);
return nr_pages; return nr_pages;
} }
@ -299,13 +308,26 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
int i; int i;
for_each_node_state(node, N_HIGH_MEMORY) { for_each_node_state(node, N_HIGH_MEMORY) {
for (i = 0; i < MAX_NR_ZONES; i++) { for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) {
struct zone *z = &NODE_DATA(node)->node_zones[i]; struct zone *z;
unsigned long nr_pages;
if (is_highmem(z)) if (!is_highmem_idx(i))
x += zone_dirtyable_memory(z); continue;
z = &NODE_DATA(node)->node_zones[i];
if (!populated_zone(z))
continue;
nr_pages = zone_page_state(z, NR_FREE_PAGES);
/* watch for underflows */
nr_pages -= min(nr_pages, high_wmark_pages(z));
nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE);
nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE);
x += nr_pages;
} }
} }
/* /*
* Unreclaimable memory (kernel memory or anonymous memory * Unreclaimable memory (kernel memory or anonymous memory
* without swap) can bring down the dirtyable pages below * without swap) can bring down the dirtyable pages below
@ -348,8 +370,8 @@ static unsigned long global_dirtyable_memory(void)
*/ */
x -= min(x, totalreserve_pages); x -= min(x, totalreserve_pages);
x += global_page_state(NR_INACTIVE_FILE); x += global_node_page_state(NR_INACTIVE_FILE);
x += global_page_state(NR_ACTIVE_FILE); x += global_node_page_state(NR_ACTIVE_FILE);
if (!vm_highmem_is_dirtyable) if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x); x -= highmem_dirtyable_memory(x);
@ -445,23 +467,23 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
} }
/** /**
* zone_dirty_limit - maximum number of dirty pages allowed in a zone * node_dirty_limit - maximum number of dirty pages allowed in a node
* @zone: the zone * @pgdat: the node
* *
* Returns the maximum number of dirty pages allowed in a zone, based * Returns the maximum number of dirty pages allowed in a node, based
* on the zone's dirtyable memory. * on the node's dirtyable memory.
*/ */
static unsigned long zone_dirty_limit(struct zone *zone) static unsigned long node_dirty_limit(struct pglist_data *pgdat)
{ {
unsigned long zone_memory = zone_dirtyable_memory(zone); unsigned long node_memory = node_dirtyable_memory(pgdat);
struct task_struct *tsk = current; struct task_struct *tsk = current;
unsigned long dirty; unsigned long dirty;
if (vm_dirty_bytes) if (vm_dirty_bytes)
dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
zone_memory / global_dirtyable_memory(); node_memory / global_dirtyable_memory();
else else
dirty = vm_dirty_ratio * zone_memory / 100; dirty = vm_dirty_ratio * node_memory / 100;
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
dirty += dirty / 4; dirty += dirty / 4;
@ -470,19 +492,22 @@ static unsigned long zone_dirty_limit(struct zone *zone)
} }
/** /**
* zone_dirty_ok - tells whether a zone is within its dirty limits * node_dirty_ok - tells whether a node is within its dirty limits
* @zone: the zone to check * @pgdat: the node to check
* *
* Returns %true when the dirty pages in @zone are within the zone's * Returns %true when the dirty pages in @pgdat are within the node's
* dirty limit, %false if the limit is exceeded. * dirty limit, %false if the limit is exceeded.
*/ */
bool zone_dirty_ok(struct zone *zone) bool node_dirty_ok(struct pglist_data *pgdat)
{ {
unsigned long limit = zone_dirty_limit(zone); unsigned long limit = node_dirty_limit(pgdat);
unsigned long nr_pages = 0;
return zone_page_state(zone, NR_FILE_DIRTY) + nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
zone_page_state(zone, NR_UNSTABLE_NFS) + nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
zone_page_state(zone, NR_WRITEBACK) <= limit; nr_pages += node_page_state(pgdat, NR_WRITEBACK);
return nr_pages <= limit;
} }
int dirty_background_ratio_handler(struct ctl_table *table, int write, int dirty_background_ratio_handler(struct ctl_table *table, int write,
@ -1570,10 +1595,10 @@ static void balance_dirty_pages(struct address_space *mapping,
* written to the server's write cache, but has not yet * written to the server's write cache, but has not yet
* been flushed to permanent storage. * been flushed to permanent storage.
*/ */
nr_reclaimable = global_page_state(NR_FILE_DIRTY) + nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS); global_node_page_state(NR_UNSTABLE_NFS);
gdtc->avail = global_dirtyable_memory(); gdtc->avail = global_dirtyable_memory();
gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
domain_dirty_limits(gdtc); domain_dirty_limits(gdtc);
@ -1910,8 +1935,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
* as we're trying to decide whether to put more under writeback. * as we're trying to decide whether to put more under writeback.
*/ */
gdtc->avail = global_dirtyable_memory(); gdtc->avail = global_dirtyable_memory();
gdtc->dirty = global_page_state(NR_FILE_DIRTY) + gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS); global_node_page_state(NR_UNSTABLE_NFS);
domain_dirty_limits(gdtc); domain_dirty_limits(gdtc);
if (gdtc->dirty > gdtc->bg_thresh) if (gdtc->dirty > gdtc->bg_thresh)
@ -1955,8 +1980,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
*/ */
dirty_thresh += dirty_thresh / 10; /* wheeee... */ dirty_thresh += dirty_thresh / 10; /* wheeee... */
if (global_page_state(NR_UNSTABLE_NFS) + if (global_node_page_state(NR_UNSTABLE_NFS) +
global_page_state(NR_WRITEBACK) <= dirty_thresh) global_node_page_state(NR_WRITEBACK) <= dirty_thresh)
break; break;
congestion_wait(BLK_RW_ASYNC, HZ/10); congestion_wait(BLK_RW_ASYNC, HZ/10);
@ -1984,8 +2009,8 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
void laptop_mode_timer_fn(unsigned long data) void laptop_mode_timer_fn(unsigned long data)
{ {
struct request_queue *q = (struct request_queue *)data; struct request_queue *q = (struct request_queue *)data;
int nr_pages = global_page_state(NR_FILE_DIRTY) + int nr_pages = global_node_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS); global_node_page_state(NR_UNSTABLE_NFS);
struct bdi_writeback *wb; struct bdi_writeback *wb;
/* /*
@ -2436,8 +2461,9 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
wb = inode_to_wb(inode); wb = inode_to_wb(inode);
mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
__inc_zone_page_state(page, NR_FILE_DIRTY); __inc_node_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED); __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
__inc_node_page_state(page, NR_DIRTIED);
__inc_wb_stat(wb, WB_RECLAIMABLE); __inc_wb_stat(wb, WB_RECLAIMABLE);
__inc_wb_stat(wb, WB_DIRTIED); __inc_wb_stat(wb, WB_DIRTIED);
task_io_account_write(PAGE_SIZE); task_io_account_write(PAGE_SIZE);
@ -2457,7 +2483,8 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
{ {
if (mapping_cap_account_dirty(mapping)) { if (mapping_cap_account_dirty(mapping)) {
mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY); dec_node_page_state(page, NR_FILE_DIRTY);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE); dec_wb_stat(wb, WB_RECLAIMABLE);
task_io_account_cancelled_write(PAGE_SIZE); task_io_account_cancelled_write(PAGE_SIZE);
} }
@ -2525,7 +2552,7 @@ void account_page_redirty(struct page *page)
wb = unlocked_inode_to_wb_begin(inode, &locked); wb = unlocked_inode_to_wb_begin(inode, &locked);
current->nr_dirtied--; current->nr_dirtied--;
dec_zone_page_state(page, NR_DIRTIED); dec_node_page_state(page, NR_DIRTIED);
dec_wb_stat(wb, WB_DIRTIED); dec_wb_stat(wb, WB_DIRTIED);
unlocked_inode_to_wb_end(inode, locked); unlocked_inode_to_wb_end(inode, locked);
} }
@ -2713,7 +2740,8 @@ int clear_page_dirty_for_io(struct page *page)
wb = unlocked_inode_to_wb_begin(inode, &locked); wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page)) { if (TestClearPageDirty(page)) {
mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY); dec_node_page_state(page, NR_FILE_DIRTY);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE); dec_wb_stat(wb, WB_RECLAIMABLE);
ret = 1; ret = 1;
} }
@ -2759,8 +2787,9 @@ int test_clear_page_writeback(struct page *page)
} }
if (ret) { if (ret) {
mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK); dec_node_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
inc_node_page_state(page, NR_WRITTEN);
} }
unlock_page_memcg(page); unlock_page_memcg(page);
return ret; return ret;
@ -2813,7 +2842,8 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
} }
if (!ret) { if (!ret) {
mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
inc_zone_page_state(page, NR_WRITEBACK); inc_node_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
} }
unlock_page_memcg(page); unlock_page_memcg(page);
return ret; return ret;
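
The arithmetic behind node_dirty_limit() above is easy to model outside the kernel; the sketch below reproduces the two branches (absolute vm_dirty_bytes vs. vm_dirty_ratio) with made-up page counts and leaves out the PF_LESS_THROTTLE/rt_task boost.

#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long node_dirty_limit_sketch(unsigned long node_dirtyable,
                                             unsigned long global_dirtyable,
                                             unsigned long vm_dirty_bytes,
                                             int vm_dirty_ratio)
{
        unsigned long dirty;

        if (vm_dirty_bytes)
                /* scale the byte limit by this node's share of dirtyable memory */
                dirty = ((vm_dirty_bytes + PAGE_SIZE - 1) / PAGE_SIZE) *
                        node_dirtyable / global_dirtyable;
        else
                dirty = (unsigned long)vm_dirty_ratio * node_dirtyable / 100;

        return dirty;
}

int main(void)
{
        /* 1 GiB dirtyable on this node out of 8 GiB globally, 20% dirty ratio */
        printf("limit: %lu pages\n",
               node_dirty_limit_sketch(262144, 2097152, 0, 20));
        return 0;
}
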

View file

@ -295,14 +295,6 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
return false; return false;
} }
static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
{
if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
return true;
return false;
}
/* /*
* Returns false when the remaining initialisation should be deferred until * Returns false when the remaining initialisation should be deferred until
* later in the boot cycle when it can be parallelised. * later in the boot cycle when it can be parallelised.
@ -342,11 +334,6 @@ static inline bool early_page_uninitialised(unsigned long pfn)
return false; return false;
} }
static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
{
return false;
}
static inline bool update_defer_init(pg_data_t *pgdat, static inline bool update_defer_init(pg_data_t *pgdat,
unsigned long pfn, unsigned long zone_end, unsigned long pfn, unsigned long zone_end,
unsigned long *nr_initialised) unsigned long *nr_initialised)
@ -1091,9 +1078,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
spin_lock(&zone->lock); spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone); isolated_pageblocks = has_isolate_pageblock(zone);
nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
if (nr_scanned) if (nr_scanned)
__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
while (count) { while (count) {
struct page *page; struct page *page;
@ -1148,9 +1135,9 @@ static void free_one_page(struct zone *zone,
{ {
unsigned long nr_scanned; unsigned long nr_scanned;
spin_lock(&zone->lock); spin_lock(&zone->lock);
nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
if (nr_scanned) if (nr_scanned)
__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
if (unlikely(has_isolate_pageblock(zone) || if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) { is_migrate_isolate(migratetype))) {
@ -2517,7 +2504,10 @@ int __isolate_free_page(struct page *page, unsigned int order)
zone->free_area[order].nr_free--; zone->free_area[order].nr_free--;
rmv_page_order(page); rmv_page_order(page);
/* Set the pageblock if the isolated page is at least a pageblock */ /*
* Set the pageblock if the isolated page is at least half of a
* pageblock
*/
if (order >= pageblock_order - 1) { if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1; struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) { for (; page < endpage; page += pageblock_nr_pages) {
@ -2597,7 +2587,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
else else
page = list_first_entry(list, struct page, lru); page = list_first_entry(list, struct page, lru);
__dec_zone_state(zone, NR_ALLOC_BATCH);
list_del(&page->lru); list_del(&page->lru);
pcp->count--; pcp->count--;
@ -2623,16 +2612,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
spin_unlock(&zone->lock); spin_unlock(&zone->lock);
if (!page) if (!page)
goto failed; goto failed;
__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
__mod_zone_freepage_state(zone, -(1 << order), __mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page)); get_pcppage_migratetype(page));
} }
if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
!test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags); zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags); local_irq_restore(flags);
@ -2842,40 +2826,18 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
} }
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
static bool zone_local(struct zone *local_zone, struct zone *zone)
{
return local_zone->node == zone->node;
}
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{ {
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
RECLAIM_DISTANCE; RECLAIM_DISTANCE;
} }
#else /* CONFIG_NUMA */ #else /* CONFIG_NUMA */
static bool zone_local(struct zone *local_zone, struct zone *zone)
{
return true;
}
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{ {
return true; return true;
} }
#endif /* CONFIG_NUMA */ #endif /* CONFIG_NUMA */
static void reset_alloc_batches(struct zone *preferred_zone)
{
struct zone *zone = preferred_zone->zone_pgdat->node_zones;
do {
mod_zone_page_state(zone, NR_ALLOC_BATCH,
high_wmark_pages(zone) - low_wmark_pages(zone) -
atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
} while (zone++ != preferred_zone);
}
/* /*
* get_page_from_freelist goes through the zonelist trying to allocate * get_page_from_freelist goes through the zonelist trying to allocate
* a page. * a page.
@ -2886,10 +2848,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
{ {
struct zoneref *z = ac->preferred_zoneref; struct zoneref *z = ac->preferred_zoneref;
struct zone *zone; struct zone *zone;
bool fair_skipped = false; struct pglist_data *last_pgdat_dirty_limit = NULL;
bool apply_fair = (alloc_flags & ALLOC_FAIR);
zonelist_scan:
/* /*
* Scan zonelist, looking for a zone with enough free. * Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c. * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
@ -2903,51 +2863,34 @@ zonelist_scan:
(alloc_flags & ALLOC_CPUSET) && (alloc_flags & ALLOC_CPUSET) &&
!__cpuset_zone_allowed(zone, gfp_mask)) !__cpuset_zone_allowed(zone, gfp_mask))
continue; continue;
/*
* Distribute pages in proportion to the individual
* zone size to ensure fair page aging. The zone a
* page was allocated in should have no effect on the
* time the page has in memory before being reclaimed.
*/
if (apply_fair) {
if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
fair_skipped = true;
continue;
}
if (!zone_local(ac->preferred_zoneref->zone, zone)) {
if (fair_skipped)
goto reset_fair;
apply_fair = false;
}
}
/* /*
* When allocating a page cache page for writing, we * When allocating a page cache page for writing, we
* want to get it from a zone that is within its dirty * want to get it from a node that is within its dirty
* limit, such that no single zone holds more than its * limit, such that no single node holds more than its
* proportional share of globally allowed dirty pages. * proportional share of globally allowed dirty pages.
* The dirty limits take into account the zone's * The dirty limits take into account the node's
* lowmem reserves and high watermark so that kswapd * lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to * should be able to balance it without having to
* write pages from its LRU list. * write pages from its LRU list.
* *
* This may look like it could increase pressure on
* lower zones by failing allocations in higher zones
* before they are full. But the pages that do spill
* over are limited as the lower zones are protected
* by this very same mechanism. It should not become
* a practical burden to them.
*
* XXX: For now, allow allocations to potentially * XXX: For now, allow allocations to potentially
* exceed the per-zone dirty limit in the slowpath * exceed the per-node dirty limit in the slowpath
* (spread_dirty_pages unset) before going into reclaim, * (spread_dirty_pages unset) before going into reclaim,
* which is important when on a NUMA setup the allowed * which is important when on a NUMA setup the allowed
* zones are together not big enough to reach the * nodes are together not big enough to reach the
* global limit. The proper fix for these situations * global limit. The proper fix for these situations
* will require awareness of zones in the * will require awareness of nodes in the
* dirty-throttling and the flusher threads. * dirty-throttling and the flusher threads.
*/ */
if (ac->spread_dirty_pages && !zone_dirty_ok(zone)) if (ac->spread_dirty_pages) {
continue; if (last_pgdat_dirty_limit == zone->zone_pgdat)
continue;
if (!node_dirty_ok(zone->zone_pgdat)) {
last_pgdat_dirty_limit = zone->zone_pgdat;
continue;
}
}
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (!zone_watermark_fast(zone, order, mark, if (!zone_watermark_fast(zone, order, mark,
@ -2959,16 +2902,16 @@ zonelist_scan:
if (alloc_flags & ALLOC_NO_WATERMARKS) if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone; goto try_this_zone;
if (zone_reclaim_mode == 0 || if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue; continue;
ret = zone_reclaim(zone, gfp_mask, order); ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) { switch (ret) {
case ZONE_RECLAIM_NOSCAN: case NODE_RECLAIM_NOSCAN:
/* did not scan */ /* did not scan */
continue; continue;
case ZONE_RECLAIM_FULL: case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */ /* scanned but unreclaimable */
continue; continue;
default: default:
@ -2998,23 +2941,6 @@ try_this_zone:
} }
} }
/*
* The first pass makes sure allocations are spread fairly within the
* local node. However, the local node might have free pages left
* after the fairness batches are exhausted, and remote zones haven't
* even been considered yet. Try once more without fairness, and
* include remote zones now, before entering the slowpath and waking
* kswapd: prefer spilling to a remote zone over swapping locally.
*/
if (fair_skipped) {
reset_fair:
apply_fair = false;
fair_skipped = false;
reset_alloc_batches(ac->preferred_zoneref->zone);
z = ac->preferred_zoneref;
goto zonelist_scan;
}
return NULL; return NULL;
} }
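
For the spread_dirty_pages handling above, a userspace sketch of the caching idea: once a node fails node_dirty_ok(), its remaining zones in the zonelist are skipped without re-evaluating the limit. The structures and numbers are invented; only the skip logic mirrors the patch.

#include <stdbool.h>
#include <stdio.h>

struct node_sketch { int id; unsigned long dirty, limit; };
struct zone_sketch { struct node_sketch *node; };

static bool node_dirty_ok_sketch(const struct node_sketch *n)
{
        return n->dirty <= n->limit;
}

static struct zone_sketch *pick_zone(struct zone_sketch *zones, int nr)
{
        struct node_sketch *last_failed = NULL;

        for (int i = 0; i < nr; i++) {
                struct node_sketch *n = zones[i].node;

                if (n == last_failed)           /* sibling zone of a node over its limit */
                        continue;
                if (!node_dirty_ok_sketch(n)) {
                        last_failed = n;        /* cache the failure for later zones */
                        continue;
                }
                return &zones[i];
        }
        return NULL;
}

int main(void)
{
        struct node_sketch n0 = { 0, 900, 800 }, n1 = { 1, 100, 800 };
        struct zone_sketch zones[] = { { &n0 }, { &n0 }, { &n1 } };
        struct zone_sketch *z = pick_zone(zones, 3);

        printf("picked zone on node %d\n", z ? z->node->id : -1);
        return 0;
}
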
@ -3159,7 +3085,6 @@ out:
return page; return page;
} }
/* /*
* Maximum number of compaction retries with progress before the OOM * Maximum number of compaction retries with progress before the OOM
* killer is considered the only way to move forward. * killer is considered the only way to move forward.
@ -3171,17 +3096,16 @@ out:
static struct page * static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac, unsigned int alloc_flags, const struct alloc_context *ac,
enum migrate_mode mode, enum compact_result *compact_result) enum compact_priority prio, enum compact_result *compact_result)
{ {
struct page *page; struct page *page;
int contended_compaction;
if (!order) if (!order)
return NULL; return NULL;
current->flags |= PF_MEMALLOC; current->flags |= PF_MEMALLOC;
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
mode, &contended_compaction); prio);
current->flags &= ~PF_MEMALLOC; current->flags &= ~PF_MEMALLOC;
if (*compact_result <= COMPACT_INACTIVE) if (*compact_result <= COMPACT_INACTIVE)
@ -3193,8 +3117,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/ */
count_vm_event(COMPACTSTALL); count_vm_event(COMPACTSTALL);
page = get_page_from_freelist(gfp_mask, order, page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
if (page) { if (page) {
struct zone *zone = page_zone(page); struct zone *zone = page_zone(page);
@ -3211,24 +3134,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/ */
count_vm_event(COMPACTFAIL); count_vm_event(COMPACTFAIL);
/*
* In all zones where compaction was attempted (and not
* deferred or skipped), lock contention has been detected.
* For THP allocation we do not want to disrupt the others
* so we fallback to base pages instead.
*/
if (contended_compaction == COMPACT_CONTENDED_LOCK)
*compact_result = COMPACT_CONTENDED;
/*
* If compaction was aborted due to need_resched(), we do not
* want to further increase allocation latency, unless it is
* khugepaged trying to collapse.
*/
if (contended_compaction == COMPACT_CONTENDED_SCHED
&& !(current->flags & PF_KTHREAD))
*compact_result = COMPACT_CONTENDED;
cond_resched(); cond_resched();
return NULL; return NULL;
@ -3236,7 +3141,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
static inline bool static inline bool
should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
enum compact_result compact_result, enum migrate_mode *migrate_mode, enum compact_result compact_result,
enum compact_priority *compact_priority,
int compaction_retries) int compaction_retries)
{ {
int max_retries = MAX_COMPACT_RETRIES; int max_retries = MAX_COMPACT_RETRIES;
@ -3247,11 +3153,11 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
/* /*
* compaction considers all the zone as desperately out of memory * compaction considers all the zone as desperately out of memory
* so it doesn't really make much sense to retry except when the * so it doesn't really make much sense to retry except when the
* failure could be caused by weak migration mode. * failure could be caused by insufficient priority
*/ */
if (compaction_failed(compact_result)) { if (compaction_failed(compact_result)) {
if (*migrate_mode == MIGRATE_ASYNC) { if (*compact_priority > MIN_COMPACT_PRIORITY) {
*migrate_mode = MIGRATE_SYNC_LIGHT; (*compact_priority)--;
return true; return true;
} }
return false; return false;
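
The retry policy above can be read as stepping through compaction priorities until the most aggressive one has failed; a standalone model of that loop follows, with an illustrative enum rather than the kernel's actual compact_priority values.

#include <stdbool.h>
#include <stdio.h>

enum compact_priority_sketch {
        COMPACT_PRIO_SYNC_FULL,          /* minimum value: try hardest */
        COMPACT_PRIO_SYNC_LIGHT,
        COMPACT_PRIO_ASYNC,              /* initial priority: cheapest */
};

#define MIN_COMPACT_PRIORITY_SKETCH COMPACT_PRIO_SYNC_FULL

static bool retry_with_higher_priority(enum compact_priority_sketch *prio)
{
        if (*prio > MIN_COMPACT_PRIORITY_SKETCH) {
                (*prio)--;      /* lower value == higher effort */
                return true;
        }
        return false;           /* already at maximum effort, stop retrying */
}

int main(void)
{
        enum compact_priority_sketch prio = COMPACT_PRIO_ASYNC;

        while (retry_with_higher_priority(&prio))
                printf("retrying compaction at priority %d\n", prio);
        return 0;
}
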
@ -3285,7 +3191,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
static inline struct page * static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac, unsigned int alloc_flags, const struct alloc_context *ac,
enum migrate_mode mode, enum compact_result *compact_result) enum compact_priority prio, enum compact_result *compact_result)
{ {
*compact_result = COMPACT_SKIPPED; *compact_result = COMPACT_SKIPPED;
return NULL; return NULL;
@ -3294,7 +3200,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
static inline bool static inline bool
should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
enum compact_result compact_result, enum compact_result compact_result,
enum migrate_mode *migrate_mode, enum compact_priority *compact_priority,
int compaction_retries) int compaction_retries)
{ {
struct zone *zone; struct zone *zone;
@ -3362,8 +3268,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
return NULL; return NULL;
retry: retry:
page = get_page_from_freelist(gfp_mask, order, page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
/* /*
* If an allocation failed after direct reclaim, it could be because * If an allocation failed after direct reclaim, it could be because
@ -3384,10 +3289,14 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
{ {
struct zoneref *z; struct zoneref *z;
struct zone *zone; struct zone *zone;
pg_data_t *last_pgdat = NULL;
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
ac->high_zoneidx, ac->nodemask) ac->high_zoneidx, ac->nodemask) {
wakeup_kswapd(zone, order, ac_classzone_idx(ac)); if (last_pgdat != zone->zone_pgdat)
wakeup_kswapd(zone, order, ac->high_zoneidx);
last_pgdat = zone->zone_pgdat;
}
} }
static inline unsigned int static inline unsigned int
@ -3421,16 +3330,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt()) } else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER; alloc_flags |= ALLOC_HARDER;
if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
if (gfp_mask & __GFP_MEMALLOC)
alloc_flags |= ALLOC_NO_WATERMARKS;
else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
alloc_flags |= ALLOC_NO_WATERMARKS;
else if (!in_interrupt() &&
((current->flags & PF_MEMALLOC) ||
unlikely(test_thread_flag(TIF_MEMDIE))))
alloc_flags |= ALLOC_NO_WATERMARKS;
}
#ifdef CONFIG_CMA #ifdef CONFIG_CMA
if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA; alloc_flags |= ALLOC_CMA;
@ -3440,12 +3339,19 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
{ {
return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
} return false;
static inline bool is_thp_gfp_mask(gfp_t gfp_mask) if (gfp_mask & __GFP_MEMALLOC)
{ return true;
return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
return true;
if (!in_interrupt() &&
((current->flags & PF_MEMALLOC) ||
unlikely(test_thread_flag(TIF_MEMDIE))))
return true;
return false;
} }
/* /*
@ -3481,10 +3387,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
return false; return false;
/* /*
* Keep reclaiming pages while there is a chance this will lead somewhere. * Keep reclaiming pages while there is a chance this will lead
* If none of the target zones can satisfy our allocation request even * somewhere. If none of the target zones can satisfy our allocation
* if all reclaimable pages are considered then we are screwed and have * request even if all reclaimable pages are considered then we are
* to go OOM. * screwed and have to go OOM.
*/ */
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) { ac->nodemask) {
@ -3509,14 +3415,12 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* prevent from premature OOM * prevent from premature OOM
*/ */
if (!did_some_progress) { if (!did_some_progress) {
unsigned long writeback; unsigned long write_pending;
unsigned long dirty;
writeback = zone_page_state_snapshot(zone, write_pending = zone_page_state_snapshot(zone,
NR_WRITEBACK); NR_ZONE_WRITE_PENDING);
dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY);
if (2*(writeback + dirty) > reclaimable) { if (2 * write_pending > reclaimable) {
congestion_wait(BLK_RW_ASYNC, HZ/10); congestion_wait(BLK_RW_ASYNC, HZ/10);
return true; return true;
} }
@ -3551,7 +3455,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL; struct page *page = NULL;
unsigned int alloc_flags; unsigned int alloc_flags;
unsigned long did_some_progress; unsigned long did_some_progress;
enum migrate_mode migration_mode = MIGRATE_ASYNC; enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
enum compact_result compact_result; enum compact_result compact_result;
int compaction_retries = 0; int compaction_retries = 0;
int no_progress_loops = 0; int no_progress_loops = 0;
@ -3575,42 +3479,88 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC; gfp_mask &= ~__GFP_ATOMIC;
retry: /*
* The fast path uses conservative alloc_flags to succeed only until
* kswapd needs to be woken up, and to avoid the cost of setting up
* alloc_flags precisely. So we do that now.
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
if (gfp_mask & __GFP_KSWAPD_RECLAIM) if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac); wake_all_kswapds(order, ac);
/* /*
* OK, we're below the kswapd watermark and have kicked background * The adjusted alloc_flags might result in immediate success, so try
* reclaim. Now things get more complex, so set up alloc_flags according * that first
* to how we want to proceed.
*/ */
alloc_flags = gfp_to_alloc_flags(gfp_mask); page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
/*
* For costly allocations, try direct compaction first, as it's likely
* that we have enough base pages and don't need to reclaim. Don't try
* that for allocations that are allowed to ignore watermarks, as the
* ALLOC_NO_WATERMARKS attempt hasn't happened yet.
*/
if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
!gfp_pfmemalloc_allowed(gfp_mask)) {
page = __alloc_pages_direct_compact(gfp_mask, order,
alloc_flags, ac,
INIT_COMPACT_PRIORITY,
&compact_result);
if (page)
goto got_pg;
/*
* Checks for costly allocations with __GFP_NORETRY, which
* includes THP page fault allocations
*/
if (gfp_mask & __GFP_NORETRY) {
/*
* If compaction is deferred for high-order allocations,
* it is because sync compaction recently failed. If
* this is the case and the caller requested a THP
* allocation, we do not want to heavily disrupt the
* system, so we fail the allocation instead of entering
* direct reclaim.
*/
if (compact_result == COMPACT_DEFERRED)
goto nopage;
/*
* Looks like reclaim/compaction is worth trying, but
* sync compaction could be very expensive, so keep
* using async compaction.
*/
compact_priority = INIT_COMPACT_PRIORITY;
}
}
retry:
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
if (gfp_pfmemalloc_allowed(gfp_mask))
alloc_flags = ALLOC_NO_WATERMARKS;
/* /*
* Reset the zonelist iterators if memory policies can be ignored. * Reset the zonelist iterators if memory policies can be ignored.
* These allocations are high priority and system rather than user * These allocations are high priority and system rather than user
* orientated. * orientated.
*/ */
if ((alloc_flags & ALLOC_NO_WATERMARKS) || !(alloc_flags & ALLOC_CPUSET)) { if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask); ac->high_zoneidx, ac->nodemask);
} }
/* This is the last chance, in general, before the goto nopage. */ /* Attempt with potentially adjusted zonelist and alloc_flags */
page = get_page_from_freelist(gfp_mask, order, page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
if (page) if (page)
goto got_pg; goto got_pg;
/* Allocate without watermarks if the context allows */
if (alloc_flags & ALLOC_NO_WATERMARKS) {
page = get_page_from_freelist(gfp_mask, order,
ALLOC_NO_WATERMARKS, ac);
if (page)
goto got_pg;
}
/* Caller is not willing to reclaim, we can't balance anything */ /* Caller is not willing to reclaim, we can't balance anything */
if (!can_direct_reclaim) { if (!can_direct_reclaim) {
/* /*
@ -3640,38 +3590,6 @@ retry:
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage; goto nopage;
/*
* Try direct compaction. The first pass is asynchronous. Subsequent
* attempts after direct reclaim are synchronous
*/
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
migration_mode,
&compact_result);
if (page)
goto got_pg;
/* Checks for THP-specific high-order allocations */
if (is_thp_gfp_mask(gfp_mask)) {
/*
* If compaction is deferred for high-order allocations, it is
* because sync compaction recently failed. If this is the case
* and the caller requested a THP allocation, we do not want
* to heavily disrupt the system, so we fail the allocation
* instead of entering direct reclaim.
*/
if (compact_result == COMPACT_DEFERRED)
goto nopage;
/*
* Compaction is contended so rather back off than cause
* excessive stalls.
*/
if(compact_result == COMPACT_CONTENDED)
goto nopage;
}
if (order && compaction_made_progress(compact_result))
compaction_retries++;
/* Try direct reclaim and then allocating */ /* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
@ -3679,16 +3597,25 @@ retry:
if (page) if (page)
goto got_pg; goto got_pg;
/* Try direct compaction and then allocating */
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
compact_priority, &compact_result);
if (page)
goto got_pg;
if (order && compaction_made_progress(compact_result))
compaction_retries++;
/* Do not loop if specifically requested */ /* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY) if (gfp_mask & __GFP_NORETRY)
goto noretry; goto nopage;
/* /*
* Do not retry costly high order allocations unless they are * Do not retry costly high order allocations unless they are
* __GFP_REPEAT * __GFP_REPEAT
*/ */
if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
goto noretry; goto nopage;
/* /*
* Costly allocations might have made a progress but this doesn't mean * Costly allocations might have made a progress but this doesn't mean
@ -3712,7 +3639,7 @@ retry:
*/ */
if (did_some_progress > 0 && if (did_some_progress > 0 &&
should_compact_retry(ac, order, alloc_flags, should_compact_retry(ac, order, alloc_flags,
compact_result, &migration_mode, compact_result, &compact_priority,
compaction_retries)) compaction_retries))
goto retry; goto retry;
@ -3727,25 +3654,6 @@ retry:
goto retry; goto retry;
} }
noretry:
/*
* High-order allocations do not necessarily loop after direct reclaim
* and reclaim/compaction depends on compaction being called after
* reclaim so call directly if necessary.
* It can become very expensive to allocate transparent hugepages at
* fault, so use asynchronous memory compaction for THP unless it is
* khugepaged trying to collapse. All other requests should tolerate
* at least light sync migration.
*/
if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD))
migration_mode = MIGRATE_ASYNC;
else
migration_mode = MIGRATE_SYNC_LIGHT;
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
ac, migration_mode,
&compact_result);
if (page)
goto got_pg;
nopage: nopage:
warn_alloc_failed(gfp_mask, order, NULL); warn_alloc_failed(gfp_mask, order, NULL);
got_pg: got_pg:
@ -3761,7 +3669,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
{ {
struct page *page; struct page *page;
unsigned int cpuset_mems_cookie; unsigned int cpuset_mems_cookie;
unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; unsigned int alloc_flags = ALLOC_WMARK_LOW;
gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { struct alloc_context ac = {
.high_zoneidx = gfp_zone(gfp_mask), .high_zoneidx = gfp_zone(gfp_mask),
@ -4192,7 +4100,7 @@ EXPORT_SYMBOL_GPL(si_mem_available);
void si_meminfo(struct sysinfo *val) void si_meminfo(struct sysinfo *val)
{ {
val->totalram = totalram_pages; val->totalram = totalram_pages;
val->sharedram = global_page_state(NR_SHMEM); val->sharedram = global_node_page_state(NR_SHMEM);
val->freeram = global_page_state(NR_FREE_PAGES); val->freeram = global_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages(); val->bufferram = nr_blockdev_pages();
val->totalhigh = totalhigh_pages; val->totalhigh = totalhigh_pages;
@ -4214,8 +4122,8 @@ void si_meminfo_node(struct sysinfo *val, int nid)
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
managed_pages += pgdat->node_zones[zone_type].managed_pages; managed_pages += pgdat->node_zones[zone_type].managed_pages;
val->totalram = managed_pages; val->totalram = managed_pages;
val->sharedram = node_page_state(nid, NR_SHMEM); val->sharedram = node_page_state(pgdat, NR_SHMEM);
val->freeram = node_page_state(nid, NR_FREE_PAGES); val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM #ifdef CONFIG_HIGHMEM
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type]; struct zone *zone = &pgdat->node_zones[zone_type];
@ -4298,6 +4206,7 @@ void show_free_areas(unsigned int filter)
unsigned long free_pcp = 0; unsigned long free_pcp = 0;
int cpu; int cpu;
struct zone *zone; struct zone *zone;
pg_data_t *pgdat;
for_each_populated_zone(zone) { for_each_populated_zone(zone) {
if (skip_free_areas_node(filter, zone_to_nid(zone))) if (skip_free_areas_node(filter, zone_to_nid(zone)))
@ -4312,35 +4221,74 @@ void show_free_areas(unsigned int filter)
" unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
" anon_thp: %lu shmem_thp: %lu shmem_pmdmapped: %lu\n"
#endif
" free:%lu free_pcp:%lu free_cma:%lu\n", " free:%lu free_pcp:%lu free_cma:%lu\n",
global_page_state(NR_ACTIVE_ANON), global_node_page_state(NR_ACTIVE_ANON),
global_page_state(NR_INACTIVE_ANON), global_node_page_state(NR_INACTIVE_ANON),
global_page_state(NR_ISOLATED_ANON), global_node_page_state(NR_ISOLATED_ANON),
global_page_state(NR_ACTIVE_FILE), global_node_page_state(NR_ACTIVE_FILE),
global_page_state(NR_INACTIVE_FILE), global_node_page_state(NR_INACTIVE_FILE),
global_page_state(NR_ISOLATED_FILE), global_node_page_state(NR_ISOLATED_FILE),
global_page_state(NR_UNEVICTABLE), global_node_page_state(NR_UNEVICTABLE),
global_page_state(NR_FILE_DIRTY), global_node_page_state(NR_FILE_DIRTY),
global_page_state(NR_WRITEBACK), global_node_page_state(NR_WRITEBACK),
global_page_state(NR_UNSTABLE_NFS), global_node_page_state(NR_UNSTABLE_NFS),
global_page_state(NR_SLAB_RECLAIMABLE), global_page_state(NR_SLAB_RECLAIMABLE),
global_page_state(NR_SLAB_UNRECLAIMABLE), global_page_state(NR_SLAB_UNRECLAIMABLE),
global_page_state(NR_FILE_MAPPED), global_node_page_state(NR_FILE_MAPPED),
global_page_state(NR_SHMEM), global_node_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE), global_page_state(NR_PAGETABLE),
global_page_state(NR_BOUNCE), global_page_state(NR_BOUNCE),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR,
global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR,
global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR,
#endif
global_page_state(NR_FREE_PAGES), global_page_state(NR_FREE_PAGES),
free_pcp, free_pcp,
global_page_state(NR_FREE_CMA_PAGES)); global_page_state(NR_FREE_CMA_PAGES));
for_each_online_pgdat(pgdat) {
printk("Node %d"
" active_anon:%lukB"
" inactive_anon:%lukB"
" active_file:%lukB"
" inactive_file:%lukB"
" unevictable:%lukB"
" isolated(anon):%lukB"
" isolated(file):%lukB"
" mapped:%lukB"
" dirty:%lukB"
" writeback:%lukB"
" shmem:%lukB"
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
" shmem_thp: %lukB"
" shmem_pmdmapped: %lukB"
" anon_thp: %lukB"
#endif
" writeback_tmp:%lukB"
" unstable:%lukB"
" pages_scanned:%lu"
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
K(node_page_state(pgdat, NR_ACTIVE_ANON)),
K(node_page_state(pgdat, NR_INACTIVE_ANON)),
K(node_page_state(pgdat, NR_ACTIVE_FILE)),
K(node_page_state(pgdat, NR_INACTIVE_FILE)),
K(node_page_state(pgdat, NR_UNEVICTABLE)),
K(node_page_state(pgdat, NR_ISOLATED_ANON)),
K(node_page_state(pgdat, NR_ISOLATED_FILE)),
K(node_page_state(pgdat, NR_FILE_MAPPED)),
K(node_page_state(pgdat, NR_FILE_DIRTY)),
K(node_page_state(pgdat, NR_WRITEBACK)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
* HPAGE_PMD_NR),
K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
#endif
K(node_page_state(pgdat, NR_SHMEM)),
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
node_page_state(pgdat, NR_PAGES_SCANNED),
!pgdat_reclaimable(pgdat) ? "yes" : "no");
}
for_each_populated_zone(zone) { for_each_populated_zone(zone) {
int i; int i;
@ -4362,72 +4310,41 @@ void show_free_areas(unsigned int filter)
" active_file:%lukB" " active_file:%lukB"
" inactive_file:%lukB" " inactive_file:%lukB"
" unevictable:%lukB" " unevictable:%lukB"
" isolated(anon):%lukB" " writepending:%lukB"
" isolated(file):%lukB"
" present:%lukB" " present:%lukB"
" managed:%lukB" " managed:%lukB"
" mlocked:%lukB" " mlocked:%lukB"
" dirty:%lukB"
" writeback:%lukB"
" mapped:%lukB"
" shmem:%lukB"
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
" shmem_thp: %lukB"
" shmem_pmdmapped: %lukB"
" anon_thp: %lukB"
#endif
" slab_reclaimable:%lukB" " slab_reclaimable:%lukB"
" slab_unreclaimable:%lukB" " slab_unreclaimable:%lukB"
" kernel_stack:%lukB" " kernel_stack:%lukB"
" pagetables:%lukB" " pagetables:%lukB"
" unstable:%lukB"
" bounce:%lukB" " bounce:%lukB"
" free_pcp:%lukB" " free_pcp:%lukB"
" local_pcp:%ukB" " local_pcp:%ukB"
" free_cma:%lukB" " free_cma:%lukB"
" writeback_tmp:%lukB"
" pages_scanned:%lu"
" all_unreclaimable? %s"
"\n", "\n",
zone->name, zone->name,
K(zone_page_state(zone, NR_FREE_PAGES)), K(zone_page_state(zone, NR_FREE_PAGES)),
K(min_wmark_pages(zone)), K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)), K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)), K(high_wmark_pages(zone)),
K(zone_page_state(zone, NR_ACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
K(zone_page_state(zone, NR_INACTIVE_FILE)), K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
K(zone_page_state(zone, NR_UNEVICTABLE)), K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
K(zone_page_state(zone, NR_ISOLATED_ANON)), K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
K(zone_page_state(zone, NR_ISOLATED_FILE)),
K(zone->present_pages), K(zone->present_pages),
K(zone->managed_pages), K(zone->managed_pages),
K(zone_page_state(zone, NR_MLOCK)), K(zone_page_state(zone, NR_MLOCK)),
K(zone_page_state(zone, NR_FILE_DIRTY)),
K(zone_page_state(zone, NR_WRITEBACK)),
K(zone_page_state(zone, NR_FILE_MAPPED)),
K(zone_page_state(zone, NR_SHMEM)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
K(zone_page_state(zone, NR_SHMEM_THPS) * HPAGE_PMD_NR),
K(zone_page_state(zone, NR_SHMEM_PMDMAPPED)
* HPAGE_PMD_NR),
K(zone_page_state(zone, NR_ANON_THPS) * HPAGE_PMD_NR),
#endif
K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
zone_page_state(zone, NR_KERNEL_STACK) * zone_page_state(zone, NR_KERNEL_STACK_KB),
THREAD_SIZE / 1024,
K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_UNSTABLE_NFS)),
K(zone_page_state(zone, NR_BOUNCE)), K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp), K(free_pcp),
K(this_cpu_read(zone->pageset->pcp.count)), K(this_cpu_read(zone->pageset->pcp.count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
K(zone_page_state(zone, NR_PAGES_SCANNED)),
(!zone_reclaimable(zone) ? "yes" : "no")
);
printk("lowmem_reserve[]:"); printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++) for (i = 0; i < MAX_NR_ZONES; i++)
printk(" %ld", zone->lowmem_reserve[i]); printk(" %ld", zone->lowmem_reserve[i]);
@ -4469,7 +4386,7 @@ void show_free_areas(unsigned int filter)
hugetlb_show_meminfo(); hugetlb_show_meminfo();
printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
show_swap_cache_info(); show_swap_cache_info();
} }
@ -5340,6 +5257,11 @@ static void __meminit setup_zone_pageset(struct zone *zone)
zone->pageset = alloc_percpu(struct per_cpu_pageset); zone->pageset = alloc_percpu(struct per_cpu_pageset);
for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
zone_pageset_init(zone, cpu); zone_pageset_init(zone, cpu);
if (!zone->zone_pgdat->per_cpu_nodestats) {
zone->zone_pgdat->per_cpu_nodestats =
alloc_percpu(struct per_cpu_nodestat);
}
} }
/* /*
@ -5909,6 +5831,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
init_waitqueue_head(&pgdat->kcompactd_wait); init_waitqueue_head(&pgdat->kcompactd_wait);
#endif #endif
pgdat_page_ext_init(pgdat); pgdat_page_ext_init(pgdat);
spin_lock_init(&pgdat->lru_lock);
lruvec_init(node_lruvec(pgdat));
for (j = 0; j < MAX_NR_ZONES; j++) { for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j; struct zone *zone = pgdat->node_zones + j;
@ -5958,21 +5882,16 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
zone->node = nid; zone->node = nid;
zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio)
/ 100; / 100;
zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100;
#endif #endif
zone->name = zone_names[j]; zone->name = zone_names[j];
spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
zone_seqlock_init(zone);
zone->zone_pgdat = pgdat; zone->zone_pgdat = pgdat;
spin_lock_init(&zone->lock);
zone_seqlock_init(zone);
zone_pcp_init(zone); zone_pcp_init(zone);
/* For bootup, initialized properly in watermark setup */
mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
lruvec_init(&zone->lruvec);
if (!size) if (!size)
continue; continue;
@ -6038,11 +5957,12 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
unsigned long end_pfn = 0; unsigned long end_pfn = 0;
/* pg_data_t should be reset to zero when it's allocated */ /* pg_data_t should be reset to zero when it's allocated */
WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
reset_deferred_meminit(pgdat); reset_deferred_meminit(pgdat);
pgdat->node_id = nid; pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn; pgdat->node_start_pfn = node_start_pfn;
pgdat->per_cpu_nodestats = NULL;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
@ -6699,6 +6619,9 @@ static void calculate_totalreserve_pages(void)
enum zone_type i, j; enum zone_type i, j;
for_each_online_pgdat(pgdat) { for_each_online_pgdat(pgdat) {
pgdat->totalreserve_pages = 0;
for (i = 0; i < MAX_NR_ZONES; i++) { for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i; struct zone *zone = pgdat->node_zones + i;
long max = 0; long max = 0;
@ -6715,7 +6638,7 @@ static void calculate_totalreserve_pages(void)
if (max > zone->managed_pages) if (max > zone->managed_pages)
max = zone->managed_pages; max = zone->managed_pages;
zone->totalreserve_pages = max; pgdat->totalreserve_pages += max;
reserve_pages += max; reserve_pages += max;
} }
@ -6816,10 +6739,6 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
__mod_zone_page_state(zone, NR_ALLOC_BATCH,
high_wmark_pages(zone) - low_wmark_pages(zone) -
atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
spin_unlock_irqrestore(&zone->lock, flags); spin_unlock_irqrestore(&zone->lock, flags);
} }
@ -6930,6 +6849,7 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos) void __user *buffer, size_t *length, loff_t *ppos)
{ {
struct pglist_data *pgdat;
struct zone *zone; struct zone *zone;
int rc; int rc;
@ -6937,8 +6857,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
if (rc) if (rc)
return rc; return rc;
for_each_online_pgdat(pgdat)
pgdat->min_slab_pages = 0;
for_each_zone(zone) for_each_zone(zone)
zone->min_unmapped_pages = (zone->managed_pages * zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
sysctl_min_unmapped_ratio) / 100; sysctl_min_unmapped_ratio) / 100;
return 0; return 0;
} }
@ -6946,6 +6869,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos) void __user *buffer, size_t *length, loff_t *ppos)
{ {
struct pglist_data *pgdat;
struct zone *zone; struct zone *zone;
int rc; int rc;
@ -6953,8 +6877,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
if (rc) if (rc)
return rc; return rc;
for_each_online_pgdat(pgdat)
pgdat->min_slab_pages = 0;
for_each_zone(zone) for_each_zone(zone)
zone->min_slab_pages = (zone->managed_pages * zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
sysctl_min_slab_ratio) / 100; sysctl_min_slab_ratio) / 100;
return 0; return 0;
} }

View file

@ -41,12 +41,12 @@ static struct page *page_idle_get_page(unsigned long pfn)
return NULL; return NULL;
zone = page_zone(page); zone = page_zone(page);
spin_lock_irq(&zone->lru_lock); spin_lock_irq(zone_lru_lock(zone));
if (unlikely(!PageLRU(page))) { if (unlikely(!PageLRU(page))) {
put_page(page); put_page(page);
page = NULL; page = NULL;
} }
spin_unlock_irq(&zone->lru_lock); spin_unlock_irq(zone_lru_lock(zone));
return page; return page;
} }

View file

@ -166,6 +166,8 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
unsigned block_in_page; unsigned block_in_page;
sector_t first_block; sector_t first_block;
cond_resched();
first_block = bmap(inode, probe_block); first_block = bmap(inode, probe_block);
if (first_block == 0) if (first_block == 0)
goto bad_bmap; goto bad_bmap;

View file

@ -27,7 +27,7 @@
* mapping->i_mmap_rwsem * mapping->i_mmap_rwsem
* anon_vma->rwsem * anon_vma->rwsem
* mm->page_table_lock or pte_lock * mm->page_table_lock or pte_lock
* zone->lru_lock (in mark_page_accessed, isolate_lru_page) * zone_lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get) * swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others) * mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers) * mapping->private_lock (in __set_page_dirty_buffers)
@ -1213,8 +1213,8 @@ void do_page_add_anon_rmap(struct page *page,
* disabled. * disabled.
*/ */
if (compound) if (compound)
__inc_zone_page_state(page, NR_ANON_THPS); __inc_node_page_state(page, NR_ANON_THPS);
__mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
} }
if (unlikely(PageKsm(page))) if (unlikely(PageKsm(page)))
return; return;
@ -1251,14 +1251,14 @@ void page_add_new_anon_rmap(struct page *page,
VM_BUG_ON_PAGE(!PageTransHuge(page), page); VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/* increment count (starts at -1) */ /* increment count (starts at -1) */
atomic_set(compound_mapcount_ptr(page), 0); atomic_set(compound_mapcount_ptr(page), 0);
__inc_zone_page_state(page, NR_ANON_THPS); __inc_node_page_state(page, NR_ANON_THPS);
} else { } else {
/* Anon THP always mapped first with PMD */ /* Anon THP always mapped first with PMD */
VM_BUG_ON_PAGE(PageTransCompound(page), page); VM_BUG_ON_PAGE(PageTransCompound(page), page);
/* increment count (starts at -1) */ /* increment count (starts at -1) */
atomic_set(&page->_mapcount, 0); atomic_set(&page->_mapcount, 0);
} }
__mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
__page_set_anon_rmap(page, vma, address, 1); __page_set_anon_rmap(page, vma, address, 1);
} }
@ -1282,7 +1282,7 @@ void page_add_file_rmap(struct page *page, bool compound)
if (!atomic_inc_and_test(compound_mapcount_ptr(page))) if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
goto out; goto out;
VM_BUG_ON_PAGE(!PageSwapBacked(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
__inc_zone_page_state(page, NR_SHMEM_PMDMAPPED); __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
} else { } else {
if (PageTransCompound(page)) { if (PageTransCompound(page)) {
VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page);
@ -1293,7 +1293,7 @@ void page_add_file_rmap(struct page *page, bool compound)
if (!atomic_inc_and_test(&page->_mapcount)) if (!atomic_inc_and_test(&page->_mapcount))
goto out; goto out;
} }
__mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, nr); __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr);
mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
out: out:
unlock_page_memcg(page); unlock_page_memcg(page);
@ -1322,18 +1322,18 @@ static void page_remove_file_rmap(struct page *page, bool compound)
if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
goto out; goto out;
VM_BUG_ON_PAGE(!PageSwapBacked(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
__dec_zone_page_state(page, NR_SHMEM_PMDMAPPED); __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
} else { } else {
if (!atomic_add_negative(-1, &page->_mapcount)) if (!atomic_add_negative(-1, &page->_mapcount))
goto out; goto out;
} }
/* /*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because * We use the irq-unsafe __{inc|mod}_zone_page_state because
* these counters are not modified in interrupt context, and * these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled. * pte lock(a spinlock) is held, which implies preemption disabled.
*/ */
__mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, -nr); __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr);
mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
if (unlikely(PageMlocked(page))) if (unlikely(PageMlocked(page)))
@ -1356,7 +1356,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return; return;
__dec_zone_page_state(page, NR_ANON_THPS); __dec_node_page_state(page, NR_ANON_THPS);
if (TestClearPageDoubleMap(page)) { if (TestClearPageDoubleMap(page)) {
/* /*
@ -1375,7 +1375,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
clear_page_mlock(page); clear_page_mlock(page);
if (nr) { if (nr) {
__mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr); __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
deferred_split_huge_page(page); deferred_split_huge_page(page);
} }
} }
@ -1404,7 +1404,7 @@ void page_remove_rmap(struct page *page, bool compound)
* these counters are not modified in interrupt context, and * these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled. * pte lock(a spinlock) is held, which implies preemption disabled.
*/ */
__dec_zone_page_state(page, NR_ANON_PAGES); __dec_node_page_state(page, NR_ANON_MAPPED);
if (unlikely(PageMlocked(page))) if (unlikely(PageMlocked(page)))
clear_page_mlock(page); clear_page_mlock(page);

View file

@ -575,9 +575,9 @@ static int shmem_add_to_page_cache(struct page *page,
if (!error) { if (!error) {
mapping->nrpages += nr; mapping->nrpages += nr;
if (PageTransHuge(page)) if (PageTransHuge(page))
__inc_zone_page_state(page, NR_SHMEM_THPS); __inc_node_page_state(page, NR_SHMEM_THPS);
__mod_zone_page_state(page_zone(page), NR_FILE_PAGES, nr); __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
__mod_zone_page_state(page_zone(page), NR_SHMEM, nr); __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
spin_unlock_irq(&mapping->tree_lock); spin_unlock_irq(&mapping->tree_lock);
} else { } else {
page->mapping = NULL; page->mapping = NULL;
@ -601,8 +601,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
error = shmem_radix_tree_replace(mapping, page->index, page, radswap); error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
page->mapping = NULL; page->mapping = NULL;
mapping->nrpages--; mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES); __dec_node_page_state(page, NR_FILE_PAGES);
__dec_zone_page_state(page, NR_SHMEM); __dec_node_page_state(page, NR_SHMEM);
spin_unlock_irq(&mapping->tree_lock); spin_unlock_irq(&mapping->tree_lock);
put_page(page); put_page(page);
BUG_ON(error); BUG_ON(error);
@ -1493,8 +1493,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
newpage); newpage);
if (!error) { if (!error) {
__inc_zone_page_state(newpage, NR_FILE_PAGES); __inc_node_page_state(newpage, NR_FILE_PAGES);
__dec_zone_page_state(oldpage, NR_FILE_PAGES); __dec_node_page_state(oldpage, NR_FILE_PAGES);
} }
spin_unlock_irq(&swap_mapping->tree_lock); spin_unlock_irq(&swap_mapping->tree_lock);

View file

@ -369,6 +369,8 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
return s->object_size; return s->object_size;
# endif # endif
if (s->flags & SLAB_KASAN)
return s->object_size;
/* /*
* If we have the need to store the freelist pointer * If we have the need to store the freelist pointer
* back there or track user information then we can * back there or track user information then we can

View file

@ -124,7 +124,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
#endif #endif
} }
static inline void *fixup_red_left(struct kmem_cache *s, void *p) inline void *fixup_red_left(struct kmem_cache *s, void *p)
{ {
if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
p += s->red_left_pad; p += s->red_left_pad;
@ -454,8 +454,6 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p)
*/ */
#if defined(CONFIG_SLUB_DEBUG_ON) #if defined(CONFIG_SLUB_DEBUG_ON)
static int slub_debug = DEBUG_DEFAULT_FLAGS; static int slub_debug = DEBUG_DEFAULT_FLAGS;
#elif defined(CONFIG_KASAN)
static int slub_debug = SLAB_STORE_USER;
#else #else
static int slub_debug; static int slub_debug;
#endif #endif
@ -660,6 +658,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (s->flags & SLAB_STORE_USER) if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track); off += 2 * sizeof(struct track);
off += kasan_metadata_size(s);
if (off != size_from_object(s)) if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */ /* Beginning of the filler is the free pointer */
print_section("Padding ", p + off, size_from_object(s) - off); print_section("Padding ", p + off, size_from_object(s) - off);
@ -787,6 +787,8 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
/* We also have user information there */ /* We also have user information there */
off += 2 * sizeof(struct track); off += 2 * sizeof(struct track);
off += kasan_metadata_size(s);
if (size_from_object(s) == off) if (size_from_object(s) == off)
return 1; return 1;
@ -1322,8 +1324,10 @@ static inline void kfree_hook(const void *x)
kasan_kfree_large(x); kasan_kfree_large(x);
} }
static inline void slab_free_hook(struct kmem_cache *s, void *x) static inline void *slab_free_hook(struct kmem_cache *s, void *x)
{ {
void *freeptr;
kmemleak_free_recursive(x, s->flags); kmemleak_free_recursive(x, s->flags);
/* /*
@ -1344,7 +1348,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
if (!(s->flags & SLAB_DEBUG_OBJECTS)) if (!(s->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(x, s->object_size); debug_check_no_obj_freed(x, s->object_size);
freeptr = get_freepointer(s, x);
/*
* kasan_slab_free() may put x into memory quarantine, delaying its
* reuse. In this case the object's freelist pointer is changed.
*/
kasan_slab_free(s, x); kasan_slab_free(s, x);
return freeptr;
} }
static inline void slab_free_freelist_hook(struct kmem_cache *s, static inline void slab_free_freelist_hook(struct kmem_cache *s,
@ -1362,11 +1372,11 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s,
void *object = head; void *object = head;
void *tail_obj = tail ? : head; void *tail_obj = tail ? : head;
void *freeptr;
do { do {
slab_free_hook(s, object); freeptr = slab_free_hook(s, object);
} while ((object != tail_obj) && } while ((object != tail_obj) && (object = freeptr));
(object = get_freepointer(s, object)));
#endif #endif
} }
@ -2878,16 +2888,13 @@ slab_empty:
* same page) possible by specifying head and tail ptr, plus objects * same page) possible by specifying head and tail ptr, plus objects
* count (cnt). Bulk free indicated by tail pointer being set. * count (cnt). Bulk free indicated by tail pointer being set.
*/ */
static __always_inline void slab_free(struct kmem_cache *s, struct page *page, static __always_inline void do_slab_free(struct kmem_cache *s,
void *head, void *tail, int cnt, struct page *page, void *head, void *tail,
unsigned long addr) int cnt, unsigned long addr)
{ {
void *tail_obj = tail ? : head; void *tail_obj = tail ? : head;
struct kmem_cache_cpu *c; struct kmem_cache_cpu *c;
unsigned long tid; unsigned long tid;
slab_free_freelist_hook(s, head, tail);
redo: redo:
/* /*
* Determine the current cpu's per cpu slab. * Determine the current cpu's per cpu slab.
@ -2921,6 +2928,27 @@ redo:
} }
static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
void *head, void *tail, int cnt,
unsigned long addr)
{
slab_free_freelist_hook(s, head, tail);
/*
* slab_free_freelist_hook() could have put the items into quarantine.
* If so, no need to free them.
*/
if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU))
return;
do_slab_free(s, page, head, tail, cnt, addr);
}
#ifdef CONFIG_KASAN
void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
{
do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
}
#endif
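
The hunks above make slab_free_hook() return the freelist pointer and split slab_free() into a hook stage plus do_slab_free(), because kasan_slab_free() may park the object in a quarantine and reuse its freelist word. Below is a minimal userspace sketch of that ordering constraint; obj, quarantine_put() and free_list() are invented for the demo and are not kernel API.

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

/* First word of every object doubles as the freelist / quarantine link. */
struct obj {
    struct obj *next;
};

static struct obj *quarantine;
static int nr_quarantined;

/* Stand-in for kasan_slab_free(): may keep the object and reuse ->next. */
static bool quarantine_put(struct obj *o)
{
    if (nr_quarantined >= 2)
        return false;           /* "quarantine full": free immediately */
    o->next = quarantine;       /* clobbers the freelist pointer */
    quarantine = o;
    nr_quarantined++;
    return true;                /* freeing is deferred */
}

/* Stand-in for slab_free_freelist_hook(): note the order of operations. */
static void free_list(struct obj *head)
{
    while (head) {
        struct obj *next = head->next;  /* read before the hook runs */

        if (!quarantine_put(head))
            free(head);                 /* immediate free path */
        head = next;                    /* head->next may now point into
                                         * the quarantine list instead */
    }
}

int main(void)
{
    struct obj *head = NULL;
    int drained = 0;

    /* Build a three-object freelist. */
    for (int i = 0; i < 3; i++) {
        struct obj *o = malloc(sizeof(*o));
        o->next = head;
        head = o;
    }
    free_list(head);

    /* Drain the quarantine later, in the spirit of ___cache_free(). */
    while (quarantine) {
        struct obj *o = quarantine;
        quarantine = o->next;
        free(o);
        drained++;
    }
    printf("%d objects were quarantined before their final free\n", drained);
    return 0;
}
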
void kmem_cache_free(struct kmem_cache *s, void *x) void kmem_cache_free(struct kmem_cache *s, void *x)
{ {
s = cache_from_obj(s, x); s = cache_from_obj(s, x);
@ -3363,7 +3391,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)
static int calculate_sizes(struct kmem_cache *s, int forced_order) static int calculate_sizes(struct kmem_cache *s, int forced_order)
{ {
unsigned long flags = s->flags; unsigned long flags = s->flags;
unsigned long size = s->object_size; size_t size = s->object_size;
int order; int order;
/* /*
@ -3422,7 +3450,10 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* the object. * the object.
*/ */
size += 2 * sizeof(struct track); size += 2 * sizeof(struct track);
#endif
kasan_cache_create(s, &size, &s->flags);
#ifdef CONFIG_SLUB_DEBUG
if (flags & SLAB_RED_ZONE) { if (flags & SLAB_RED_ZONE) {
/* /*
* Add some empty padding so that we can catch * Add some empty padding so that we can catch

View file

@ -100,11 +100,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
} }
#endif #endif
/* #ifdef CONFIG_SPARSEMEM_EXTREME
* Although written for the SPARSEMEM_EXTREME case, this happens
* to also work for the flat array case because
* NR_SECTION_ROOTS==NR_MEM_SECTIONS.
*/
int __section_nr(struct mem_section* ms) int __section_nr(struct mem_section* ms)
{ {
unsigned long root_nr; unsigned long root_nr;
@ -123,6 +119,12 @@ int __section_nr(struct mem_section* ms)
return (root_nr * SECTIONS_PER_ROOT) + (ms - root); return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
} }
#else
int __section_nr(struct mem_section* ms)
{
return (int)(ms - mem_section[0]);
}
#endif
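
The hunk above splits __section_nr() into a SPARSEMEM_EXTREME variant that searches the root table and a flat-array variant that is plain pointer arithmetic. A small userspace sketch of both lookups follows; the sizes (NR_ROOTS, SECTIONS_PER_ROOT) are made up and the layout is only a stand-in for the real mem_section arrangement.

#include <stdio.h>
#include <stdlib.h>

#define SECTIONS_PER_ROOT 4
#define NR_ROOTS          8

struct mem_section { unsigned long section_mem_map; };

/* "extreme" layout: a table of root pointers to separately allocated blocks */
static struct mem_section *roots[NR_ROOTS];

static int section_nr_extreme(struct mem_section *ms)
{
    for (int root = 0; root < NR_ROOTS; root++) {
        if (!roots[root])
            continue;
        if (ms >= roots[root] && ms < roots[root] + SECTIONS_PER_ROOT)
            return root * SECTIONS_PER_ROOT + (int)(ms - roots[root]);
    }
    return -1;
}

/* flat layout: one contiguous array, the answer is pointer arithmetic */
static struct mem_section flat[NR_ROOTS * SECTIONS_PER_ROOT];

static int section_nr_flat(struct mem_section *ms)
{
    return (int)(ms - &flat[0]);
}

int main(void)
{
    roots[2] = calloc(SECTIONS_PER_ROOT, sizeof(struct mem_section));
    printf("extreme: %d\n", section_nr_extreme(&roots[2][3])); /* 2*4+3 = 11 */
    printf("flat:    %d\n", section_nr_flat(&flat[11]));       /* 11 */
    free(roots[2]);
    return 0;
}
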
/* /*
* During early boot, before section_mem_map is used for an actual * During early boot, before section_mem_map is used for an actual

View file

@ -62,12 +62,12 @@ static void __page_cache_release(struct page *page)
struct lruvec *lruvec; struct lruvec *lruvec;
unsigned long flags; unsigned long flags;
spin_lock_irqsave(&zone->lru_lock, flags); spin_lock_irqsave(zone_lru_lock(zone), flags);
lruvec = mem_cgroup_page_lruvec(page, zone); lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
VM_BUG_ON_PAGE(!PageLRU(page), page); VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page); __ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page)); del_page_from_lru_list(page, lruvec, page_off_lru(page));
spin_unlock_irqrestore(&zone->lru_lock, flags); spin_unlock_irqrestore(zone_lru_lock(zone), flags);
} }
mem_cgroup_uncharge(page); mem_cgroup_uncharge(page);
} }
@ -179,26 +179,26 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
void *arg) void *arg)
{ {
int i; int i;
struct zone *zone = NULL; struct pglist_data *pgdat = NULL;
struct lruvec *lruvec; struct lruvec *lruvec;
unsigned long flags = 0; unsigned long flags = 0;
for (i = 0; i < pagevec_count(pvec); i++) { for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i]; struct page *page = pvec->pages[i];
struct zone *pagezone = page_zone(page); struct pglist_data *pagepgdat = page_pgdat(page);
if (pagezone != zone) { if (pagepgdat != pgdat) {
if (zone) if (pgdat)
spin_unlock_irqrestore(&zone->lru_lock, flags); spin_unlock_irqrestore(&pgdat->lru_lock, flags);
zone = pagezone; pgdat = pagepgdat;
spin_lock_irqsave(&zone->lru_lock, flags); spin_lock_irqsave(&pgdat->lru_lock, flags);
} }
lruvec = mem_cgroup_page_lruvec(page, zone); lruvec = mem_cgroup_page_lruvec(page, pgdat);
(*move_fn)(page, lruvec, arg); (*move_fn)(page, lruvec, arg);
} }
if (zone) if (pgdat)
spin_unlock_irqrestore(&zone->lru_lock, flags); spin_unlock_irqrestore(&pgdat->lru_lock, flags);
release_pages(pvec->pages, pvec->nr, pvec->cold); release_pages(pvec->pages, pvec->nr, pvec->cold);
pagevec_reinit(pvec); pagevec_reinit(pvec);
} }
@ -318,9 +318,9 @@ void activate_page(struct page *page)
struct zone *zone = page_zone(page); struct zone *zone = page_zone(page);
page = compound_head(page); page = compound_head(page);
spin_lock_irq(&zone->lru_lock); spin_lock_irq(zone_lru_lock(zone));
__activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL);
spin_unlock_irq(&zone->lru_lock); spin_unlock_irq(zone_lru_lock(zone));
} }
#endif #endif
@ -445,16 +445,16 @@ void lru_cache_add(struct page *page)
*/ */
void add_page_to_unevictable_list(struct page *page) void add_page_to_unevictable_list(struct page *page)
{ {
struct zone *zone = page_zone(page); struct pglist_data *pgdat = page_pgdat(page);
struct lruvec *lruvec; struct lruvec *lruvec;
spin_lock_irq(&zone->lru_lock); spin_lock_irq(&pgdat->lru_lock);
lruvec = mem_cgroup_page_lruvec(page, zone); lruvec = mem_cgroup_page_lruvec(page, pgdat);
ClearPageActive(page); ClearPageActive(page);
SetPageUnevictable(page); SetPageUnevictable(page);
SetPageLRU(page); SetPageLRU(page);
add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
spin_unlock_irq(&zone->lru_lock); spin_unlock_irq(&pgdat->lru_lock);
} }
/** /**
@ -730,7 +730,7 @@ void release_pages(struct page **pages, int nr, bool cold)
{ {
int i; int i;
LIST_HEAD(pages_to_free); LIST_HEAD(pages_to_free);
struct zone *zone = NULL; struct pglist_data *locked_pgdat = NULL;
struct lruvec *lruvec; struct lruvec *lruvec;
unsigned long uninitialized_var(flags); unsigned long uninitialized_var(flags);
unsigned int uninitialized_var(lock_batch); unsigned int uninitialized_var(lock_batch);
@ -741,11 +741,11 @@ void release_pages(struct page **pages, int nr, bool cold)
/* /*
* Make sure the IRQ-safe lock-holding time does not get * Make sure the IRQ-safe lock-holding time does not get
* excessive with a continuous string of pages from the * excessive with a continuous string of pages from the
* same zone. The lock is held only if zone != NULL. * same pgdat. The lock is held only if pgdat != NULL.
*/ */
if (zone && ++lock_batch == SWAP_CLUSTER_MAX) { if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
spin_unlock_irqrestore(&zone->lru_lock, flags); spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
zone = NULL; locked_pgdat = NULL;
} }
if (is_huge_zero_page(page)) { if (is_huge_zero_page(page)) {
@ -758,27 +758,27 @@ void release_pages(struct page **pages, int nr, bool cold)
continue; continue;
if (PageCompound(page)) { if (PageCompound(page)) {
if (zone) { if (locked_pgdat) {
spin_unlock_irqrestore(&zone->lru_lock, flags); spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
zone = NULL; locked_pgdat = NULL;
} }
__put_compound_page(page); __put_compound_page(page);
continue; continue;
} }
if (PageLRU(page)) { if (PageLRU(page)) {
struct zone *pagezone = page_zone(page); struct pglist_data *pgdat = page_pgdat(page);
if (pagezone != zone) { if (pgdat != locked_pgdat) {
if (zone) if (locked_pgdat)
spin_unlock_irqrestore(&zone->lru_lock, spin_unlock_irqrestore(&locked_pgdat->lru_lock,
flags); flags);
lock_batch = 0; lock_batch = 0;
zone = pagezone; locked_pgdat = pgdat;
spin_lock_irqsave(&zone->lru_lock, flags); spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
} }
lruvec = mem_cgroup_page_lruvec(page, zone); lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
VM_BUG_ON_PAGE(!PageLRU(page), page); VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page); __ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page)); del_page_from_lru_list(page, lruvec, page_off_lru(page));
@ -789,8 +789,8 @@ void release_pages(struct page **pages, int nr, bool cold)
list_add(&page->lru, &pages_to_free); list_add(&page->lru, &pages_to_free);
} }
if (zone) if (locked_pgdat)
spin_unlock_irqrestore(&zone->lru_lock, flags); spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
mem_cgroup_uncharge_list(&pages_to_free); mem_cgroup_uncharge_list(&pages_to_free);
free_hot_cold_page_list(&pages_to_free, cold); free_hot_cold_page_list(&pages_to_free, cold);
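
The release_pages() and pagevec_lru_move_fn() hunks above share one pattern: hold the per-node lru_lock only while consecutive pages belong to the same node, and additionally drop it every SWAP_CLUSTER_MAX pages so the IRQ-off hold time stays bounded. Here is a hedged sketch of that batching; a pthread mutex stands in for the irq-saving spinlock, BATCH for SWAP_CLUSTER_MAX, and an int for the node a page belongs to.

#include <stdio.h>
#include <pthread.h>

#define BATCH 4

static pthread_mutex_t node_lock[2] = {
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

static void release_items(const int *node_of_item, int nr)
{
    int locked_node = -1;       /* -1: no lock held */
    int lock_batch = 0;

    for (int i = 0; i < nr; i++) {
        int node = node_of_item[i];

        /* Cap the hold time on a long run from the same node. */
        if (locked_node >= 0 && ++lock_batch == BATCH) {
            pthread_mutex_unlock(&node_lock[locked_node]);
            locked_node = -1;
        }

        /* Switch locks when the node changes. */
        if (node != locked_node) {
            if (locked_node >= 0)
                pthread_mutex_unlock(&node_lock[locked_node]);
            lock_batch = 0;
            locked_node = node;
            pthread_mutex_lock(&node_lock[locked_node]);
        }

        printf("item %d handled under node %d lock\n", i, node);
    }

    if (locked_node >= 0)
        pthread_mutex_unlock(&node_lock[locked_node]);
}

int main(void)
{
    const int nodes[10] = { 0, 0, 0, 0, 0, 0, 1, 1, 0, 1 };

    release_items(nodes, 10);
    return 0;
}
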
@ -826,7 +826,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
VM_BUG_ON_PAGE(PageCompound(page_tail), page); VM_BUG_ON_PAGE(PageCompound(page_tail), page);
VM_BUG_ON_PAGE(PageLRU(page_tail), page); VM_BUG_ON_PAGE(PageLRU(page_tail), page);
VM_BUG_ON(NR_CPUS != 1 && VM_BUG_ON(NR_CPUS != 1 &&
!spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock));
if (!list) if (!list)
SetPageLRU(page_tail); SetPageLRU(page_tail);

View file

@ -95,7 +95,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
entry.val, page); entry.val, page);
if (likely(!error)) { if (likely(!error)) {
address_space->nrpages++; address_space->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES); __inc_node_page_state(page, NR_FILE_PAGES);
INC_CACHE_INFO(add_total); INC_CACHE_INFO(add_total);
} }
spin_unlock_irq(&address_space->tree_lock); spin_unlock_irq(&address_space->tree_lock);
@ -147,7 +147,7 @@ void __delete_from_swap_cache(struct page *page)
set_page_private(page, 0); set_page_private(page, 0);
ClearPageSwapCache(page); ClearPageSwapCache(page);
address_space->nrpages--; address_space->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES); __dec_node_page_state(page, NR_FILE_PAGES);
INC_CACHE_INFO(del_total); INC_CACHE_INFO(del_total);
} }

View file

@ -528,7 +528,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
free = global_page_state(NR_FREE_PAGES); free = global_page_state(NR_FREE_PAGES);
free += global_page_state(NR_FILE_PAGES); free += global_node_page_state(NR_FILE_PAGES);
/* /*
* shmem pages shouldn't be counted as free in this * shmem pages shouldn't be counted as free in this
@ -536,7 +536,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
* that won't affect the overall amount of available * that won't affect the overall amount of available
* memory in the system. * memory in the system.
*/ */
free -= global_page_state(NR_SHMEM); free -= global_node_page_state(NR_SHMEM);
free += get_nr_swap_pages(); free += get_nr_swap_pages();

File diff suppressed because it is too large

View file

@ -86,8 +86,10 @@ void vm_events_fold_cpu(int cpu)
* *
* vm_stat contains the global counters * vm_stat contains the global counters
*/ */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_stat); atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
EXPORT_SYMBOL(vm_node_stat);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
@ -167,19 +169,36 @@ int calculate_normal_threshold(struct zone *zone)
*/ */
void refresh_zone_stat_thresholds(void) void refresh_zone_stat_thresholds(void)
{ {
struct pglist_data *pgdat;
struct zone *zone; struct zone *zone;
int cpu; int cpu;
int threshold; int threshold;
/* Zero current pgdat thresholds */
for_each_online_pgdat(pgdat) {
for_each_online_cpu(cpu) {
per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
}
}
for_each_populated_zone(zone) { for_each_populated_zone(zone) {
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long max_drift, tolerate_drift; unsigned long max_drift, tolerate_drift;
threshold = calculate_normal_threshold(zone); threshold = calculate_normal_threshold(zone);
for_each_online_cpu(cpu) for_each_online_cpu(cpu) {
int pgdat_threshold;
per_cpu_ptr(zone->pageset, cpu)->stat_threshold per_cpu_ptr(zone->pageset, cpu)->stat_threshold
= threshold; = threshold;
/* Base nodestat threshold on the largest populated zone. */
pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
= max(threshold, pgdat_threshold);
}
/* /*
* Only set percpu_drift_mark if there is a danger that * Only set percpu_drift_mark if there is a danger that
* NR_FREE_PAGES reports the low watermark is ok when in fact * NR_FREE_PAGES reports the low watermark is ok when in fact
@ -238,6 +257,26 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
} }
EXPORT_SYMBOL(__mod_zone_page_state); EXPORT_SYMBOL(__mod_zone_page_state);
void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
long delta)
{
struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
s8 __percpu *p = pcp->vm_node_stat_diff + item;
long x;
long t;
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(x > t || x < -t)) {
node_page_state_add(x, pgdat, item);
x = 0;
}
__this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_node_page_state);
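
__mod_node_page_state() above accumulates small deltas in a per-cpu s8 and folds them into the global atomic only once the running value crosses the stat threshold, which is why readers can briefly see a stale total. A minimal userspace sketch of that batching follows, with a plain array faking the per-cpu storage and a fixed THRESHOLD in place of pcp->stat_threshold; none of the names are kernel API.

#include <stdio.h>

#define NR_CPUS   4
#define THRESHOLD 8             /* stands in for pcp->stat_threshold */

static long global_count;               /* pgdat->vm_stat[item] */
static signed char cpu_diff[NR_CPUS];   /* pcp->vm_node_stat_diff[item] */

static void mod_state(int cpu, long delta)
{
    long x = delta + cpu_diff[cpu];

    if (x > THRESHOLD || x < -THRESHOLD) {
        global_count += x;      /* fold the whole running delta */
        x = 0;
    }
    cpu_diff[cpu] = (signed char)x;
}

/* A reader only sees what has been folded so far. */
static long read_state(void)
{
    return global_count;
}

int main(void)
{
    long pending = 0;

    for (int i = 0; i < 40; i++)
        mod_state(i % NR_CPUS, +1);

    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        pending += cpu_diff[cpu];

    printf("true value 40, reader sees %ld, %ld still per-cpu\n",
           read_state(), pending);
    return 0;
}
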
/* /*
* Optimized increment and decrement functions. * Optimized increment and decrement functions.
* *
@ -277,12 +316,34 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
} }
} }
void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
s8 __percpu *p = pcp->vm_node_stat_diff + item;
s8 v, t;
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
s8 overstep = t >> 1;
node_page_state_add(v + overstep, pgdat, item);
__this_cpu_write(*p, -overstep);
}
}
void __inc_zone_page_state(struct page *page, enum zone_stat_item item) void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{ {
__inc_zone_state(page_zone(page), item); __inc_zone_state(page_zone(page), item);
} }
EXPORT_SYMBOL(__inc_zone_page_state); EXPORT_SYMBOL(__inc_zone_page_state);
void __inc_node_page_state(struct page *page, enum node_stat_item item)
{
__inc_node_state(page_pgdat(page), item);
}
EXPORT_SYMBOL(__inc_node_page_state);
void __dec_zone_state(struct zone *zone, enum zone_stat_item item) void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{ {
struct per_cpu_pageset __percpu *pcp = zone->pageset; struct per_cpu_pageset __percpu *pcp = zone->pageset;
@ -299,12 +360,34 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
} }
} }
void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
s8 __percpu *p = pcp->vm_node_stat_diff + item;
s8 v, t;
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
s8 overstep = t >> 1;
node_page_state_add(v - overstep, pgdat, item);
__this_cpu_write(*p, overstep);
}
}
void __dec_zone_page_state(struct page *page, enum zone_stat_item item) void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{ {
__dec_zone_state(page_zone(page), item); __dec_zone_state(page_zone(page), item);
} }
EXPORT_SYMBOL(__dec_zone_page_state); EXPORT_SYMBOL(__dec_zone_page_state);
void __dec_node_page_state(struct page *page, enum node_stat_item item)
{
__dec_node_state(page_pgdat(page), item);
}
EXPORT_SYMBOL(__dec_node_page_state);
#ifdef CONFIG_HAVE_CMPXCHG_LOCAL #ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/* /*
* If we have cmpxchg_local support then we do not need to incur the overhead * If we have cmpxchg_local support then we do not need to incur the overhead
@ -318,8 +401,8 @@ EXPORT_SYMBOL(__dec_zone_page_state);
* 1 Overstepping half of threshold * 1 Overstepping half of threshold
* -1 Overstepping minus half of threshold * -1 Overstepping minus half of threshold
*/ */
static inline void mod_state(struct zone *zone, enum zone_stat_item item, static inline void mod_zone_state(struct zone *zone,
long delta, int overstep_mode) enum zone_stat_item item, long delta, int overstep_mode)
{ {
struct per_cpu_pageset __percpu *pcp = zone->pageset; struct per_cpu_pageset __percpu *pcp = zone->pageset;
s8 __percpu *p = pcp->vm_stat_diff + item; s8 __percpu *p = pcp->vm_stat_diff + item;
@ -359,26 +442,83 @@ static inline void mod_state(struct zone *zone, enum zone_stat_item item,
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
long delta) long delta)
{ {
mod_state(zone, item, delta, 0); mod_zone_state(zone, item, delta, 0);
} }
EXPORT_SYMBOL(mod_zone_page_state); EXPORT_SYMBOL(mod_zone_page_state);
void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
mod_state(zone, item, 1, 1);
}
void inc_zone_page_state(struct page *page, enum zone_stat_item item) void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{ {
mod_state(page_zone(page), item, 1, 1); mod_zone_state(page_zone(page), item, 1, 1);
} }
EXPORT_SYMBOL(inc_zone_page_state); EXPORT_SYMBOL(inc_zone_page_state);
void dec_zone_page_state(struct page *page, enum zone_stat_item item) void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{ {
mod_state(page_zone(page), item, -1, -1); mod_zone_state(page_zone(page), item, -1, -1);
} }
EXPORT_SYMBOL(dec_zone_page_state); EXPORT_SYMBOL(dec_zone_page_state);
static inline void mod_node_state(struct pglist_data *pgdat,
enum node_stat_item item, int delta, int overstep_mode)
{
struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
s8 __percpu *p = pcp->vm_node_stat_diff + item;
long o, n, t, z;
do {
z = 0; /* overflow to node counters */
/*
* The fetching of the stat_threshold is racy. We may apply
* a counter threshold to the wrong cpu if we get
* rescheduled while executing here. However, the next
* counter update will apply the threshold again and
* therefore bring the counter under the threshold again.
*
* Most of the time the thresholds are the same anyways
* for all cpus in a node.
*/
t = this_cpu_read(pcp->stat_threshold);
o = this_cpu_read(*p);
n = delta + o;
if (n > t || n < -t) {
int os = overstep_mode * (t >> 1);
/* Overflow must be added to node counters */
z = n + os;
n = -os;
}
} while (this_cpu_cmpxchg(*p, o, n) != o);
if (z)
node_page_state_add(z, pgdat, item);
}
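
mod_node_state() above is the lockless variant: it retries with this_cpu_cmpxchg() instead of disabling interrupts, and when the threshold is crossed it folds the overflow plus half a threshold (the "overstep") so the per-cpu counter restarts half a threshold away from the next fold. Below is a sketch of the same loop using C11 atomics and a single slot standing in for the current cpu's diff; names and sizes are illustrative, not the kernel's.

#include <stdio.h>
#include <stdatomic.h>

#define THRESHOLD 8

static long global_count;               /* folded node counter */
static _Atomic signed char cpu_diff;    /* this cpu's vm_node_stat_diff */

static void mod_state(long delta, int overstep_mode)
{
    signed char o, n;
    long z;

    do {
        z = 0;
        o = atomic_load(&cpu_diff);
        n = (signed char)(delta + o);
        if (n > THRESHOLD || n < -THRESHOLD) {
            long os = overstep_mode * (THRESHOLD >> 1);

            /* Fold the overflow, leave -os behind so the counter sits
             * half a threshold away from the next fold. */
            z = n + os;
            n = (signed char)-os;
        }
    } while (!atomic_compare_exchange_weak(&cpu_diff, &o, n));

    if (z)
        global_count += z;
}

int main(void)
{
    for (int i = 0; i < 12; i++)
        mod_state(+1, 1);               /* inc_node_state() style updates */

    printf("folded %ld, still per-cpu %d (true value 12)\n",
           global_count, (int)atomic_load(&cpu_diff));
    return 0;
}
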
void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
long delta)
{
mod_node_state(pgdat, item, delta, 0);
}
EXPORT_SYMBOL(mod_node_page_state);
void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
mod_node_state(pgdat, item, 1, 1);
}
void inc_node_page_state(struct page *page, enum node_stat_item item)
{
mod_node_state(page_pgdat(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_node_page_state);
void dec_node_page_state(struct page *page, enum node_stat_item item)
{
mod_node_state(page_pgdat(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_node_page_state);
#else #else
/* /*
* Use interrupt disable to serialize counter updates * Use interrupt disable to serialize counter updates
@ -394,15 +534,6 @@ void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
} }
EXPORT_SYMBOL(mod_zone_page_state); EXPORT_SYMBOL(mod_zone_page_state);
void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
unsigned long flags;
local_irq_save(flags);
__inc_zone_state(zone, item);
local_irq_restore(flags);
}
void inc_zone_page_state(struct page *page, enum zone_stat_item item) void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{ {
unsigned long flags; unsigned long flags;
@ -424,21 +555,69 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
local_irq_restore(flags); local_irq_restore(flags);
} }
EXPORT_SYMBOL(dec_zone_page_state); EXPORT_SYMBOL(dec_zone_page_state);
#endif
void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
unsigned long flags;
local_irq_save(flags);
__inc_node_state(pgdat, item);
local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_node_state);
void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
long delta)
{
unsigned long flags;
local_irq_save(flags);
__mod_node_page_state(pgdat, item, delta);
local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_node_page_state);
void inc_node_page_state(struct page *page, enum node_stat_item item)
{
unsigned long flags;
struct pglist_data *pgdat;
pgdat = page_pgdat(page);
local_irq_save(flags);
__inc_node_state(pgdat, item);
local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_node_page_state);
void dec_node_page_state(struct page *page, enum node_stat_item item)
{
unsigned long flags;
local_irq_save(flags);
__dec_node_page_state(page, item);
local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_node_page_state);
#endif
/* /*
* Fold a differential into the global counters. * Fold a differential into the global counters.
* Returns the number of counters updated. * Returns the number of counters updated.
*/ */
static int fold_diff(int *diff) static int fold_diff(int *zone_diff, int *node_diff)
{ {
int i; int i;
int changes = 0; int changes = 0;
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
if (diff[i]) { if (zone_diff[i]) {
atomic_long_add(diff[i], &vm_stat[i]); atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
changes++;
}
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
if (node_diff[i]) {
atomic_long_add(node_diff[i], &vm_node_stat[i]);
changes++; changes++;
} }
return changes; return changes;
@ -462,9 +641,11 @@ static int fold_diff(int *diff)
*/ */
static int refresh_cpu_vm_stats(bool do_pagesets) static int refresh_cpu_vm_stats(bool do_pagesets)
{ {
struct pglist_data *pgdat;
struct zone *zone; struct zone *zone;
int i; int i;
int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
int changes = 0; int changes = 0;
for_each_populated_zone(zone) { for_each_populated_zone(zone) {
@ -477,7 +658,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
if (v) { if (v) {
atomic_long_add(v, &zone->vm_stat[i]); atomic_long_add(v, &zone->vm_stat[i]);
global_diff[i] += v; global_zone_diff[i] += v;
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
/* 3 seconds idle till flush */ /* 3 seconds idle till flush */
__this_cpu_write(p->expire, 3); __this_cpu_write(p->expire, 3);
@ -516,7 +697,22 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
} }
#endif #endif
} }
changes += fold_diff(global_diff);
for_each_online_pgdat(pgdat) {
struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
int v;
v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
if (v) {
atomic_long_add(v, &pgdat->vm_stat[i]);
global_node_diff[i] += v;
}
}
}
changes += fold_diff(global_zone_diff, global_node_diff);
return changes; return changes;
} }
@ -527,9 +723,11 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
*/ */
void cpu_vm_stats_fold(int cpu) void cpu_vm_stats_fold(int cpu)
{ {
struct pglist_data *pgdat;
struct zone *zone; struct zone *zone;
int i; int i;
int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
for_each_populated_zone(zone) { for_each_populated_zone(zone) {
struct per_cpu_pageset *p; struct per_cpu_pageset *p;
@ -543,11 +741,27 @@ void cpu_vm_stats_fold(int cpu)
v = p->vm_stat_diff[i]; v = p->vm_stat_diff[i];
p->vm_stat_diff[i] = 0; p->vm_stat_diff[i] = 0;
atomic_long_add(v, &zone->vm_stat[i]); atomic_long_add(v, &zone->vm_stat[i]);
global_diff[i] += v; global_zone_diff[i] += v;
} }
} }
fold_diff(global_diff); for_each_online_pgdat(pgdat) {
struct per_cpu_nodestat *p;
p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
if (p->vm_node_stat_diff[i]) {
int v;
v = p->vm_node_stat_diff[i];
p->vm_node_stat_diff[i] = 0;
atomic_long_add(v, &pgdat->vm_stat[i]);
global_node_diff[i] += v;
}
}
fold_diff(global_zone_diff, global_node_diff);
} }
/* /*
@ -563,16 +777,19 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
int v = pset->vm_stat_diff[i]; int v = pset->vm_stat_diff[i];
pset->vm_stat_diff[i] = 0; pset->vm_stat_diff[i] = 0;
atomic_long_add(v, &zone->vm_stat[i]); atomic_long_add(v, &zone->vm_stat[i]);
atomic_long_add(v, &vm_stat[i]); atomic_long_add(v, &vm_zone_stat[i]);
} }
} }
#endif #endif
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
/* /*
* Determine the per node value of a stat item. * Determine the per node value of a stat item. This function
* is called frequently in a NUMA machine, so try to be as
* frugal as possible.
*/ */
unsigned long node_page_state(int node, enum zone_stat_item item) unsigned long sum_zone_node_page_state(int node,
enum zone_stat_item item)
{ {
struct zone *zones = NODE_DATA(node)->node_zones; struct zone *zones = NODE_DATA(node)->node_zones;
int i; int i;
@ -584,6 +801,19 @@ unsigned long node_page_state(int node, enum zone_stat_item item)
return count; return count;
} }
/*
* Determine the per node value of a stat item.
*/
unsigned long node_page_state(struct pglist_data *pgdat,
enum node_stat_item item)
{
long x = atomic_long_read(&pgdat->vm_stat[item]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
#endif
return x;
}
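
node_page_state() above clamps negative readings to zero on SMP because the per-cpu deltas can fold a decrement before the matching increment. The contrived numbers in this sketch produce a folded total of -2 while the true count is 2; THRESHOLD and the two fake cpus are assumptions of the demo, not kernel values.

#include <stdio.h>

#define THRESHOLD 4

static long global_count = 8;   /* pages folded in some time ago */
static int cpu_diff[2];

static void mod(int cpu, int delta)
{
    int x = cpu_diff[cpu] + delta;

    if (x > THRESHOLD || x < -THRESHOLD) {
        global_count += x;
        x = 0;
    }
    cpu_diff[cpu] = x;
}

static long node_page_state(void)
{
    long x = global_count;

    return x < 0 ? 0 : x;       /* the SMP clamp in the hunk above */
}

int main(void)
{
    for (int i = 0; i < 4; i++)
        mod(0, +1);             /* 4 new pages, still buffered on cpu 0 */
    for (int i = 0; i < 10; i++)
        mod(1, -1);             /* 10 pages freed, folds twice on cpu 1 */

    /* true value is 8 + 4 - 10 = 2, but the folded total is negative */
    printf("raw %ld, reported %ld\n", global_count, node_page_state());
    return 0;
}
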
#endif #endif
#ifdef CONFIG_COMPACTION #ifdef CONFIG_COMPACTION
@ -691,33 +921,18 @@ int fragmentation_index(struct zone *zone, unsigned int order)
const char * const vmstat_text[] = { const char * const vmstat_text[] = {
/* enum zone_stat_item counters */ /* enum zone_stat_item counters */
"nr_free_pages", "nr_free_pages",
"nr_alloc_batch", "nr_zone_inactive_anon",
"nr_inactive_anon", "nr_zone_active_anon",
"nr_active_anon", "nr_zone_inactive_file",
"nr_inactive_file", "nr_zone_active_file",
"nr_active_file", "nr_zone_unevictable",
"nr_unevictable", "nr_zone_write_pending",
"nr_mlock", "nr_mlock",
"nr_anon_pages",
"nr_mapped",
"nr_file_pages",
"nr_dirty",
"nr_writeback",
"nr_slab_reclaimable", "nr_slab_reclaimable",
"nr_slab_unreclaimable", "nr_slab_unreclaimable",
"nr_page_table_pages", "nr_page_table_pages",
"nr_kernel_stack", "nr_kernel_stack",
"nr_unstable",
"nr_bounce", "nr_bounce",
"nr_vmscan_write",
"nr_vmscan_immediate_reclaim",
"nr_writeback_temp",
"nr_isolated_anon",
"nr_isolated_file",
"nr_shmem",
"nr_dirtied",
"nr_written",
"nr_pages_scanned",
#if IS_ENABLED(CONFIG_ZSMALLOC) #if IS_ENABLED(CONFIG_ZSMALLOC)
"nr_zspages", "nr_zspages",
#endif #endif
@ -729,13 +944,35 @@ const char * const vmstat_text[] = {
"numa_local", "numa_local",
"numa_other", "numa_other",
#endif #endif
"nr_free_cma",
/* Node-based counters */
"nr_inactive_anon",
"nr_active_anon",
"nr_inactive_file",
"nr_active_file",
"nr_unevictable",
"nr_isolated_anon",
"nr_isolated_file",
"nr_pages_scanned",
"workingset_refault", "workingset_refault",
"workingset_activate", "workingset_activate",
"workingset_nodereclaim", "workingset_nodereclaim",
"nr_anon_transparent_hugepages", "nr_anon_pages",
"nr_mapped",
"nr_file_pages",
"nr_dirty",
"nr_writeback",
"nr_writeback_temp",
"nr_shmem",
"nr_shmem_hugepages", "nr_shmem_hugepages",
"nr_shmem_pmdmapped", "nr_shmem_pmdmapped",
"nr_free_cma", "nr_anon_transparent_hugepages",
"nr_unstable",
"nr_vmscan_write",
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
"nr_written",
/* enum writeback_stat_item counters */ /* enum writeback_stat_item counters */
"nr_dirty_threshold", "nr_dirty_threshold",
@ -749,6 +986,8 @@ const char * const vmstat_text[] = {
"pswpout", "pswpout",
TEXTS_FOR_ZONES("pgalloc") TEXTS_FOR_ZONES("pgalloc")
TEXTS_FOR_ZONES("allocstall")
TEXTS_FOR_ZONES("pgskip")
"pgfree", "pgfree",
"pgactivate", "pgactivate",
@ -758,11 +997,11 @@ const char * const vmstat_text[] = {
"pgmajfault", "pgmajfault",
"pglazyfreed", "pglazyfreed",
TEXTS_FOR_ZONES("pgrefill") "pgrefill",
TEXTS_FOR_ZONES("pgsteal_kswapd") "pgsteal_kswapd",
TEXTS_FOR_ZONES("pgsteal_direct") "pgsteal_direct",
TEXTS_FOR_ZONES("pgscan_kswapd") "pgscan_kswapd",
TEXTS_FOR_ZONES("pgscan_direct") "pgscan_direct",
"pgscan_direct_throttle", "pgscan_direct_throttle",
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
@ -774,7 +1013,6 @@ const char * const vmstat_text[] = {
"kswapd_low_wmark_hit_quickly", "kswapd_low_wmark_hit_quickly",
"kswapd_high_wmark_hit_quickly", "kswapd_high_wmark_hit_quickly",
"pageoutrun", "pageoutrun",
"allocstall",
"pgrotated", "pgrotated",
@ -1180,17 +1418,41 @@ static const struct file_operations pagetypeinfo_file_ops = {
.release = seq_release, .release = seq_release,
}; };
static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
{
int zid;
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
struct zone *compare = &pgdat->node_zones[zid];
if (populated_zone(compare))
return zone == compare;
}
/* The zone must be somewhere! */
WARN_ON_ONCE(1);
return false;
}
static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone) struct zone *zone)
{ {
int i; int i;
seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
if (is_zone_first_populated(pgdat, zone)) {
seq_printf(m, "\n per-node stats");
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
seq_printf(m, "\n %-12s %lu",
vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
node_page_state(pgdat, i));
}
}
seq_printf(m, seq_printf(m,
"\n pages free %lu" "\n pages free %lu"
"\n min %lu" "\n min %lu"
"\n low %lu" "\n low %lu"
"\n high %lu" "\n high %lu"
"\n scanned %lu" "\n node_scanned %lu"
"\n spanned %lu" "\n spanned %lu"
"\n present %lu" "\n present %lu"
"\n managed %lu", "\n managed %lu",
@ -1198,13 +1460,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
min_wmark_pages(zone), min_wmark_pages(zone),
low_wmark_pages(zone), low_wmark_pages(zone),
high_wmark_pages(zone), high_wmark_pages(zone),
zone_page_state(zone, NR_PAGES_SCANNED), node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED),
zone->spanned_pages, zone->spanned_pages,
zone->present_pages, zone->present_pages,
zone->managed_pages); zone->managed_pages);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
seq_printf(m, "\n %-12s %lu", vmstat_text[i], seq_printf(m, "\n %-12s %lu", vmstat_text[i],
zone_page_state(zone, i)); zone_page_state(zone, i));
seq_printf(m, seq_printf(m,
@ -1234,12 +1496,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
#endif #endif
} }
seq_printf(m, seq_printf(m,
"\n all_unreclaimable: %u" "\n node_unreclaimable: %u"
"\n start_pfn: %lu" "\n start_pfn: %lu"
"\n inactive_ratio: %u", "\n node_inactive_ratio: %u",
!zone_reclaimable(zone), !pgdat_reclaimable(zone->zone_pgdat),
zone->zone_start_pfn, zone->zone_start_pfn,
zone->inactive_ratio); zone->zone_pgdat->inactive_ratio);
seq_putc(m, '\n'); seq_putc(m, '\n');
} }
@ -1287,6 +1549,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
if (*pos >= ARRAY_SIZE(vmstat_text)) if (*pos >= ARRAY_SIZE(vmstat_text))
return NULL; return NULL;
stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
#ifdef CONFIG_VM_EVENT_COUNTERS #ifdef CONFIG_VM_EVENT_COUNTERS
@ -1301,6 +1564,10 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
v[i] = global_page_state(i); v[i] = global_page_state(i);
v += NR_VM_ZONE_STAT_ITEMS; v += NR_VM_ZONE_STAT_ITEMS;
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
v[i] = global_node_page_state(i);
v += NR_VM_NODE_STAT_ITEMS;
global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
v + NR_DIRTY_THRESHOLD); v + NR_DIRTY_THRESHOLD);
v += NR_VM_WRITEBACK_STAT_ITEMS; v += NR_VM_WRITEBACK_STAT_ITEMS;
@ -1325,7 +1592,6 @@ static int vmstat_show(struct seq_file *m, void *arg)
{ {
unsigned long *l = arg; unsigned long *l = arg;
unsigned long off = l - (unsigned long *)m->private; unsigned long off = l - (unsigned long *)m->private;
seq_printf(m, "%s %lu\n", vmstat_text[off], *l); seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
return 0; return 0;
} }
@ -1390,13 +1656,12 @@ int vmstat_refresh(struct ctl_table *table, int write,
if (err) if (err)
return err; return err;
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
val = atomic_long_read(&vm_stat[i]); val = atomic_long_read(&vm_zone_stat[i]);
if (val < 0) { if (val < 0) {
switch (i) { switch (i) {
case NR_ALLOC_BATCH:
case NR_PAGES_SCANNED: case NR_PAGES_SCANNED:
/* /*
* These are often seen to go negative in * This is often seen to go negative in
* recent kernels, but not to go permanently * recent kernels, but not to go permanently
* negative. Whilst it would be nicer not to * negative. Whilst it would be nicer not to
* have exceptions, rooting them out would be * have exceptions, rooting them out would be


@@ -16,7 +16,7 @@
/* /*
* Double CLOCK lists * Double CLOCK lists
* *
* Per zone, two clock lists are maintained for file pages: the * Per node, two clock lists are maintained for file pages: the
* inactive and the active list. Freshly faulted pages start out at * inactive and the active list. Freshly faulted pages start out at
* the head of the inactive list and page reclaim scans pages from the * the head of the inactive list and page reclaim scans pages from the
* tail. Pages that are accessed multiple times on the inactive list * tail. Pages that are accessed multiple times on the inactive list
@@ -141,11 +141,11 @@
* *
* Implementation * Implementation
* *
* For each zone's file LRU lists, a counter for inactive evictions * For each node's file LRU lists, a counter for inactive evictions
* and activations is maintained (zone->inactive_age). * and activations is maintained (node->inactive_age).
* *
* On eviction, a snapshot of this counter (along with some bits to * On eviction, a snapshot of this counter (along with some bits to
* identify the zone) is stored in the now empty page cache radix tree * identify the node) is stored in the now empty page cache radix tree
* slot of the evicted page. This is called a shadow entry. * slot of the evicted page. This is called a shadow entry.
* *
* On cache misses for which there are shadow entries, an eligible * On cache misses for which there are shadow entries, an eligible
@@ -153,7 +153,7 @@
*/ */
#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
ZONES_SHIFT + NODES_SHIFT + \ NODES_SHIFT + \
MEM_CGROUP_ID_SHIFT) MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
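Dropping ZONES_SHIFT from EVICTION_SHIFT leaves more of the radix-tree slot for the per-lruvec eviction counter. A standalone back-of-the-envelope sketch; the shift values below are assumed examples, not taken from any particular config:

#include <stdio.h>

#define EXCEPTIONAL_BITS 2      /* low bits used by the exceptional entry */
#define NODES_SHIFT 10          /* assumed CONFIG_NODES_SHIFT */
#define MEMCG_ID_SHIFT 16       /* MEM_CGROUP_ID_SHIFT */
#define ZONES_SHIFT 2           /* only the old packing spent these bits */

int main(void)
{
        int bits = sizeof(unsigned long) * 8;   /* 64 on a 64-bit build */

        printf("eviction bits, old packing: %d\n",
               bits - (EXCEPTIONAL_BITS + ZONES_SHIFT + NODES_SHIFT + MEMCG_ID_SHIFT));
        printf("eviction bits, new packing: %d\n",
               bits - (EXCEPTIONAL_BITS + NODES_SHIFT + MEMCG_ID_SHIFT));
        return 0;
}

On such a build the counter gains ZONES_SHIFT extra bits, so shadow entries take correspondingly longer to wrap.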
@@ -167,33 +167,30 @@
*/ */
static unsigned int bucket_order __read_mostly; static unsigned int bucket_order __read_mostly;
static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction) static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
{ {
eviction >>= bucket_order; eviction >>= bucket_order;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
} }
static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep, static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
unsigned long *evictionp) unsigned long *evictionp)
{ {
unsigned long entry = (unsigned long)shadow; unsigned long entry = (unsigned long)shadow;
int memcgid, nid, zid; int memcgid, nid;
entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
zid = entry & ((1UL << ZONES_SHIFT) - 1);
entry >>= ZONES_SHIFT;
nid = entry & ((1UL << NODES_SHIFT) - 1); nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT; entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
entry >>= MEM_CGROUP_ID_SHIFT; entry >>= MEM_CGROUP_ID_SHIFT;
*memcgidp = memcgid; *memcgidp = memcgid;
*zonep = NODE_DATA(nid)->node_zones + zid; *pgdat = NODE_DATA(nid);
*evictionp = entry << bucket_order; *evictionp = entry << bucket_order;
} }
@@ -208,7 +205,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
void *workingset_eviction(struct address_space *mapping, struct page *page) void *workingset_eviction(struct address_space *mapping, struct page *page)
{ {
struct mem_cgroup *memcg = page_memcg(page); struct mem_cgroup *memcg = page_memcg(page);
struct zone *zone = page_zone(page); struct pglist_data *pgdat = page_pgdat(page);
int memcgid = mem_cgroup_id(memcg); int memcgid = mem_cgroup_id(memcg);
unsigned long eviction; unsigned long eviction;
struct lruvec *lruvec; struct lruvec *lruvec;
@@ -218,9 +215,9 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page);
lruvec = mem_cgroup_zone_lruvec(zone, memcg); lruvec = mem_cgroup_lruvec(pgdat, memcg);
eviction = atomic_long_inc_return(&lruvec->inactive_age); eviction = atomic_long_inc_return(&lruvec->inactive_age);
return pack_shadow(memcgid, zone, eviction); return pack_shadow(memcgid, pgdat, eviction);
} }
/** /**
@@ -228,7 +225,7 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
* @shadow: shadow entry of the evicted page * @shadow: shadow entry of the evicted page
* *
* Calculates and evaluates the refault distance of the previously * Calculates and evaluates the refault distance of the previously
* evicted page in the context of the zone it was allocated in. * evicted page in the context of the node it was allocated in.
* *
* Returns %true if the page should be activated, %false otherwise. * Returns %true if the page should be activated, %false otherwise.
*/ */
@@ -240,10 +237,10 @@ bool workingset_refault(void *shadow)
unsigned long eviction; unsigned long eviction;
struct lruvec *lruvec; struct lruvec *lruvec;
unsigned long refault; unsigned long refault;
struct zone *zone; struct pglist_data *pgdat;
int memcgid; int memcgid;
unpack_shadow(shadow, &memcgid, &zone, &eviction); unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
rcu_read_lock(); rcu_read_lock();
/* /*
@@ -267,7 +264,7 @@ bool workingset_refault(void *shadow)
rcu_read_unlock(); rcu_read_unlock();
return false; return false;
} }
lruvec = mem_cgroup_zone_lruvec(zone, memcg); lruvec = mem_cgroup_lruvec(pgdat, memcg);
refault = atomic_long_read(&lruvec->inactive_age); refault = atomic_long_read(&lruvec->inactive_age);
active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
rcu_read_unlock(); rcu_read_unlock();
@@ -290,10 +287,10 @@ bool workingset_refault(void *shadow)
*/ */
refault_distance = (refault - eviction) & EVICTION_MASK; refault_distance = (refault - eviction) & EVICTION_MASK;
inc_zone_state(zone, WORKINGSET_REFAULT); inc_node_state(pgdat, WORKINGSET_REFAULT);
if (refault_distance <= active_file) { if (refault_distance <= active_file) {
inc_zone_state(zone, WORKINGSET_ACTIVATE); inc_node_state(pgdat, WORKINGSET_ACTIVATE);
return true; return true;
} }
return false; return false;
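refault_distance is computed with an unsigned subtraction masked by EVICTION_MASK, so the comparison stays meaningful even after the packed inactive_age snapshot wraps. A tiny standalone sketch of that arithmetic, with an EVICTION_SHIFT value assumed purely for illustration:

#include <stdio.h>

#define EVICTION_SHIFT 28                       /* assumed example value */
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)

static unsigned long refault_distance(unsigned long eviction,
                                      unsigned long refault)
{
        /* Unsigned wraparound plus the mask keep the delta correct. */
        return (refault - eviction) & EVICTION_MASK;
}

int main(void)
{
        /* Snapshot taken just before the truncated counter wrapped ... */
        unsigned long eviction = EVICTION_MASK - 5;
        /* ... and the refault happens shortly after the wrap. */
        unsigned long refault = 10;

        printf("%lu\n", refault_distance(eviction, refault));  /* prints 16 */
        return 0;
}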
@@ -305,9 +302,10 @@ bool workingset_refault(void *shadow)
*/ */
void workingset_activation(struct page *page) void workingset_activation(struct page *page)
{ {
struct mem_cgroup *memcg;
struct lruvec *lruvec; struct lruvec *lruvec;
lock_page_memcg(page); rcu_read_lock();
/* /*
* Filter non-memcg pages here, e.g. unmap can call * Filter non-memcg pages here, e.g. unmap can call
* mark_page_accessed() on VDSO pages. * mark_page_accessed() on VDSO pages.
@@ -315,12 +313,13 @@ void workingset_activation(struct page *page)
* XXX: See workingset_refault() - this should return * XXX: See workingset_refault() - this should return
* root_mem_cgroup even for !CONFIG_MEMCG. * root_mem_cgroup even for !CONFIG_MEMCG.
*/ */
if (!mem_cgroup_disabled() && !page_memcg(page)) memcg = page_memcg_rcu(page);
if (!mem_cgroup_disabled() && !memcg)
goto out; goto out;
lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page)); lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
atomic_long_inc(&lruvec->inactive_age); atomic_long_inc(&lruvec->inactive_age);
out: out:
unlock_page_memcg(page); rcu_read_unlock();
} }
/* /*
@@ -349,12 +348,13 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
local_irq_enable(); local_irq_enable();
if (memcg_kmem_enabled()) if (memcg_kmem_enabled()) {
pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
LRU_ALL_FILE); LRU_ALL_FILE);
else } else {
pages = node_page_state(sc->nid, NR_ACTIVE_FILE) + pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
node_page_state(sc->nid, NR_INACTIVE_FILE); node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
}
/* /*
* Active cache pages are limited to 50% of memory, and shadow * Active cache pages are limited to 50% of memory, and shadow
@@ -433,7 +433,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
} }
} }
BUG_ON(node->count); BUG_ON(node->count);
inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM); inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
if (!__radix_tree_delete_node(&mapping->page_tree, node)) if (!__radix_tree_delete_node(&mapping->page_tree, node))
BUG(); BUG();


@@ -20,6 +20,7 @@
* page->freelist(index): links together all component pages of a zspage * page->freelist(index): links together all component pages of a zspage
* For the huge page, this is always 0, so we use this field * For the huge page, this is always 0, so we use this field
* to store handle. * to store handle.
* page->units: first object offset in a subpage of zspage
* *
* Usage of struct page flags: * Usage of struct page flags:
* PG_private: identifies the first component page * PG_private: identifies the first component page
@@ -137,9 +138,6 @@
*/ */
#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS) #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS)
/*
* We do not maintain any list for completely empty or full pages
*/
enum fullness_group { enum fullness_group {
ZS_EMPTY, ZS_EMPTY,
ZS_ALMOST_EMPTY, ZS_ALMOST_EMPTY,
@@ -467,11 +465,6 @@ static struct zpool_driver zs_zpool_driver = {
MODULE_ALIAS("zpool-zsmalloc"); MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */ #endif /* CONFIG_ZPOOL */
static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
{
return pages_per_zspage * PAGE_SIZE / size;
}
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area); static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
@@ -635,8 +628,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
freeable = zs_can_compact(class); freeable = zs_can_compact(class);
spin_unlock(&class->lock); spin_unlock(&class->lock);
objs_per_zspage = get_maxobj_per_zspage(class->size, objs_per_zspage = class->objs_per_zspage;
class->pages_per_zspage);
pages_used = obj_allocated / objs_per_zspage * pages_used = obj_allocated / objs_per_zspage *
class->pages_per_zspage; class->pages_per_zspage;
@@ -945,8 +937,8 @@ static void unpin_tag(unsigned long handle)
static void reset_page(struct page *page) static void reset_page(struct page *page)
{ {
__ClearPageMovable(page); __ClearPageMovable(page);
clear_bit(PG_private, &page->flags); ClearPagePrivate(page);
clear_bit(PG_private_2, &page->flags); ClearPagePrivate2(page);
set_page_private(page, 0); set_page_private(page, 0);
page_mapcount_reset(page); page_mapcount_reset(page);
ClearPageHugeObject(page); ClearPageHugeObject(page);
@@ -1014,8 +1006,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
cache_free_zspage(pool, zspage); cache_free_zspage(pool, zspage);
zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage);
class->size, class->pages_per_zspage));
atomic_long_sub(class->pages_per_zspage, atomic_long_sub(class->pages_per_zspage,
&pool->pages_allocated); &pool->pages_allocated);
} }
@@ -1350,7 +1341,7 @@ static void zs_unregister_cpu_notifier(void)
cpu_notifier_register_done(); cpu_notifier_register_done();
} }
static void init_zs_size_classes(void) static void __init init_zs_size_classes(void)
{ {
int nr; int nr;
@@ -1361,16 +1352,14 @@ static void init_zs_size_classes(void)
zs_size_classes = nr; zs_size_classes = nr;
} }
static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) static bool can_merge(struct size_class *prev, int pages_per_zspage,
int objs_per_zspage)
{ {
if (prev->pages_per_zspage != pages_per_zspage) if (prev->pages_per_zspage == pages_per_zspage &&
return false; prev->objs_per_zspage == objs_per_zspage)
return true;
if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage) return false;
!= get_maxobj_per_zspage(size, pages_per_zspage))
return false;
return true;
} }
static bool zspage_full(struct size_class *class, struct zspage *zspage) static bool zspage_full(struct size_class *class, struct zspage *zspage)
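With objs_per_zspage cached in struct size_class, can_merge() above reduces to comparing two precomputed fields. A standalone sketch of the underlying arithmetic, using made-up object sizes and PAGE_SIZE = 4096; the real function compares the cached fields of neighbouring classes rather than recomputing them:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/*
 * Two classes with the same zspage page count merge when a zspage
 * holds the same number of objects of either size.
 */
static bool sizes_can_merge(unsigned long size_a, unsigned long size_b,
                            unsigned long pages_per_zspage)
{
        unsigned long objs_a = pages_per_zspage * PAGE_SIZE / size_a;
        unsigned long objs_b = pages_per_zspage * PAGE_SIZE / size_b;

        return objs_a == objs_b;
}

int main(void)
{
        /* 2-page zspage: 8192/1040 = 7 objects, 8192/1072 = 7 objects */
        printf("%d\n", sizes_can_merge(1040, 1072, 2));        /* 1: merged */
        /* 8192/1072 = 7 objects vs 8192/1200 = 6 objects */
        printf("%d\n", sizes_can_merge(1072, 1200, 2));        /* 0: separate */
        return 0;
}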
@@ -1541,6 +1530,7 @@ static unsigned long obj_malloc(struct size_class *class,
* zs_malloc - Allocate block of given size from pool. * zs_malloc - Allocate block of given size from pool.
* @pool: pool to allocate from * @pool: pool to allocate from
* @size: size of block to allocate * @size: size of block to allocate
* @gfp: gfp flags when allocating object
* *
* On success, handle to the allocated object is returned, * On success, handle to the allocated object is returned,
* otherwise 0. * otherwise 0.
@@ -1592,8 +1582,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
record_obj(handle, obj); record_obj(handle, obj);
atomic_long_add(class->pages_per_zspage, atomic_long_add(class->pages_per_zspage,
&pool->pages_allocated); &pool->pages_allocated);
zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage);
class->size, class->pages_per_zspage));
/* We completely set up zspage so mark them as movable */ /* We completely set up zspage so mark them as movable */
SetZsPageMovable(pool, zspage); SetZsPageMovable(pool, zspage);
@@ -1741,10 +1730,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
* return handle. * return handle.
*/ */
static unsigned long find_alloced_obj(struct size_class *class, static unsigned long find_alloced_obj(struct size_class *class,
struct page *page, int index) struct page *page, int *obj_idx)
{ {
unsigned long head; unsigned long head;
int offset = 0; int offset = 0;
int index = *obj_idx;
unsigned long handle = 0; unsigned long handle = 0;
void *addr = kmap_atomic(page); void *addr = kmap_atomic(page);
@@ -1765,6 +1755,9 @@ static unsigned long find_alloced_obj(struct size_class *class,
} }
kunmap_atomic(addr); kunmap_atomic(addr);
*obj_idx = index;
return handle; return handle;
} }
@@ -1776,7 +1769,7 @@ struct zs_compact_control {
struct page *d_page; struct page *d_page;
/* Starting object index within @s_page which used for live object /* Starting object index within @s_page which used for live object
* in the subpage. */ * in the subpage. */
int index; int obj_idx;
}; };
static int migrate_zspage(struct zs_pool *pool, struct size_class *class, static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
@@ -1786,16 +1779,16 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
unsigned long handle; unsigned long handle;
struct page *s_page = cc->s_page; struct page *s_page = cc->s_page;
struct page *d_page = cc->d_page; struct page *d_page = cc->d_page;
unsigned long index = cc->index; int obj_idx = cc->obj_idx;
int ret = 0; int ret = 0;
while (1) { while (1) {
handle = find_alloced_obj(class, s_page, index); handle = find_alloced_obj(class, s_page, &obj_idx);
if (!handle) { if (!handle) {
s_page = get_next_page(s_page); s_page = get_next_page(s_page);
if (!s_page) if (!s_page)
break; break;
index = 0; obj_idx = 0;
continue; continue;
} }
@@ -1809,7 +1802,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
used_obj = handle_to_obj(handle); used_obj = handle_to_obj(handle);
free_obj = obj_malloc(class, get_zspage(d_page), handle); free_obj = obj_malloc(class, get_zspage(d_page), handle);
zs_object_copy(class, free_obj, used_obj); zs_object_copy(class, free_obj, used_obj);
index++; obj_idx++;
/* /*
* record_obj updates handle's value to free_obj and it will * record_obj updates handle's value to free_obj and it will
* invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
@@ -1824,7 +1817,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
/* Remember last position in this iteration */ /* Remember last position in this iteration */
cc->s_page = s_page; cc->s_page = s_page;
cc->index = index; cc->obj_idx = obj_idx;
return ret; return ret;
} }
@@ -2181,8 +2174,7 @@ static int zs_register_migration(struct zs_pool *pool)
static void zs_unregister_migration(struct zs_pool *pool) static void zs_unregister_migration(struct zs_pool *pool)
{ {
flush_work(&pool->free_work); flush_work(&pool->free_work);
if (pool->inode) iput(pool->inode);
iput(pool->inode);
} }
/* /*
@@ -2261,8 +2253,7 @@ static unsigned long zs_can_compact(struct size_class *class)
return 0; return 0;
obj_wasted = obj_allocated - obj_used; obj_wasted = obj_allocated - obj_used;
obj_wasted /= get_maxobj_per_zspage(class->size, obj_wasted /= class->objs_per_zspage;
class->pages_per_zspage);
return obj_wasted * class->pages_per_zspage; return obj_wasted * class->pages_per_zspage;
} }
@@ -2279,7 +2270,7 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
if (!zs_can_compact(class)) if (!zs_can_compact(class))
break; break;
cc.index = 0; cc.obj_idx = 0;
cc.s_page = get_first_page(src_zspage); cc.s_page = get_first_page(src_zspage);
while ((dst_zspage = isolate_zspage(class, false))) { while ((dst_zspage = isolate_zspage(class, false))) {
@@ -2398,7 +2389,7 @@ static int zs_register_shrinker(struct zs_pool *pool)
/** /**
* zs_create_pool - Creates an allocation pool to work from. * zs_create_pool - Creates an allocation pool to work from.
* @flags: allocation flags used to allocate pool metadata * @name: pool name to be created
* *
* This function must be called before anything when using * This function must be called before anything when using
* the zsmalloc allocator. * the zsmalloc allocator.
@@ -2438,6 +2429,7 @@ struct zs_pool *zs_create_pool(const char *name)
for (i = zs_size_classes - 1; i >= 0; i--) { for (i = zs_size_classes - 1; i >= 0; i--) {
int size; int size;
int pages_per_zspage; int pages_per_zspage;
int objs_per_zspage;
struct size_class *class; struct size_class *class;
int fullness = 0; int fullness = 0;
@@ -2445,6 +2437,7 @@ struct zs_pool *zs_create_pool(const char *name)
if (size > ZS_MAX_ALLOC_SIZE) if (size > ZS_MAX_ALLOC_SIZE)
size = ZS_MAX_ALLOC_SIZE; size = ZS_MAX_ALLOC_SIZE;
pages_per_zspage = get_pages_per_zspage(size); pages_per_zspage = get_pages_per_zspage(size);
objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;
/* /*
* size_class is used for normal zsmalloc operation such * size_class is used for normal zsmalloc operation such
@@ -2456,7 +2449,7 @@ struct zs_pool *zs_create_pool(const char *name)
* previous size_class if possible. * previous size_class if possible.
*/ */
if (prev_class) { if (prev_class) {
if (can_merge(prev_class, size, pages_per_zspage)) { if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) {
pool->size_class[i] = prev_class; pool->size_class[i] = prev_class;
continue; continue;
} }
@@ -2469,8 +2462,7 @@ struct zs_pool *zs_create_pool(const char *name)
class->size = size; class->size = size;
class->index = i; class->index = i;
class->pages_per_zspage = pages_per_zspage; class->pages_per_zspage = pages_per_zspage;
class->objs_per_zspage = class->pages_per_zspage * class->objs_per_zspage = objs_per_zspage;
PAGE_SIZE / class->size;
spin_lock_init(&class->lock); spin_lock_init(&class->lock);
pool->size_class[i] = class; pool->size_class[i] = class;
for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS; for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;


@@ -608,6 +608,7 @@ static const struct {
const char *compact; const char *compact;
} gfp_compact_table[] = { } gfp_compact_table[] = {
{ "GFP_TRANSHUGE", "THP" }, { "GFP_TRANSHUGE", "THP" },
{ "GFP_TRANSHUGE_LIGHT", "THL" },
{ "GFP_HIGHUSER_MOVABLE", "HUM" }, { "GFP_HIGHUSER_MOVABLE", "HUM" },
{ "GFP_HIGHUSER", "HU" }, { "GFP_HIGHUSER", "HU" },
{ "GFP_USER", "U" }, { "GFP_USER", "U" },