diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 77d1ba57d495..bdf597c4f0be 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -85,6 +85,7 @@ enum ttu_flags { TTU_UNMAP = 1, /* unmap mode */ TTU_MIGRATION = 2, /* migration mode */ TTU_MUNLOCK = 4, /* munlock mode */ + TTU_LZFREE = 8, /* lazy free mode */ TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ @@ -311,5 +312,6 @@ static inline int page_mkclean(struct page *page) #define SWAP_AGAIN 1 #define SWAP_FAIL 2 #define SWAP_MLOCK 3 +#define SWAP_LZFREE 4 #endif /* _LINUX_RMAP_H */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index e1f8c993e73b..67c1dbd19c6d 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -25,6 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC), PGFREE, PGACTIVATE, PGDEACTIVATE, PGFAULT, PGMAJFAULT, + PGLAZYFREED, FOR_ALL_ZONES(PGREFILL), FOR_ALL_ZONES(PGSTEAL_KSWAPD), FOR_ALL_ZONES(PGSTEAL_DIRECT), diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index a74dd84bbb6d..0e821e3c3d45 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -39,6 +39,7 @@ #define MADV_SEQUENTIAL 2 /* expect sequential page references */ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_DONTNEED 4 /* don't need these pages */ +#define MADV_FREE 5 /* free pages only if memory pressure */ /* common parameters: try to keep these consistent across architectures */ #define MADV_REMOVE 9 /* remove these pages & resources */ diff --git a/mm/madvise.c b/mm/madvise.c index c889fcbb530e..ed137fde4459 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -20,6 +20,9 @@ #include #include #include +#include + +#include /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -32,6 +35,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_FREE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ @@ -256,6 +260,163 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } +static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) + +{ + struct mmu_gather *tlb = walk->private; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = walk->vma; + spinlock_t *ptl; + pte_t *orig_pte, *pte, ptent; + struct page *page; + + split_huge_pmd(vma, pmd, addr); + if (pmd_trans_unstable(pmd)) + return 0; + + orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); + for (; addr != end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* + * If pmd isn't transhuge but the page is THP and + * is owned by only this process, split it and + * deactivate all pages. + */ + if (PageTransCompound(page)) { + if (page_mapcount(page) != 1) + goto out; + get_page(page); + if (!trylock_page(page)) { + put_page(page); + goto out; + } + pte_unmap_unlock(orig_pte, ptl); + if (split_huge_page(page)) { + unlock_page(page); + put_page(page); + pte_offset_map_lock(mm, pmd, addr, &ptl); + goto out; + } + put_page(page); + unlock_page(page); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte--; + addr -= PAGE_SIZE; + continue; + } + + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + if (PageSwapCache(page) || PageDirty(page)) { + if (!trylock_page(page)) + continue; + /* + * If page is shared with others, we couldn't clear + * PG_dirty of the page. + */ + if (page_mapcount(page) != 1) { + unlock_page(page); + continue; + } + + if (PageSwapCache(page) && !try_to_free_swap(page)) { + unlock_page(page); + continue; + } + + ClearPageDirty(page); + unlock_page(page); + } + + if (pte_young(ptent) || pte_dirty(ptent)) { + /* + * Some of architecture(ex, PPC) don't update TLB + * with set_pte_at and tlb_remove_tlb_entry so for + * the portability, remap the pte with old|clean + * after pte clearing. + */ + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + + ptent = pte_mkold(ptent); + ptent = pte_mkclean(ptent); + set_pte_at(mm, addr, pte, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + } + } +out: + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(orig_pte, ptl); + cond_resched(); + return 0; +} + +static void madvise_free_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + struct mm_walk free_walk = { + .pmd_entry = madvise_free_pte_range, + .mm = vma->vm_mm, + .private = tlb, + }; + + tlb_start_vma(tlb, vma); + walk_page_range(addr, end, &free_walk); + tlb_end_vma(tlb, vma); +} + +static int madvise_free_single_vma(struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) +{ + unsigned long start, end; + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) + return -EINVAL; + + /* MADV_FREE works for only anon vma at the moment */ + if (!vma_is_anonymous(vma)) + return -EINVAL; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + return -EINVAL; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return -EINVAL; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, start, end); + update_hiwater_rss(mm); + + mmu_notifier_invalidate_range_start(mm, start, end); + madvise_free_page_range(&tlb, vma, start, end); + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_finish_mmu(&tlb, start, end); + + return 0; +} + +static long madvise_free(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + *prev = vma; + return madvise_free_single_vma(vma, start, end); +} + /* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about @@ -379,6 +540,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); + case MADV_FREE: + /* + * XXX: In this implementation, MADV_FREE works like + * MADV_DONTNEED on swapless system or full swap. + */ + if (get_nr_swap_pages() > 0) + return madvise_free(vma, prev, start, end); + /* passthrough */ case MADV_DONTNEED: return madvise_dontneed(vma, prev, start, end); default: @@ -398,6 +567,7 @@ madvise_behavior_valid(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_FREE: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: diff --git a/mm/rmap.c b/mm/rmap.c index cdc2a885a4cd..68af2e32f7ed 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1411,6 +1411,11 @@ void page_remove_rmap(struct page *page, bool compound) */ } +struct rmap_private { + enum ttu_flags flags; + int lazyfreed; +}; + /* * @arg: enum ttu_flags will be passed to this argument */ @@ -1422,7 +1427,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pte_t pteval; spinlock_t *ptl; int ret = SWAP_AGAIN; - enum ttu_flags flags = (enum ttu_flags)arg; + struct rmap_private *rp = arg; + enum ttu_flags flags = rp->flags; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) @@ -1514,6 +1520,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * See handle_pte_fault() ... */ VM_BUG_ON_PAGE(!PageSwapCache(page), page); + + if (!PageDirty(page) && (flags & TTU_LZFREE)) { + /* It's a freeable page by MADV_FREE */ + dec_mm_counter(mm, MM_ANONPAGES); + rp->lazyfreed++; + goto discard; + } + if (swap_duplicate(entry) < 0) { set_pte_at(mm, address, pte, pteval); ret = SWAP_FAIL; @@ -1534,6 +1548,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } else dec_mm_counter(mm, mm_counter_file(page)); +discard: page_remove_rmap(page, PageHuge(page)); page_cache_release(page); @@ -1586,9 +1601,14 @@ static int page_not_mapped(struct page *page) int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; + struct rmap_private rp = { + .flags = flags, + .lazyfreed = 0, + }; + struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = (void *)flags, + .arg = &rp, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, }; @@ -1608,8 +1628,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) ret = rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapped(page)) + if (ret != SWAP_MLOCK && !page_mapped(page)) { ret = SWAP_SUCCESS; + if (rp.lazyfreed && !PageDirty(page)) + ret = SWAP_LZFREE; + } return ret; } @@ -1631,9 +1654,14 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) int try_to_munlock(struct page *page) { int ret; + struct rmap_private rp = { + .flags = TTU_MUNLOCK, + .lazyfreed = 0, + }; + struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = (void *)TTU_MUNLOCK, + .arg = &rp, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, diff --git a/mm/swap_state.c b/mm/swap_state.c index d783872d746c..676ff2991380 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -185,13 +185,12 @@ int add_to_swap(struct page *page, struct list_head *list) * deadlock in the swap out path. */ /* - * Add it to the swap cache and mark it dirty + * Add it to the swap cache. */ err = add_to_swap_cache(page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - if (!err) { /* Success */ - SetPageDirty(page); + if (!err) { return 1; } else { /* -ENOMEM radix-tree allocation failure */ /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 983e407afc09..5ac86956ff9d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -906,6 +906,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; + bool lazyfree = false; + int ret = SWAP_SUCCESS; cond_resched(); @@ -1049,6 +1051,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; if (!add_to_swap(page, page_list)) goto activate_locked; + lazyfree = true; may_enter_fs = 1; /* Adding to swap updated mapping */ @@ -1060,14 +1063,17 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, - ttu_flags|TTU_BATCH_FLUSH)) { + switch (ret = try_to_unmap(page, lazyfree ? + (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) : + (ttu_flags | TTU_BATCH_FLUSH))) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: goto keep_locked; case SWAP_MLOCK: goto cull_mlocked; + case SWAP_LZFREE: + goto lazyfree; case SWAP_SUCCESS: ; /* try to free the page below */ } @@ -1174,6 +1180,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } +lazyfree: if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; @@ -1186,6 +1193,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ __ClearPageLocked(page); free_it: + if (ret == SWAP_LZFREE) + count_vm_event(PGLAZYFREED); + nr_reclaimed++; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index 6489086f0753..64bd0aa13f75 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -783,6 +783,7 @@ const char * const vmstat_text[] = { "pgfault", "pgmajfault", + "pglazyfreed", TEXTS_FOR_ZONES("pgrefill") TEXTS_FOR_ZONES("pgsteal_kswapd")