diff --git a/include/linux/mm.h b/include/linux/mm.h
index 864d7221de84..8f468e0d2534 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -500,11 +500,20 @@ static inline int page_mapcount(struct page *page)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int total_mapcount(struct page *page);
+int page_trans_huge_mapcount(struct page *page, int *total_mapcount);
 #else
 static inline int total_mapcount(struct page *page)
 {
 	return page_mapcount(page);
 }
+static inline int page_trans_huge_mapcount(struct page *page,
+					   int *total_mapcount)
+{
+	int mapcount = page_mapcount(page); /* without THP: max == total */
+	if (total_mapcount)
+		*total_mapcount = mapcount;
+	return mapcount;
+}
 #endif
 
 static inline struct page *virt_to_head_page(const void *x)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0a4cd4703f40..ad220359f1b0 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -418,7 +418,7 @@ extern sector_t swapdev_block(int, pgoff_t);
 extern int page_swapcount(struct page *);
 extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
-extern int reuse_swap_page(struct page *);
+extern bool reuse_swap_page(struct page *, int *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
 
@@ -513,8 +513,8 @@ static inline int swp_swapcount(swp_entry_t entry)
 	return 0;
 }
 
-#define reuse_swap_page(page) \
-	(!PageTransCompound(page) && page_mapcount(page) == 1)
+#define reuse_swap_page(page, total_mapcount) \
+	(page_trans_huge_mapcount(page, total_mapcount) == 1)
 
 static inline int try_to_free_swap(struct page *page)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f7daa7de8f48..b49ee126d4d1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1298,15 +1298,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
 	/*
 	 * We can only reuse the page if nobody else maps the huge page or it's
-	 * part. We can do it by checking page_mapcount() on each sub-page, but
-	 * it's expensive.
-	 * The cheaper way is to check page_count() to be equal 1: every
-	 * mapcount takes page reference reference, so this way we can
-	 * guarantee, that the PMD is the only mapping.
-	 * This can give false negative if somebody pinned the page, but that's
-	 * fine.
+	 * part.
 	 */
-	if (page_mapcount(page) == 1 && page_count(page) == 1) {
+	if (page_trans_huge_mapcount(page, NULL) == 1) {
 		pmd_t entry;
 		entry = pmd_mkyoung(orig_pmd);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -2079,7 +2073,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		if (pte_write(pteval)) {
 			writable = true;
 		} else {
-			if (PageSwapCache(page) && !reuse_swap_page(page)) {
+			if (PageSwapCache(page) &&
+			    !reuse_swap_page(page, NULL)) {
 				unlock_page(page);
 				result = SCAN_SWAP_CACHE_PAGE;
 				goto out;
@@ -3222,6 +3217,64 @@ int total_mapcount(struct page *page)
 	return ret;
 }
 
+/*
+ * This calculates accurately how many mappings a transparent hugepage
+ * has (unlike page_mapcount() which isn't fully accurate). This full
+ * accuracy is primarily needed to know if copy-on-write faults can
+ * reuse the page and change the mapping to read-write instead of
+ * copying it. If @total_mapcount isn't NULL, the total mapcount is
+ * stored there too (same as the return value for a non-THP page).
+ *
+ * The function returns the highest mapcount any one of the subpages
+ * has. If the return value is one, even if different processes are
+ * mapping different subpages of the transparent hugepage, they can
+ * all reuse it, because each process is reusing a different subpage.
+ *
+ * The total_mapcount is instead counting all virtual mappings of the
+ * subpages. If the total_mapcount is equal to "one", it tells the
+ * caller all mappings belong to the same "mm" and in turn the
+ * anon_vma of the transparent hugepage can become the vma->anon_vma
+ * local one as no other process may be mapping any of the subpages.
+ *
+ * It would be more accurate to replace page_mapcount() with
+ * page_trans_huge_mapcount(); however, it is slower than
+ * page_mapcount(), so it is only used in the copy-on-write faults,
+ * where full accuracy is needed to avoid breaking page pinning.
+ */
+int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
+{
+	int i, ret, _total_mapcount, mapcount;
+
+	/* hugetlbfs shouldn't call it */
+	VM_BUG_ON_PAGE(PageHuge(page), page);
+
+	if (likely(!PageTransCompound(page))) {
+		mapcount = atomic_read(&page->_mapcount) + 1;
+		if (total_mapcount)
+			*total_mapcount = mapcount;
+		return mapcount;
+	}
+
+	page = compound_head(page);
+
+	_total_mapcount = ret = 0;
+	for (i = 0; i < HPAGE_PMD_NR; i++) {
+		mapcount = atomic_read(&page[i]._mapcount) + 1;
+		ret = max(ret, mapcount);
+		_total_mapcount += mapcount;
+	}
+	if (PageDoubleMap(page)) {
+		ret -= 1;
+		_total_mapcount -= HPAGE_PMD_NR;
+	}
+	mapcount = compound_mapcount(page);
+	ret += mapcount;
+	_total_mapcount += mapcount;
+	if (total_mapcount)
+		*total_mapcount = _total_mapcount;
+	return ret;
+}
+
 /*
  * This function splits huge page into normal pages. @page can point to any
  * subpage of huge page to split. Split doesn't change the position of @page.
diff --git a/mm/memory.c b/mm/memory.c
index 52c218e2b724..07493e34ab7e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2373,6 +2373,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * not dirty accountable.
 	 */
 	if (PageAnon(old_page) && !PageKsm(old_page)) {
+		int total_mapcount;
 		if (!trylock_page(old_page)) {
 			get_page(old_page);
 			pte_unmap_unlock(page_table, ptl);
@@ -2387,13 +2388,18 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			}
 			put_page(old_page);
 		}
-		if (reuse_swap_page(old_page)) {
-			/*
-			 * The page is all ours. Move it to our anon_vma so
-			 * the rmap code will not search our parent or siblings.
-			 * Protected against the rmap code by the page lock.
-			 */
-			page_move_anon_rmap(old_page, vma, address);
+		if (reuse_swap_page(old_page, &total_mapcount)) {
+			if (total_mapcount == 1) {
+				/*
+				 * The page is all ours. Move it to
+				 * our anon_vma so the rmap code will
+				 * not search our parent or siblings.
+				 * Protected against the rmap code by
+				 * the page lock.
+				 */
+				page_move_anon_rmap(compound_head(old_page),
+						    vma, address);
+			}
 			unlock_page(old_page);
 			return wp_page_reuse(mm, vma, address, page_table, ptl,
 					     orig_pte, old_page, 0, 0);
@@ -2617,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	inc_mm_counter_fast(mm, MM_ANONPAGES);
 	dec_mm_counter_fast(mm, MM_SWAPENTS);
 	pte = mk_pte(page, vma->vm_page_prot);
-	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
+	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
 		flags &= ~FAULT_FLAG_WRITE;
 		ret |= VM_FAULT_WRITE;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 83874eced5bf..031713ab40ce 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -922,18 +922,19 @@ out:
  * to it. And as a side-effect, free up its swap: because the old content
  * on disk will never be read, and seeking back there to write new content
  * later would only waste time away from clustering.
+ *
+ * NOTE: total_mapcount should not be relied upon by the caller if
+ * reuse_swap_page() returns false, but it may be always overwritten
+ * (see the other implementation for CONFIG_SWAP=n).
  */
-int reuse_swap_page(struct page *page)
+bool reuse_swap_page(struct page *page, int *total_mapcount)
 {
 	int count;
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	if (unlikely(PageKsm(page)))
-		return 0;
-	/* The page is part of THP and cannot be reused */
-	if (PageTransCompound(page))
-		return 0;
-	count = page_mapcount(page);
+		return false;
+	count = page_trans_huge_mapcount(page, total_mapcount);
 	if (count <= 1 && PageSwapCache(page)) {
 		count += page_swapcount(page);
 		if (count == 1 && !PageWriteback(page)) {