diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index ef55ce14a2ce..157d83dbaef8 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -9,10 +9,12 @@
 
 #include <linux/bitops.h>
 #include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
 #include <linux/sched.h>
-#include <linux/vmstat.h>
 
 struct stable_node;
+struct mem_cgroup;
 
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
@@ -57,11 +59,36 @@ static inline void set_page_stable_node(struct page *page,
 		(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
 }
 
-static inline void page_add_ksm_rmap(struct page *page)
+/*
+ * When do_swap_page() first faults in from swap what used to be a KSM page,
+ * no problem, it will be assigned to this vma's anon_vma; but thereafter,
+ * it might be faulted into a different anon_vma (or perhaps to a different
+ * offset in the same anon_vma).  do_swap_page() cannot do all the locking
+ * needed to reconstitute a cross-anon_vma KSM page: for now it has to make
+ * a copy, and leave remerging the pages to a later pass of ksmd.
+ *
+ * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
+ * but what if the vma was unmerged while the page was swapped out?
+ */
+struct page *ksm_does_need_to_copy(struct page *page,
+			struct vm_area_struct *vma, unsigned long address);
+static inline struct page *ksm_might_need_to_copy(struct page *page,
+			struct vm_area_struct *vma, unsigned long address)
 {
-	if (atomic_inc_and_test(&page->_mapcount))
-		__inc_zone_page_state(page, NR_ANON_PAGES);
+	struct anon_vma *anon_vma = page_anon_vma(page);
+
+	if (!anon_vma ||
+	    (anon_vma == vma->anon_vma &&
+	     page->index == linear_page_index(vma, address)))
+		return page;
+
+	return ksm_does_need_to_copy(page, vma, address);
 }
+
+int page_referenced_ksm(struct page *page,
+			struct mem_cgroup *memcg, unsigned long *vm_flags);
+int try_to_unmap_ksm(struct page *page, enum ttu_flags flags);
+
 #else  /* !CONFIG_KSM */
 
 static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
@@ -84,7 +111,22 @@ static inline int PageKsm(struct page *page)
 	return 0;
 }
 
-/* No stub required for page_add_ksm_rmap(page) */
+static inline struct page *ksm_might_need_to_copy(struct page *page,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	return page;
+}
+
+static inline int page_referenced_ksm(struct page *page,
+			struct mem_cgroup *memcg, unsigned long *vm_flags)
+{
+	return 0;
+}
+
+static inline int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
+{
+	return 0;
+}
 #endif /* !CONFIG_KSM */
 
-#endif
+#endif /* __LINUX_KSM_H */
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 1f65af44c6d2..0b4913a4a344 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -89,6 +89,9 @@ static inline void page_dup_rmap(struct page *page)
  */
 int page_referenced(struct page *, int is_locked,
 			struct mem_cgroup *cnt, unsigned long *vm_flags);
+int page_referenced_one(struct page *, struct vm_area_struct *,
+	unsigned long address, unsigned int *mapcount, unsigned long *vm_flags);
+
 enum ttu_flags {
 	TTU_UNMAP = 0,			/* unmap mode */
 	TTU_MIGRATION = 1,		/* migration mode */
@@ -102,6 +105,8 @@ enum ttu_flags {
 #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
 
 int try_to_unmap(struct page *, enum ttu_flags flags);
+int try_to_unmap_one(struct page *, struct vm_area_struct *,
+			unsigned long address, enum ttu_flags flags);
 
 /*
  * Called from mm/filemap_xip.c to unmap empty zero page
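The new ksm_might_need_to_copy() above only makes a copy when the swapped-in page has already been claimed by a different anon_vma (or by a different offset in the same anon_vma); a page with no anon_vma yet - freshly read from swap, or still a PageKsm page sitting in swap cache - is returned unchanged. For orientation, its intended call site is the do_swap_page() change in the mm/memory.c hunk further down; condensed, that usage looks like this (a sketch only, with error labels and the rest of the fault handling elided):

	/* After locking the swapcache page, let KSM substitute a private
	 * copy before the page gets charged and mapped into this vma.
	 * A NULL return means the copy could not be allocated. */
	lock_page(page);

	page = ksm_might_need_to_copy(page, vma, address);
	if (!page) {
		ret = VM_FAULT_OOM;
		goto out;
	}
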
diff --git a/mm/ksm.c b/mm/ksm.c
index af5f571185d5..2f58ceebfe8f 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -196,6 +196,13 @@ static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
 static DEFINE_MUTEX(ksm_thread_mutex);
 static DEFINE_SPINLOCK(ksm_mmlist_lock);
 
+/*
+ * Temporary hack for page_referenced_ksm() and try_to_unmap_ksm(),
+ * later we rework things a little to get the right vma to them.
+ */
+static DEFINE_SPINLOCK(ksm_fallback_vma_lock);
+static struct vm_area_struct ksm_fallback_vma;
+
 #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
 		sizeof(struct __struct), __alignof__(struct __struct),\
 		(__flags), NULL)
@@ -445,14 +452,20 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 {
 	if (rmap_item->address & STABLE_FLAG) {
 		struct stable_node *stable_node;
+		struct page *page;
 
 		stable_node = rmap_item->head;
+		page = stable_node->page;
+		lock_page(page);
+
 		hlist_del(&rmap_item->hlist);
-		if (stable_node->hlist.first)
+		if (stable_node->hlist.first) {
+			unlock_page(page);
 			ksm_pages_sharing--;
-		else {
-			set_page_stable_node(stable_node->page, NULL);
-			put_page(stable_node->page);
+		} else {
+			set_page_stable_node(page, NULL);
+			unlock_page(page);
+			put_page(page);
 
 			rb_erase(&stable_node->node, &root_stable_tree);
 			free_stable_node(stable_node);
@@ -710,7 +723,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	}
 
 	get_page(kpage);
-	page_add_ksm_rmap(kpage);
+	page_add_anon_rmap(kpage, vma, addr);
 
 	flush_cache_page(vma, addr, pte_pfn(*ptep));
 	ptep_clear_flush(vma, addr, ptep);
@@ -763,8 +776,16 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
 	    pages_identical(page, kpage))
 		err = replace_page(vma, page, kpage, orig_pte);
 
-	if ((vma->vm_flags & VM_LOCKED) && !err)
+	if ((vma->vm_flags & VM_LOCKED) && !err) {
 		munlock_vma_page(page);
+		if (!PageMlocked(kpage)) {
+			unlock_page(page);
+			lru_add_drain();
+			lock_page(kpage);
+			mlock_vma_page(kpage);
+			page = kpage;		/* for final unlock */
+		}
+	}
 
 	unlock_page(page);
 out:
@@ -841,7 +862,11 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
 
 	copy_user_highpage(kpage, page, rmap_item->address, vma);
 
+	SetPageDirty(kpage);
+	__SetPageUptodate(kpage);
+	SetPageSwapBacked(kpage);
 	set_page_stable_node(kpage, NULL);	/* mark it PageKsm */
+	lru_cache_add_lru(kpage, LRU_ACTIVE_ANON);
 
 	err = try_to_merge_one_page(vma, page, kpage);
 up:
@@ -1071,7 +1096,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 			 * The page was successfully merged:
 			 * add its rmap_item to the stable tree.
 			 */
+			lock_page(kpage);
 			stable_tree_append(rmap_item, stable_node);
+			unlock_page(kpage);
 		}
 		put_page(kpage);
 		return;
@@ -1112,11 +1139,13 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 	if (kpage) {
 		remove_rmap_item_from_tree(tree_rmap_item);
 
+		lock_page(kpage);
 		stable_node = stable_tree_insert(kpage);
 		if (stable_node) {
 			stable_tree_append(tree_rmap_item, stable_node);
 			stable_tree_append(rmap_item, stable_node);
 		}
+		unlock_page(kpage);
 		put_page(kpage);
 
 		/*
@@ -1285,14 +1314,6 @@ static void ksm_do_scan(unsigned int scan_npages)
 			return;
 		if (!PageKsm(page) || !in_stable_tree(rmap_item))
 			cmp_and_merge_page(page, rmap_item);
-		else if (page_mapcount(page) == 1) {
-			/*
-			 * Replace now-unshared ksm page by ordinary page.
-			 */
-			break_cow(rmap_item);
-			remove_rmap_item_from_tree(rmap_item);
-			rmap_item->oldchecksum = calc_checksum(page);
-		}
 		put_page(page);
 	}
 }
@@ -1337,7 +1358,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
 				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
 				 VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE |
-				 VM_MIXEDMAP  | VM_SAO))
+				 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
 			return 0;		/* just ignore the advice */
 
 		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
@@ -1435,6 +1456,127 @@ void __ksm_exit(struct mm_struct *mm)
 	}
 }
 
+struct page *ksm_does_need_to_copy(struct page *page,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	struct page *new_page;
+
+	unlock_page(page);	/* any racers will COW it, not modify it */
+
+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+	if (new_page) {
+		copy_user_highpage(new_page, page, address, vma);
+
+		SetPageDirty(new_page);
+		__SetPageUptodate(new_page);
+		SetPageSwapBacked(new_page);
+		__set_page_locked(new_page);
+
+		if (page_evictable(new_page, vma))
+			lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
+		else
+			add_page_to_unevictable_list(new_page);
+	}
+
+	page_cache_release(page);
+	return new_page;
+}
+
+int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
+			unsigned long *vm_flags)
+{
+	struct stable_node *stable_node;
+	struct rmap_item *rmap_item;
+	struct hlist_node *hlist;
+	unsigned int mapcount = page_mapcount(page);
+	int referenced = 0;
+	struct vm_area_struct *vma;
+
+	VM_BUG_ON(!PageKsm(page));
+	VM_BUG_ON(!PageLocked(page));
+
+	stable_node = page_stable_node(page);
+	if (!stable_node)
+		return 0;
+
+	/*
+	 * Temporary hack: really we need anon_vma in rmap_item, to
+	 * provide the correct vma, and to find recently forked instances.
+	 * Use zalloc to avoid weirdness if any other fields are involved.
+	 */
+	vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
+	if (!vma) {
+		spin_lock(&ksm_fallback_vma_lock);
+		vma = &ksm_fallback_vma;
+	}
+
+	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+		if (memcg && !mm_match_cgroup(rmap_item->mm, memcg))
+			continue;
+
+		vma->vm_mm = rmap_item->mm;
+		vma->vm_start = rmap_item->address;
+		vma->vm_end = vma->vm_start + PAGE_SIZE;
+
+		referenced += page_referenced_one(page, vma,
+				rmap_item->address, &mapcount, vm_flags);
+		if (!mapcount)
+			goto out;
+	}
+out:
+	if (vma == &ksm_fallback_vma)
+		spin_unlock(&ksm_fallback_vma_lock);
+	else
+		kmem_cache_free(vm_area_cachep, vma);
+	return referenced;
+}
+
+int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
+{
+	struct stable_node *stable_node;
+	struct hlist_node *hlist;
+	struct rmap_item *rmap_item;
+	int ret = SWAP_AGAIN;
+	struct vm_area_struct *vma;
+
+	VM_BUG_ON(!PageKsm(page));
+	VM_BUG_ON(!PageLocked(page));
+
+	stable_node = page_stable_node(page);
+	if (!stable_node)
+		return SWAP_FAIL;
+
+	/*
+	 * Temporary hack: really we need anon_vma in rmap_item, to
+	 * provide the correct vma, and to find recently forked instances.
+	 * Use zalloc to avoid weirdness if any other fields are involved.
+	 */
+	if (TTU_ACTION(flags) != TTU_UNMAP)
+		return SWAP_FAIL;
+
+	vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
+	if (!vma) {
+		spin_lock(&ksm_fallback_vma_lock);
+		vma = &ksm_fallback_vma;
+	}
+
+	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+		vma->vm_mm = rmap_item->mm;
+		vma->vm_start = rmap_item->address;
+		vma->vm_end = vma->vm_start + PAGE_SIZE;
+
+		ret = try_to_unmap_one(page, vma, rmap_item->address, flags);
+		if (ret != SWAP_AGAIN || !page_mapped(page))
+			goto out;
+	}
+out:
+	if (vma == &ksm_fallback_vma)
+		spin_unlock(&ksm_fallback_vma_lock);
+	else
+		kmem_cache_free(vm_area_cachep, vma);
+	return ret;
+}
+
 #ifdef CONFIG_SYSFS
 /*
  * This all compiles without CONFIG_SYSFS, but is a waste of space.
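KSM pages are no longer kept off the LRU, so both try_to_merge_two_pages() above and ksm_does_need_to_copy() have to dress a freshly allocated page up as an ordinary swap-backed page. Restating that setup as a sketch, with my reading of what each step is for (kpage stands for whichever new page is about to receive the merged or copied contents):

	copy_user_highpage(kpage, page, address, vma);	/* fill in the contents */

	SetPageDirty(kpage);		/* contents exist nowhere else yet */
	__SetPageUptodate(kpage);	/* contents are now valid */
	SetPageSwapBacked(kpage);	/* account it as anonymous/swap-backed */
	lru_cache_add_lru(kpage, LRU_ACTIVE_ANON);	/* make it visible to reclaim */
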
diff --git a/mm/memory.c b/mm/memory.c
index 1c9dc46da3db..a54b2c498444 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2561,6 +2561,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	lock_page(page);
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 
+	page = ksm_might_need_to_copy(page, vma, address);
+	if (!page) {
+		ret = VM_FAULT_OOM;
+		goto out;
+	}
+
 	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
 		ret = VM_FAULT_OOM;
 		goto out_page;
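The mm/rmap.c hunks that follow all apply the same pattern: the reverse-map walkers now test PageKsm() before PageAnon(). The order matters, because a KSM page has PAGE_MAPPING_ANON set as well as PAGE_MAPPING_KSM, so PageAnon() is also true for it; only the explicit PageKsm() check routes it to the helpers that take the (mm, address) pairs from the stable tree rather than from an anon_vma. Condensed from the try_to_unmap() change below (page_referenced() and try_to_munlock() follow suit):

	if (unlikely(PageKsm(page)))
		ret = try_to_unmap_ksm(page, flags);
	else if (PageAnon(page))
		ret = try_to_unmap_anon(page, flags);
	else
		ret = try_to_unmap_file(page, flags);
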
diff --git a/mm/rmap.c b/mm/rmap.c
index ebee81688736..869aaa3206a2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
 #include <linux/swapops.h>
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/rcupdate.h>
 #include <linux/module.h>
@@ -336,9 +337,9 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
  * Subfunctions of page_referenced: page_referenced_one called
  * repeatedly from either page_referenced_anon or page_referenced_file.
  */
-static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
-			unsigned long address, unsigned int *mapcount,
-			unsigned long *vm_flags)
+int page_referenced_one(struct page *page, struct vm_area_struct *vma,
+			unsigned long address, unsigned int *mapcount,
+			unsigned long *vm_flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *pte;
@@ -507,28 +508,33 @@ int page_referenced(struct page *page,
 		    unsigned long *vm_flags)
 {
 	int referenced = 0;
+	int we_locked = 0;
 
 	if (TestClearPageReferenced(page))
 		referenced++;
 
 	*vm_flags = 0;
 	if (page_mapped(page) && page_rmapping(page)) {
-		if (PageAnon(page))
+		if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
+			we_locked = trylock_page(page);
+			if (!we_locked) {
+				referenced++;
+				goto out;
+			}
+		}
+		if (unlikely(PageKsm(page)))
+			referenced += page_referenced_ksm(page, mem_cont,
+								vm_flags);
+		else if (PageAnon(page))
 			referenced += page_referenced_anon(page, mem_cont,
 								vm_flags);
-		else if (is_locked)
+		else if (page->mapping)
 			referenced += page_referenced_file(page, mem_cont,
 								vm_flags);
-		else if (!trylock_page(page))
-			referenced++;
-		else {
-			if (page->mapping)
-				referenced += page_referenced_file(page,
-							mem_cont, vm_flags);
+		if (we_locked)
 			unlock_page(page);
-		}
 	}
-
+out:
 	if (page_test_and_clear_young(page))
 		referenced++;
 
@@ -620,14 +626,7 @@ static void __page_set_anon_rmap(struct page *page,
 	BUG_ON(!anon_vma);
 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
 	page->mapping = (struct address_space *) anon_vma;
-
 	page->index = linear_page_index(vma, address);
-
-	/*
-	 * nr_mapped state can be updated without turning off
-	 * interrupts because it is not modified via interrupt.
-	 */
-	__inc_zone_page_state(page, NR_ANON_PAGES);
 }
 
 /**
@@ -665,14 +664,21 @@ static void __page_check_anon_rmap(struct page *page,
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 *
- * The caller needs to hold the pte lock and the page must be locked.
+ * The caller needs to hold the pte lock, and the page must be locked in
+ * the anon_vma case: to serialize mapping,index checking after setting.
 */
 void page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
 {
+	int first = atomic_inc_and_test(&page->_mapcount);
+	if (first)
+		__inc_zone_page_state(page, NR_ANON_PAGES);
+	if (unlikely(PageKsm(page)))
+		return;
+
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-	if (atomic_inc_and_test(&page->_mapcount))
+	if (first)
 		__page_set_anon_rmap(page, vma, address);
 	else
 		__page_check_anon_rmap(page, vma, address);
@@ -694,6 +700,7 @@ void page_add_new_anon_rmap(struct page *page,
 	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 	SetPageSwapBacked(page);
 	atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
+	__inc_zone_page_state(page, NR_ANON_PAGES);
 	__page_set_anon_rmap(page, vma, address);
 	if (page_evictable(page, vma))
 		lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@@ -760,8 +767,8 @@ void page_remove_rmap(struct page *page)
 * Subfunctions of try_to_unmap: try_to_unmap_one called
 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 */
-static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
-			unsigned long address, enum ttu_flags flags)
+int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+			unsigned long address, enum ttu_flags flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *pte;
@@ -1156,7 +1163,9 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
 
 	BUG_ON(!PageLocked(page));
 
-	if (PageAnon(page))
+	if (unlikely(PageKsm(page)))
+		ret = try_to_unmap_ksm(page, flags);
+	else if (PageAnon(page))
 		ret = try_to_unmap_anon(page, flags);
 	else
 		ret = try_to_unmap_file(page, flags);
@@ -1177,15 +1186,17 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
 *
 * SWAP_AGAIN	- no vma is holding page mlocked, or,
 * SWAP_AGAIN	- page mapped in mlocked vma -- couldn't acquire mmap sem
+ * SWAP_FAIL	- page cannot be located at present
 * SWAP_MLOCK	- page is now mlocked.
 */
 int try_to_munlock(struct page *page)
 {
 	VM_BUG_ON(!PageLocked(page) || PageLRU(page));
 
-	if (PageAnon(page))
+	if (unlikely(PageKsm(page)))
+		return try_to_unmap_ksm(page, TTU_MUNLOCK);
+	else if (PageAnon(page))
 		return try_to_unmap_anon(page, TTU_MUNLOCK);
 	else
 		return try_to_unmap_file(page, TTU_MUNLOCK);
 }
-
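Since replace_page() in mm/ksm.c now maps KSM pages with the ordinary page_add_anon_rmap() (instead of the old page_add_ksm_rmap()), that function must not touch page->mapping for them: a KSM page's mapping already points at its stable_node, and the vma/index checks of the anon case do not apply. Restated with comments (same logic as the hunk above, not new code):

	int first = atomic_inc_and_test(&page->_mapcount);
	if (first)				/* first mapping of this page anywhere */
		__inc_zone_page_state(page, NR_ANON_PAGES);
	if (unlikely(PageKsm(page)))
		return;				/* mapping stays pointed at the stable_node */

	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	if (first)
		__page_set_anon_rmap(page, vma, address);	/* set mapping and index */
	else
		__page_check_anon_rmap(page, vma, address);	/* or verify they still match */

This is also why the NR_ANON_PAGES accounting moves here from __page_set_anon_rmap(): KSM mappings must be counted even though they never reach the anon_vma setup.
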
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e74112e8e5f4..6c0585b16418 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -22,6 +22,7 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/backing-dev.h>
@@ -650,6 +651,8 @@ int reuse_swap_page(struct page *page)
 	int count;
 
 	VM_BUG_ON(!PageLocked(page));
+	if (unlikely(PageKsm(page)))
+		return 0;
 	count = page_mapcount(page);
 	if (count <= 1 && PageSwapCache(page)) {
 		count += page_swapcount(page);
@@ -658,7 +661,7 @@
 			SetPageDirty(page);
 		}
 	}
-	return count == 1;
+	return count <= 1;
 }
 
 /*
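reuse_swap_page() is what do_wp_page() consults to decide whether the process taking a write fault may simply keep writing to the existing page. Returning 0 for every PageKsm page means a write to a merged page always breaks COW into a fresh anonymous copy, no matter how many mms still share it. The relaxation from count == 1 to count <= 1 appears to cover the page just produced by ksm_does_need_to_copy(): it is not yet mapped and not in swap cache, so the count is 0, yet it is certainly private to the faulter. The caller-side effect, roughly (a condensed sketch of do_wp_page() of this era from memory, so treat the surrounding structure as an assumption):

	if (PageAnon(old_page)) {
		/* ... lock the page ... */
		if (reuse_swap_page(old_page)) {
			/* sole owner: just make the existing pte writable */
		} else {
			/* shared, or a KSM page: allocate a new page and copy */
		}
	}
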
@@ -1185,6 +1188,12 @@ static int try_to_unuse(unsigned int type)
		 * read from disk into another page.  Splitting into two
		 * pages would be incorrect if swap supported "shared
		 * private" pages, but they are handled by tmpfs files.
+		 *
+		 * Given how unuse_vma() targets one particular offset
+		 * in an anon_vma, once the anon_vma has been determined,
+		 * this splitting happens to be just what is needed to
+		 * handle where KSM pages have been swapped out: re-reading
+		 * is unnecessarily slow, but we can fix that later on.
		 */
		if (swap_count(*swap_map) &&
		    PageDirty(page) && PageSwapCache(page)) {
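From user space nothing changes in the interface: a region is still made mergeable with madvise(MADV_MERGEABLE) as before (and ksmd still has to be enabled through /sys/kernel/mm/ksm/run), only now the pages that ksmd merges can be written out to swap under memory pressure instead of staying pinned. A minimal illustration (a hypothetical test program, not part of the patch):

	#include <string.h>
	#include <unistd.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 64UL << 20;	/* 64MB of anonymous memory */
		char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (buf == MAP_FAILED)
			return 1;

		memset(buf, 0x5a, len);		/* identical pages: candidates for merging */

		/* Ask ksmd to merge identical pages in this range; with this
		 * patch the resulting KSM pages may later be swapped out. */
		if (madvise(buf, len, MADV_MERGEABLE) != 0)
			return 1;

		sleep(60);			/* give ksmd time to scan (illustrative only) */
		return 0;
	}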