diff --git a/include/linux/swap.h b/include/linux/swap.h index cd28ad206dae..7cee73ef4f15 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -28,7 +28,14 @@ static inline int current_is_kswapd(void) * the type/offset into the pte as 5/27 as well. */ #define MAX_SWAPFILES_SHIFT 5 +#ifndef CONFIG_MIGRATION #define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT) +#else +/* Use last two entries for page migration swap entries */ +#define MAX_SWAPFILES ((1 << MAX_SWAPFILES_SHIFT)-2) +#define SWP_MIGRATION_READ MAX_SWAPFILES +#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + 1) +#endif /* * Magic header for a swap area. The first part of the union is diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 87b9d14c710d..ec639aa3a1d3 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -67,3 +67,56 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry) BUG_ON(pte_file(__swp_entry_to_pte(arch_entry))); return __swp_entry_to_pte(arch_entry); } + +#ifdef CONFIG_MIGRATION +static inline swp_entry_t make_migration_entry(struct page *page, int write) +{ + BUG_ON(!PageLocked(page)); + return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ, + page_to_pfn(page)); +} + +static inline int is_migration_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_MIGRATION_READ || + swp_type(entry) == SWP_MIGRATION_WRITE); +} + +static inline int is_write_migration_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); +} + +static inline struct page *migration_entry_to_page(swp_entry_t entry) +{ + struct page *p = pfn_to_page(swp_offset(entry)); + /* + * Any use of migration entries may only occur while the + * corresponding page is locked + */ + BUG_ON(!PageLocked(p)); + return p; +} + +static inline void make_migration_entry_read(swp_entry_t *entry) +{ + *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry)); +} + +extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address); +#else + +#define make_migration_entry(page, write) swp_entry(0, 0) +#define is_migration_entry(swp) 0 +#define migration_entry_to_page(swp) NULL +static inline void make_migration_entry_read(swp_entry_t *entryp) { } +static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) { } +static inline int is_write_migration_entry(swp_entry_t entry) +{ + return 0; +} + +#endif + diff --git a/mm/memory.c b/mm/memory.c index 7e3683fd4f3c..11673c5d2c20 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -434,7 +434,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { if (!pte_file(pte)) { - swap_duplicate(pte_to_swp_entry(pte)); + swp_entry_t entry = pte_to_swp_entry(pte); + + swap_duplicate(entry); /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); @@ -443,6 +445,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, &src_mm->mmlist); spin_unlock(&mmlist_lock); } + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both parent + * and child to be set to read. + */ + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + set_pte_at(src_mm, addr, src_pte, pte); + } } goto out_set_pte; } @@ -1879,6 +1891,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; entry = pte_to_swp_entry(orig_pte); + if (is_migration_entry(entry)) { + migration_entry_wait(mm, pmd, address); + goto out; + } page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); diff --git a/mm/migrate.c b/mm/migrate.c index 5a340f4ca212..0a011e421bb4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -23,7 +24,6 @@ #include #include #include -#include #include "internal.h" @@ -119,6 +119,132 @@ int putback_lru_pages(struct list_head *l) return count; } +static inline int is_swap_pte(pte_t pte) +{ + return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); +} + +/* + * Restore a potential migration pte to a working pte entry + */ +static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, + struct page *old, struct page *new) +{ + struct mm_struct *mm = vma->vm_mm; + swp_entry_t entry; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return; + + ptep = pte_offset_map(pmd, addr); + + if (!is_swap_pte(*ptep)) { + pte_unmap(ptep); + return; + } + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; + + entry = pte_to_swp_entry(pte); + + if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) + goto out; + + inc_mm_counter(mm, anon_rss); + get_page(new); + pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); + if (is_write_migration_entry(entry)) + pte = pte_mkwrite(pte); + set_pte_at(mm, addr, ptep, pte); + page_add_anon_rmap(new, vma, addr); +out: + pte_unmap_unlock(ptep, ptl); +} + +/* + * Get rid of all migration entries and replace them by + * references to the indicated page. + * + * Must hold mmap_sem lock on at least one of the vmas containing + * the page so that the anon_vma cannot vanish. + */ +static void remove_migration_ptes(struct page *old, struct page *new) +{ + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + unsigned long mapping; + + mapping = (unsigned long)new->mapping; + + if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) + return; + + /* + * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. + */ + anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); + spin_lock(&anon_vma->lock); + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) + remove_migration_pte(vma, page_address_in_vma(new, vma), + old, new); + + spin_unlock(&anon_vma->lock); +} + +/* + * Something used the pte of a page under migration. We need to + * get to the page and wait until migration is finished. + * When we return from this function the fault will be retried. + * + * This function is called from do_swap_page(). + */ +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + pte_t *ptep, pte; + spinlock_t *ptl; + swp_entry_t entry; + struct page *page; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; + + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto out; + + page = migration_entry_to_page(entry); + + get_page(page); + pte_unmap_unlock(ptep, ptl); + wait_on_page_locked(page); + put_page(page); + return; +out: + pte_unmap_unlock(ptep, ptl); +} + /* * swapout a single page * page is locked upon entry, unlocked on exit diff --git a/mm/mprotect.c b/mm/mprotect.c index 5faf01ad3ef8..14f93e62270f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -19,7 +19,8 @@ #include #include #include - +#include +#include #include #include #include @@ -28,12 +29,13 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot) { - pte_t *pte; + pte_t *pte, oldpte; spinlock_t *ptl; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); do { - if (pte_present(*pte)) { + oldpte = *pte; + if (pte_present(oldpte)) { pte_t ptent; /* Avoid an SMP race with hardware updated dirty/clean @@ -43,7 +45,22 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); set_pte_at(mm, addr, pte, ptent); lazy_mmu_prot_update(ptent); +#ifdef CONFIG_MIGRATION + } else if (!pte_file(oldpte)) { + swp_entry_t entry = pte_to_swp_entry(oldpte); + + if (is_write_migration_entry(entry)) { + /* + * A protection check is difficult so + * just be safe and disable write + */ + make_migration_entry_read(&entry); + set_pte_at(mm, addr, pte, + swp_entry_to_pte(entry)); + } +#endif } + } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(pte - 1, ptl); } diff --git a/mm/rmap.c b/mm/rmap.c index 10806b7af40c..3b8ce86daa3a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -103,7 +103,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; - list_add(&vma->anon_vma_node, &anon_vma->head); + list_add_tail(&vma->anon_vma_node, &anon_vma->head); allocated = NULL; } spin_unlock(&mm->page_table_lock); @@ -127,7 +127,7 @@ void __anon_vma_link(struct vm_area_struct *vma) struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) { - list_add(&vma->anon_vma_node, &anon_vma->head); + list_add_tail(&vma->anon_vma_node, &anon_vma->head); validate_anon_vma(vma); } } @@ -138,7 +138,7 @@ void anon_vma_link(struct vm_area_struct *vma) if (anon_vma) { spin_lock(&anon_vma->lock); - list_add(&vma->anon_vma_node, &anon_vma->head); + list_add_tail(&vma->anon_vma_node, &anon_vma->head); validate_anon_vma(vma); spin_unlock(&anon_vma->lock); } @@ -620,17 +620,27 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; - /* - * Store the swap location in the pte. - * See handle_pte_fault() ... - */ - BUG_ON(!PageSwapCache(page)); - swap_duplicate(entry); - if (list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - if (list_empty(&mm->mmlist)) - list_add(&mm->mmlist, &init_mm.mmlist); - spin_unlock(&mmlist_lock); + + if (PageSwapCache(page)) { + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... + */ + swap_duplicate(entry); + if (list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + if (list_empty(&mm->mmlist)) + list_add(&mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); + } + } else { + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + BUG_ON(!migration); + entry = make_migration_entry(page, pte_write(pteval)); } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); diff --git a/mm/swapfile.c b/mm/swapfile.c index 47a6812f5f8c..e3b1362372c2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -395,6 +395,9 @@ void free_swap_and_cache(swp_entry_t entry) struct swap_info_struct * p; struct page *page = NULL; + if (is_migration_entry(entry)) + return; + p = swap_info_get(entry); if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) { @@ -1400,19 +1403,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) if (!(p->flags & SWP_USED)) break; error = -EPERM; - /* - * Test if adding another swap device is possible. There are - * two limiting factors: 1) the number of bits for the swap - * type swp_entry_t definition and 2) the number of bits for - * the swap type in the swap ptes as defined by the different - * architectures. To honor both limitations a swap entry - * with swap offset 0 and swap type ~0UL is created, encoded - * to a swap pte, decoded to a swp_entry_t again and finally - * the swap type part is extracted. This will mask all bits - * from the initial ~0UL that can't be encoded in either the - * swp_entry_t or the architecture definition of a swap pte. - */ - if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) { + if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); goto out; } @@ -1702,6 +1693,9 @@ int swap_duplicate(swp_entry_t entry) unsigned long offset, type; int result = 0; + if (is_migration_entry(entry)) + return 1; + type = swp_type(entry); if (type >= nr_swapfiles) goto bad_file;