diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX index 081c49777abb..6a5e2a102a45 100644 --- a/Documentation/vm/00-INDEX +++ b/Documentation/vm/00-INDEX @@ -14,6 +14,8 @@ hugetlbpage.txt - a brief summary of hugetlbpage support in the Linux kernel. hwpoison.txt - explains what hwpoison is +idle_page_tracking.txt + - description of the idle page tracking feature. ksm.txt - how to use the Kernel Samepage Merging feature. numa diff --git a/Documentation/vm/idle_page_tracking.txt b/Documentation/vm/idle_page_tracking.txt new file mode 100644 index 000000000000..85dcc3bb85dc --- /dev/null +++ b/Documentation/vm/idle_page_tracking.txt @@ -0,0 +1,98 @@ +MOTIVATION + +The idle page tracking feature allows to track which memory pages are being +accessed by a workload and which are idle. This information can be useful for +estimating the workload's working set size, which, in turn, can be taken into +account when configuring the workload parameters, setting memory cgroup limits, +or deciding where to place the workload within a compute cluster. + +It is enabled by CONFIG_IDLE_PAGE_TRACKING=y. + +USER API + +The idle page tracking API is located at /sys/kernel/mm/page_idle. Currently, +it consists of the only read-write file, /sys/kernel/mm/page_idle/bitmap. + +The file implements a bitmap where each bit corresponds to a memory page. The +bitmap is represented by an array of 8-byte integers, and the page at PFN #i is +mapped to bit #i%64 of array element #i/64, byte order is native. When a bit is +set, the corresponding page is idle. + +A page is considered idle if it has not been accessed since it was marked idle +(for more details on what "accessed" actually means see the IMPLEMENTATION +DETAILS section). To mark a page idle one has to set the bit corresponding to +the page by writing to the file. A value written to the file is OR-ed with the +current bitmap value. + +Only accesses to user memory pages are tracked. These are pages mapped to a +process address space, page cache and buffer pages, swap cache pages. For other +page types (e.g. SLAB pages) an attempt to mark a page idle is silently ignored, +and hence such pages are never reported idle. + +For huge pages the idle flag is set only on the head page, so one has to read +/proc/kpageflags in order to correctly count idle huge pages. + +Reading from or writing to /sys/kernel/mm/page_idle/bitmap will return +-EINVAL if you are not starting the read/write on an 8-byte boundary, or +if the size of the read/write is not a multiple of 8 bytes. Writing to +this file beyond max PFN will return -ENXIO. + +That said, in order to estimate the amount of pages that are not used by a +workload one should: + + 1. Mark all the workload's pages as idle by setting corresponding bits in + /sys/kernel/mm/page_idle/bitmap. The pages can be found by reading + /proc/pid/pagemap if the workload is represented by a process, or by + filtering out alien pages using /proc/kpagecgroup in case the workload is + placed in a memory cgroup. + + 2. Wait until the workload accesses its working set. + + 3. Read /sys/kernel/mm/page_idle/bitmap and count the number of bits set. If + one wants to ignore certain types of pages, e.g. mlocked pages since they + are not reclaimable, he or she can filter them out using /proc/kpageflags. + +See Documentation/vm/pagemap.txt for more information about /proc/pid/pagemap, +/proc/kpageflags, and /proc/kpagecgroup. + +IMPLEMENTATION DETAILS + +The kernel internally keeps track of accesses to user memory pages in order to +reclaim unreferenced pages first on memory shortage conditions. A page is +considered referenced if it has been recently accessed via a process address +space, in which case one or more PTEs it is mapped to will have the Accessed bit +set, or marked accessed explicitly by the kernel (see mark_page_accessed()). The +latter happens when: + + - a userspace process reads or writes a page using a system call (e.g. read(2) + or write(2)) + + - a page that is used for storing filesystem buffers is read or written, + because a process needs filesystem metadata stored in it (e.g. lists a + directory tree) + + - a page is accessed by a device driver using get_user_pages() + +When a dirty page is written to swap or disk as a result of memory reclaim or +exceeding the dirty memory limit, it is not marked referenced. + +The idle memory tracking feature adds a new page flag, the Idle flag. This flag +is set manually, by writing to /sys/kernel/mm/page_idle/bitmap (see the USER API +section), and cleared automatically whenever a page is referenced as defined +above. + +When a page is marked idle, the Accessed bit must be cleared in all PTEs it is +mapped to, otherwise we will not be able to detect accesses to the page coming +from a process address space. To avoid interference with the reclaimer, which, +as noted above, uses the Accessed bit to promote actively referenced pages, one +more page flag is introduced, the Young flag. When the PTE Accessed bit is +cleared as a result of setting or updating a page's Idle flag, the Young flag +is set on the page. The reclaimer treats the Young flag as an extra PTE +Accessed bit and therefore will consider such a page as referenced. + +Since the idle memory tracking feature is based on the memory reclaimer logic, +it only works with pages that are on an LRU list, other pages are silently +ignored. That means it will ignore a user memory page if it is isolated, but +since there are usually not many of them, it should not affect the overall +result noticeably. In order not to stall scanning of the idle page bitmap, +locked pages may be skipped too. diff --git a/fs/proc/page.c b/fs/proc/page.c index 70d23245dd43..c2d29edcaa6b 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -10,12 +10,15 @@ #include #include #include +#include +#include #include #include #include "internal.h" #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) +#define KPMBITS (KPMSIZE * BITS_PER_BYTE) /* /proc/kpagecount - an array exposing page counts * diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 41f1a50c10c9..e2d46adb54b4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -459,7 +460,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, mss->resident += size; /* Accumulate the size in pages that have been accessed. */ - if (young || PageReferenced(page)) + if (young || page_is_young(page) || PageReferenced(page)) mss->referenced += size; mapcount = page_mapcount(page); if (mapcount >= 2) { @@ -807,6 +808,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, /* Clear accessed and referenced bits. */ pmdp_test_and_clear_young(vma, addr, pmd); + test_and_clear_page_young(page); ClearPageReferenced(page); out: spin_unlock(ptl); @@ -834,6 +836,7 @@ out: /* Clear accessed and referenced bits. */ ptep_test_and_clear_young(vma, addr, pte); + test_and_clear_page_young(page); ClearPageReferenced(page); } pte_unmap_unlock(pte - 1, ptl); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index a5b17137c683..a1a210d59961 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -471,6 +471,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young +#define ptep_clear_young_notify ptep_test_and_clear_young +#define pmdp_clear_young_notify pmdp_test_and_clear_young #define ptep_clear_flush_notify ptep_clear_flush #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush #define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 41c93844fb1d..416509e26d6d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -108,6 +108,10 @@ enum pageflags { #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE PG_compound_lock, +#endif +#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) + PG_young, + PG_idle, #endif __NR_PAGEFLAGS, @@ -289,6 +293,13 @@ PAGEFLAG_FALSE(HWPoison) #define __PG_HWPOISON 0 #endif +#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +TESTPAGEFLAG(Young, young) +SETPAGEFLAG(Young, young) +TESTCLEARFLAG(Young, young) +PAGEFLAG(Idle, idle) +#endif + /* * On an anonymous page mapped into a user virtual memory area, * page->mapping points to its anon_vma, not to a struct address_space; diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index c42981cd99aa..17f118a82854 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -26,6 +26,10 @@ enum page_ext_flags { PAGE_EXT_DEBUG_POISON, /* Page is poisoned */ PAGE_EXT_DEBUG_GUARD, PAGE_EXT_OWNER, +#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) + PAGE_EXT_YOUNG, + PAGE_EXT_IDLE, +#endif }; /* diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h new file mode 100644 index 000000000000..bf268fa92c5b --- /dev/null +++ b/include/linux/page_idle.h @@ -0,0 +1,110 @@ +#ifndef _LINUX_MM_PAGE_IDLE_H +#define _LINUX_MM_PAGE_IDLE_H + +#include +#include +#include + +#ifdef CONFIG_IDLE_PAGE_TRACKING + +#ifdef CONFIG_64BIT +static inline bool page_is_young(struct page *page) +{ + return PageYoung(page); +} + +static inline void set_page_young(struct page *page) +{ + SetPageYoung(page); +} + +static inline bool test_and_clear_page_young(struct page *page) +{ + return TestClearPageYoung(page); +} + +static inline bool page_is_idle(struct page *page) +{ + return PageIdle(page); +} + +static inline void set_page_idle(struct page *page) +{ + SetPageIdle(page); +} + +static inline void clear_page_idle(struct page *page) +{ + ClearPageIdle(page); +} +#else /* !CONFIG_64BIT */ +/* + * If there is not enough space to store Idle and Young bits in page flags, use + * page ext flags instead. + */ +extern struct page_ext_operations page_idle_ops; + +static inline bool page_is_young(struct page *page) +{ + return test_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags); +} + +static inline void set_page_young(struct page *page) +{ + set_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags); +} + +static inline bool test_and_clear_page_young(struct page *page) +{ + return test_and_clear_bit(PAGE_EXT_YOUNG, + &lookup_page_ext(page)->flags); +} + +static inline bool page_is_idle(struct page *page) +{ + return test_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags); +} + +static inline void set_page_idle(struct page *page) +{ + set_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags); +} + +static inline void clear_page_idle(struct page *page) +{ + clear_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags); +} +#endif /* CONFIG_64BIT */ + +#else /* !CONFIG_IDLE_PAGE_TRACKING */ + +static inline bool page_is_young(struct page *page) +{ + return false; +} + +static inline void set_page_young(struct page *page) +{ +} + +static inline bool test_and_clear_page_young(struct page *page) +{ + return false; +} + +static inline bool page_is_idle(struct page *page) +{ + return false; +} + +static inline void set_page_idle(struct page *page) +{ +} + +static inline void clear_page_idle(struct page *page) +{ +} + +#endif /* CONFIG_IDLE_PAGE_TRACKING */ + +#endif /* _LINUX_MM_PAGE_IDLE_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 3a4070f5ab79..6413d027c0b2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -649,6 +649,18 @@ config DEFERRED_STRUCT_PAGE_INIT processes running early in the lifetime of the systemm until kswapd finishes the initialisation. +config IDLE_PAGE_TRACKING + bool "Enable idle page tracking" + depends on SYSFS && MMU + select PAGE_EXTENSION if !64BIT + help + This feature allows to estimate the amount of user pages that have + not been touched during a given period of time. This information can + be useful to tune memory cgroup limits and/or for job placement + within a compute cluster. + + See Documentation/vm/idle_page_tracking.txt for more details. + config ZONE_DEVICE bool "Device memory (pmem, etc...) hotplug support" if EXPERT default !ZONE_DMA diff --git a/mm/Makefile b/mm/Makefile index b424d5e5b6ff..56f8eed73f1a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -79,3 +79,4 @@ obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o +obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o diff --git a/mm/debug.c b/mm/debug.c index 76089ddf99ea..6c1b3ea61bfd 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -48,6 +48,10 @@ static const struct trace_print_flags pageflag_names[] = { #ifdef CONFIG_TRANSPARENT_HUGEPAGE {1UL << PG_compound_lock, "compound_lock" }, #endif +#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) + {1UL << PG_young, "young" }, + {1UL << PG_idle, "idle" }, +#endif }; static void dump_flags(unsigned long flags, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b16279cbd91d..4b06b8db9df2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -1757,6 +1758,11 @@ static void __split_huge_page_refcount(struct page *page, /* clear PageTail before overwriting first_page */ smp_wmb(); + if (page_is_young(page)) + set_page_young(page_tail); + if (page_is_idle(page)) + set_page_idle(page_tail); + /* * __split_huge_page_splitting() already set the * splitting bit in all pmd that could map this @@ -2262,7 +2268,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(PageLRU(page), page); /* If there is no mapped pte young don't collapse the page */ - if (pte_young(pteval) || PageReferenced(page) || + if (pte_young(pteval) || + page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced = true; } @@ -2693,7 +2700,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, */ if (page_count(page) != 1 + !!PageSwapCache(page)) goto out_unmap; - if (pte_young(pteval) || PageReferenced(page) || + if (pte_young(pteval) || + page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced = true; } diff --git a/mm/migrate.c b/mm/migrate.c index 02ce25df16c2..c3cb566af3e2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -524,6 +525,11 @@ void migrate_page_copy(struct page *newpage, struct page *page) __set_page_dirty_nobuffers(newpage); } + if (page_is_young(page)) + set_page_young(newpage); + if (page_is_idle(page)) + set_page_idle(newpage); + /* * Copy NUMA information to the new page, to prevent over-eager * future migrations of this same page. diff --git a/mm/page_ext.c b/mm/page_ext.c index d86fd2f5353f..292ca7b8debd 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -6,6 +6,7 @@ #include #include #include +#include /* * struct page extension @@ -59,6 +60,9 @@ static struct page_ext_operations *page_ext_ops[] = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif +#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) + &page_idle_ops, +#endif }; static unsigned long total_usage; diff --git a/mm/page_idle.c b/mm/page_idle.c new file mode 100644 index 000000000000..d5dd79041484 --- /dev/null +++ b/mm/page_idle.c @@ -0,0 +1,232 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BITMAP_CHUNK_SIZE sizeof(u64) +#define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE) + +/* + * Idle page tracking only considers user memory pages, for other types of + * pages the idle flag is always unset and an attempt to set it is silently + * ignored. + * + * We treat a page as a user memory page if it is on an LRU list, because it is + * always safe to pass such a page to rmap_walk(), which is essential for idle + * page tracking. With such an indicator of user pages we can skip isolated + * pages, but since there are not usually many of them, it will hardly affect + * the overall result. + * + * This function tries to get a user memory page by pfn as described above. + */ +static struct page *page_idle_get_page(unsigned long pfn) +{ + struct page *page; + struct zone *zone; + + if (!pfn_valid(pfn)) + return NULL; + + page = pfn_to_page(pfn); + if (!page || !PageLRU(page) || + !get_page_unless_zero(page)) + return NULL; + + zone = page_zone(page); + spin_lock_irq(&zone->lru_lock); + if (unlikely(!PageLRU(page))) { + put_page(page); + page = NULL; + } + spin_unlock_irq(&zone->lru_lock); + return page; +} + +static int page_idle_clear_pte_refs_one(struct page *page, + struct vm_area_struct *vma, + unsigned long addr, void *arg) +{ + struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; + pmd_t *pmd; + pte_t *pte; + bool referenced = false; + + if (unlikely(PageTransHuge(page))) { + pmd = page_check_address_pmd(page, mm, addr, + PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); + if (pmd) { + referenced = pmdp_clear_young_notify(vma, addr, pmd); + spin_unlock(ptl); + } + } else { + pte = page_check_address(page, mm, addr, &ptl, 0); + if (pte) { + referenced = ptep_clear_young_notify(vma, addr, pte); + pte_unmap_unlock(pte, ptl); + } + } + if (referenced) { + clear_page_idle(page); + /* + * We cleared the referenced bit in a mapping to this page. To + * avoid interference with page reclaim, mark it young so that + * page_referenced() will return > 0. + */ + set_page_young(page); + } + return SWAP_AGAIN; +} + +static void page_idle_clear_pte_refs(struct page *page) +{ + /* + * Since rwc.arg is unused, rwc is effectively immutable, so we + * can make it static const to save some cycles and stack. + */ + static const struct rmap_walk_control rwc = { + .rmap_one = page_idle_clear_pte_refs_one, + .anon_lock = page_lock_anon_vma_read, + }; + bool need_lock; + + if (!page_mapped(page) || + !page_rmapping(page)) + return; + + need_lock = !PageAnon(page) || PageKsm(page); + if (need_lock && !trylock_page(page)) + return; + + rmap_walk(page, (struct rmap_walk_control *)&rwc); + + if (need_lock) + unlock_page(page); +} + +static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t pos, size_t count) +{ + u64 *out = (u64 *)buf; + struct page *page; + unsigned long pfn, end_pfn; + int bit; + + if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE) + return -EINVAL; + + pfn = pos * BITS_PER_BYTE; + if (pfn >= max_pfn) + return 0; + + end_pfn = pfn + count * BITS_PER_BYTE; + if (end_pfn > max_pfn) + end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS); + + for (; pfn < end_pfn; pfn++) { + bit = pfn % BITMAP_CHUNK_BITS; + if (!bit) + *out = 0ULL; + page = page_idle_get_page(pfn); + if (page) { + if (page_is_idle(page)) { + /* + * The page might have been referenced via a + * pte, in which case it is not idle. Clear + * refs and recheck. + */ + page_idle_clear_pte_refs(page); + if (page_is_idle(page)) + *out |= 1ULL << bit; + } + put_page(page); + } + if (bit == BITMAP_CHUNK_BITS - 1) + out++; + cond_resched(); + } + return (char *)out - buf; +} + +static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t pos, size_t count) +{ + const u64 *in = (u64 *)buf; + struct page *page; + unsigned long pfn, end_pfn; + int bit; + + if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE) + return -EINVAL; + + pfn = pos * BITS_PER_BYTE; + if (pfn >= max_pfn) + return -ENXIO; + + end_pfn = pfn + count * BITS_PER_BYTE; + if (end_pfn > max_pfn) + end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS); + + for (; pfn < end_pfn; pfn++) { + bit = pfn % BITMAP_CHUNK_BITS; + if ((*in >> bit) & 1) { + page = page_idle_get_page(pfn); + if (page) { + page_idle_clear_pte_refs(page); + set_page_idle(page); + put_page(page); + } + } + if (bit == BITMAP_CHUNK_BITS - 1) + in++; + cond_resched(); + } + return (char *)in - buf; +} + +static struct bin_attribute page_idle_bitmap_attr = + __BIN_ATTR(bitmap, S_IRUSR | S_IWUSR, + page_idle_bitmap_read, page_idle_bitmap_write, 0); + +static struct bin_attribute *page_idle_bin_attrs[] = { + &page_idle_bitmap_attr, + NULL, +}; + +static struct attribute_group page_idle_attr_group = { + .bin_attrs = page_idle_bin_attrs, + .name = "page_idle", +}; + +#ifndef CONFIG_64BIT +static bool need_page_idle(void) +{ + return true; +} +struct page_ext_operations page_idle_ops = { + .need = need_page_idle, +}; +#endif + +static int __init page_idle_init(void) +{ + int err; + + err = sysfs_create_group(mm_kobj, &page_idle_attr_group); + if (err) { + pr_err("page_idle: register sysfs failed\n"); + return err; + } + return 0; +} +subsys_initcall(page_idle_init); diff --git a/mm/rmap.c b/mm/rmap.c index 0db38e7d0a72..f5b5c1f3dcd7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -59,6 +59,7 @@ #include #include #include +#include #include @@ -886,6 +887,11 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } + if (referenced) + clear_page_idle(page); + if (test_and_clear_page_young(page)) + referenced++; + if (referenced) { pra->referenced++; pra->vm_flags |= vma->vm_flags; diff --git a/mm/swap.c b/mm/swap.c index a3a0a2f1f7c3..983f692a47fd 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" @@ -622,6 +623,8 @@ void mark_page_accessed(struct page *page) } else if (!PageReferenced(page)) { SetPageReferenced(page); } + if (page_is_idle(page)) + clear_page_idle(page); } EXPORT_SYMBOL(mark_page_accessed);