From c49f50d1983d53871ecc77b60c1fa69a2a5ca6d9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:25 -0800 Subject: [PATCH 001/118] mm: make pagecache tagged lookups return only head pages Patch series "Overhaul multi-page lookups for THP", v4. This THP prep patchset changes several page cache iteration APIs to only return head pages. - It's only possible to tag head pages in the page cache, so only return head pages, not all their subpages. - Factor a lot of common code out of the various batch lookup routines - Add mapping_seek_hole_data() - Unify find_get_entries() and pagevec_lookup_entries() - Make find_get_entries only return head pages, like find_get_entry(). These are only loosely connected, but they seem to make sense together as a series. This patch (of 14): Pagecache tags are used for dirty page writeback. Since dirtiness is tracked on a per-THP basis, we only want to return the head page rather than each subpage of a tagged page. All the filesystems which use huge pages today are in-memory, so there are no tagged huge pages today. Link: https://lkml.kernel.org/r/20201112212641.27837-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Hugh Dickins Cc: Johannes Weiner Cc: Yang Shi Cc: Dave Chinner Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 46a8b9e82434..57eae5163bce 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2062,7 +2062,7 @@ retry: EXPORT_SYMBOL(find_get_pages_contig); /** - * find_get_pages_range_tag - find and return pages in given range matching @tag + * find_get_pages_range_tag - Find and return head pages matching @tag. * @mapping: the address_space to search * @index: the starting page index * @end: The final page index (inclusive) @@ -2070,8 +2070,9 @@ EXPORT_SYMBOL(find_get_pages_contig); * @nr_pages: the maximum number of pages * @pages: where the resulting pages are placed * - * Like find_get_pages, except we only return pages which are tagged with - * @tag. We update @index to index the next page for the traversal. + * Like find_get_pages(), except we only return head pages which are tagged + * with @tag. @index is updated to the index immediately after the last + * page we return, ready for the next iteration. * * Return: the number of pages which were found. */ @@ -2105,9 +2106,9 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = find_subpage(page, xas.xa_index); + pages[ret] = page; if (++ret == nr_pages) { - *index = xas.xa_index + 1; + *index = page->index + thp_nr_pages(page); goto out; } continue; From 96888e0ab0e652eb3036eff0cb0664a96cb7e9a9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:29 -0800 Subject: [PATCH 002/118] mm/shmem: use pagevec_lookup in shmem_unlock_mapping The comment shows that the reason for using find_get_entries() is now stale; find_get_pages() will not return 0 if it hits a consecutive run of swap entries, and I don't believe it has since 2011. pagevec_lookup() is a simpler function to use than find_get_pages(), so use it instead. 
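For reference, pagevec_lookup() is only a thin convenience wrapper; roughly (as defined in include/linux/pagevec.h around the time of this series):

	static inline unsigned pagevec_lookup(struct pagevec *pvec,
			struct address_space *mapping, pgoff_t *start)
	{
		return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1);
	}

It advances *start for the caller and never returns value entries, so the manual index bookkeeping and the pagevec_remove_exceptionals() call in shmem_unlock_mapping() can simply be dropped.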
Link: https://lkml.kernel.org/r/20201112212641.27837-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index ff741d229701..5ea1fa53db3f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -842,7 +842,6 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma) void shmem_unlock_mapping(struct address_space *mapping) { struct pagevec pvec; - pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index = 0; pagevec_init(&pvec); @@ -850,16 +849,8 @@ void shmem_unlock_mapping(struct address_space *mapping) * Minor point, but we might as well stop if someone else SHM_LOCKs it. */ while (!mapping_unevictable(mapping)) { - /* - * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it - * has finished, if it hits a row of PAGEVEC_SIZE swap entries. - */ - pvec.nr = find_get_entries(mapping, index, - PAGEVEC_SIZE, pvec.pages, indices); - if (!pvec.nr) + if (!pagevec_lookup(&pvec, mapping, &index)) break; - index = indices[pvec.nr - 1] + 1; - pagevec_remove_exceptionals(&pvec); check_move_unevictable_pages(&pvec); pagevec_release(&pvec); cond_resched(); From 8c647dd1e39573f23a4ca25c09f82716b70e702c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:33 -0800 Subject: [PATCH 003/118] mm/swap: optimise get_shadow_from_swap_cache There's no need to get a reference to the page, just load the entry and see if it's a shadow entry. Link: https://lkml.kernel.org/r/20201112212641.27837-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: William Kucharski Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index c1a648d9092b..f270c30d4681 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -87,11 +87,9 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) pgoff_t idx = swp_offset(entry); struct page *page; - page = find_get_entry(address_space, idx); + page = xa_load(&address_space->i_pages, idx); if (xa_is_value(page)) return page; - if (page) - put_page(page); return NULL; } From 44835d20b2a0c9b4c0c3fb96e90f4e2fd4a4e41d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:36 -0800 Subject: [PATCH 004/118] mm: add FGP_ENTRY The functionality of find_lock_entry() and find_get_entry() can be provided by pagecache_get_page(), which lets us delete find_lock_entry() and make find_get_entry() static. Link: https://lkml.kernel.org/r/20201112212641.27837-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: William Kucharski Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 1 + mm/filemap.c | 44 ++++++++--------------------------------- mm/internal.h | 3 --- mm/shmem.c | 3 ++- mm/swap_state.c | 3 ++- 5 files changed, 13 insertions(+), 41 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bd629d676a27..b379b2388202 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -315,6 +315,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_NOWAIT 0x00000020 #define FGP_FOR_MMAP 0x00000040 #define FGP_HEAD 0x00000080 +#define FGP_ENTRY 0x00000100 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, int fgp_flags, gfp_t cache_gfp_mask); diff --git a/mm/filemap.c b/mm/filemap.c index 57eae5163bce..84b7813badf1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1658,7 +1658,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, } EXPORT_SYMBOL(page_cache_prev_miss); -/** +/* * find_get_entry - find and get a page cache entry * @mapping: the address_space to search * @index: The page cache index. @@ -1671,7 +1671,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); * * Return: The head page or shadow entry, %NULL if nothing is found. */ -struct page *find_get_entry(struct address_space *mapping, pgoff_t index) +static struct page *find_get_entry(struct address_space *mapping, pgoff_t index) { XA_STATE(xas, &mapping->i_pages, index); struct page *page; @@ -1707,39 +1707,6 @@ out: return page; } -/** - * find_lock_entry - Locate and lock a page cache entry. - * @mapping: The address_space to search. - * @index: The page cache index. - * - * Looks up the page at @mapping & @index. If there is a page in the - * cache, the head page is returned locked and with an increased refcount. - * - * If the slot holds a shadow entry of a previously evicted page, or a - * swap entry from shmem/tmpfs, it is returned. - * - * Context: May sleep. - * Return: The head page or shadow entry, %NULL if nothing is found. - */ -struct page *find_lock_entry(struct address_space *mapping, pgoff_t index) -{ - struct page *page; - -repeat: - page = find_get_entry(mapping, index); - if (page && !xa_is_value(page)) { - lock_page(page); - /* Has the page been truncated? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto repeat; - } - VM_BUG_ON_PAGE(!thp_contains(page, index), page); - } - return page; -} - /** * pagecache_get_page - Find and get a reference to a page. * @mapping: The address_space to search. @@ -1755,6 +1722,8 @@ repeat: * * %FGP_LOCK - The page is returned locked. * * %FGP_HEAD - If the page is present and a THP, return the head page * rather than the exact page specified by the index. + * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it + * instead of allocating a new page to replace it. * * %FGP_CREAT - If no page is present then a new page is allocated using * @gfp_mask and added to the page cache and the VM's LRU list. * The page is returned locked and with an increased refcount. 
@@ -1779,8 +1748,11 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, repeat: page = find_get_entry(mapping, index); - if (xa_is_value(page)) + if (xa_is_value(page)) { + if (fgp_flags & FGP_ENTRY) + return page; page = NULL; + } if (!page) goto no_page; diff --git a/mm/internal.h b/mm/internal.h index 25d2b2439f19..eed74f1e6147 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -60,9 +60,6 @@ static inline void force_page_cache_readahead(struct address_space *mapping, force_page_cache_ra(&ractl, &file->f_ra, nr_to_read); } -struct page *find_get_entry(struct address_space *mapping, pgoff_t index); -struct page *find_lock_entry(struct address_space *mapping, pgoff_t index); - /** * page_evictable - test whether a page is evictable * @page: the page to test diff --git a/mm/shmem.c b/mm/shmem.c index 5ea1fa53db3f..bd5bb78128af 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1812,7 +1812,8 @@ repeat: sbinfo = SHMEM_SB(inode->i_sb); charge_mm = vma ? vma->vm_mm : current->mm; - page = find_lock_entry(mapping, index); + page = pagecache_get_page(mapping, index, + FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0); if (xa_is_value(page)) { error = shmem_swapin_page(inode, index, &page, sgp, gfp, vma, fault_type); diff --git a/mm/swap_state.c b/mm/swap_state.c index f270c30d4681..3cdee7b11da9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -403,7 +403,8 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) { swp_entry_t swp; struct swap_info_struct *si; - struct page *page = find_get_entry(mapping, index); + struct page *page = pagecache_get_page(mapping, index, + FGP_ENTRY | FGP_HEAD, 0); if (!page) return page; From bc5a301120f35caf0cd6cfdff7efa0fa779749c3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:40 -0800 Subject: [PATCH 005/118] mm/filemap: rename find_get_entry to mapping_get_entry find_get_entry doesn't "find" anything. It returns the entry at a particular index. Link: https://lkml.kernel.org/r/20201112212641.27837-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: William Kucharski Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 84b7813badf1..087308cf17ba 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1659,7 +1659,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, EXPORT_SYMBOL(page_cache_prev_miss); /* - * find_get_entry - find and get a page cache entry + * mapping_get_entry - Get a page cache entry. * @mapping: the address_space to search * @index: The page cache index. * @@ -1671,7 +1671,8 @@ EXPORT_SYMBOL(page_cache_prev_miss); * * Return: The head page or shadow entry, %NULL if nothing is found. 
*/ -static struct page *find_get_entry(struct address_space *mapping, pgoff_t index) +static struct page *mapping_get_entry(struct address_space *mapping, + pgoff_t index) { XA_STATE(xas, &mapping->i_pages, index); struct page *page; @@ -1747,7 +1748,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, struct page *page; repeat: - page = find_get_entry(mapping, index); + page = mapping_get_entry(mapping, index); if (xa_is_value(page)) { if (fgp_flags & FGP_ENTRY) return page; From c7bad633e6b749b2d64e2421cc9d4ee0d1540a8a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:44 -0800 Subject: [PATCH 006/118] mm/filemap: add helper for finding pages There is a lot of common code in find_get_entries(), find_get_pages_range() and find_get_pages_range_tag(). Factor out find_get_entry() which simplifies all three functions. [willy@infradead.org: remove VM_BUG_ON_PAGE()] Link: https://lkml.kernel.org/r/20201124041507.28996-2-willy@infradead.orgLink: https://lkml.kernel.org/r/20201112212641.27837-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 97 +++++++++++++++++++++++----------------------------- 1 file changed, 42 insertions(+), 55 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 087308cf17ba..21443850aeae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1825,6 +1825,42 @@ no_page: } EXPORT_SYMBOL(pagecache_get_page); +static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max, + xa_mark_t mark) +{ + struct page *page; + +retry: + if (mark == XA_PRESENT) + page = xas_find(xas, max); + else + page = xas_find_marked(xas, max, mark); + + if (xas_retry(xas, page)) + goto retry; + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + if (!page || xa_is_value(page)) + return page; + + if (!page_cache_get_speculative(page)) + goto reset; + + /* Has the page moved or been split? */ + if (unlikely(page != xas_reload(xas))) { + put_page(page); + goto reset; + } + + return page; +reset: + xas_reset(xas); + goto retry; +} + /** * find_get_entries - gang pagecache lookup * @mapping: The address_space to search @@ -1864,42 +1900,21 @@ unsigned find_get_entries(struct address_space *mapping, return 0; rcu_read_lock(); - xas_for_each(&xas, page, ULONG_MAX) { - if (xas_retry(&xas, page)) - continue; - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ - if (xa_is_value(page)) - goto export; - - if (!page_cache_get_speculative(page)) - goto retry; - - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) - goto put_page; - + while ((page = find_get_entry(&xas, ULONG_MAX, XA_PRESENT))) { /* * Terminate early on finding a THP, to allow the caller to * handle it all at once; but continue if this is hugetlbfs. 
*/ - if (PageTransHuge(page) && !PageHuge(page)) { + if (!xa_is_value(page) && PageTransHuge(page) && + !PageHuge(page)) { page = find_subpage(page, xas.xa_index); nr_entries = ret + 1; } -export: + indices[ret] = xas.xa_index; entries[ret] = page; if (++ret == nr_entries) break; - continue; -put_page: - put_page(page); -retry: - xas_reset(&xas); } rcu_read_unlock(); return ret; @@ -1938,30 +1953,16 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, return 0; rcu_read_lock(); - xas_for_each(&xas, page, end) { - if (xas_retry(&xas, page)) - continue; + while ((page = find_get_entry(&xas, end, XA_PRESENT))) { /* Skip over shadow, swap and DAX entries */ if (xa_is_value(page)) continue; - if (!page_cache_get_speculative(page)) - goto retry; - - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) - goto put_page; - pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *start = xas.xa_index + 1; goto out; } - continue; -put_page: - put_page(page); -retry: - xas_reset(&xas); } /* @@ -2061,9 +2062,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, return 0; rcu_read_lock(); - xas_for_each_marked(&xas, page, end, tag) { - if (xas_retry(&xas, page)) - continue; + while ((page = find_get_entry(&xas, end, tag))) { /* * Shadow entries should never be tagged, but this iteration * is lockless so there is a window for page reclaim to evict @@ -2072,23 +2071,11 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, if (xa_is_value(page)) continue; - if (!page_cache_get_speculative(page)) - goto retry; - - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) - goto put_page; - pages[ret] = page; if (++ret == nr_pages) { *index = page->index + thp_nr_pages(page); goto out; } - continue; -put_page: - put_page(page); -retry: - xas_reset(&xas); } /* From 41139aa4c3a31ee7e072fc63353c74035aade2ff Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:48 -0800 Subject: [PATCH 007/118] mm/filemap: add mapping_seek_hole_data Rewrite shmem_seek_hole_data() and move it to filemap.c. [willy@infradead.org: don't put an xa_is_value() page] Link: https://lkml.kernel.org/r/20201124041507.28996-4-willy@infradead.org Link: https://lkml.kernel.org/r/20201112212641.27837-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 ++ mm/filemap.c | 76 +++++++++++++++++++++++++++++++++++++++++ mm/shmem.c | 74 +++------------------------------------ 3 files changed, 82 insertions(+), 70 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b379b2388202..3608993428d9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -760,6 +760,8 @@ extern void __delete_from_page_cache(struct page *page, void *shadow); void replace_page_cache_page(struct page *old, struct page *new); void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec); +loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end, + int whence); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: diff --git a/mm/filemap.c b/mm/filemap.c index 21443850aeae..eff3006be12a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2553,6 +2553,82 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) } EXPORT_SYMBOL(generic_file_read_iter); +static inline bool page_seek_match(struct page *page, bool seek_data) +{ + if (xa_is_value(page) || PageUptodate(page)) + return seek_data; + return !seek_data; +} + +static inline +unsigned int seek_page_size(struct xa_state *xas, struct page *page) +{ + if (xa_is_value(page)) + return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index); + return thp_size(page); +} + +/** + * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache. + * @mapping: Address space to search. + * @start: First byte to consider. + * @end: Limit of search (exclusive). + * @whence: Either SEEK_HOLE or SEEK_DATA. + * + * If the page cache knows which blocks contain holes and which blocks + * contain data, your filesystem can use this function to implement + * SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are + * entirely memory-based such as tmpfs, and filesystems which support + * unwritten extents. + * + * Return: The requested offset on successs, or -ENXIO if @whence specifies + * SEEK_DATA and there is no data after @start. There is an implicit hole + * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start + * and @end contain data. + */ +loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, + loff_t end, int whence) +{ + XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); + pgoff_t max = (end - 1) / PAGE_SIZE; + bool seek_data = (whence == SEEK_DATA); + struct page *page; + + if (end <= start) + return -ENXIO; + + rcu_read_lock(); + while ((page = find_get_entry(&xas, max, XA_PRESENT))) { + loff_t pos = xas.xa_index * PAGE_SIZE; + + if (start < pos) { + if (!seek_data) + goto unlock; + start = pos; + } + + if (page_seek_match(page, seek_data)) + goto unlock; + start = pos + seek_page_size(&xas, page); + if (!xa_is_value(page)) + put_page(page); + } + rcu_read_unlock(); + + if (seek_data) + return -ENXIO; + goto out; + +unlock: + rcu_read_unlock(); + if (!xa_is_value(page)) + put_page(page); +out: + if (start > end) + return end; + return start; +} + #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) /* diff --git a/mm/shmem.c b/mm/shmem.c index bd5bb78128af..deb22e128435 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2668,86 +2668,20 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return retval ? retval : error; } -/* - * llseek SEEK_DATA or SEEK_HOLE through the page cache. 
- */ -static pgoff_t shmem_seek_hole_data(struct address_space *mapping, - pgoff_t index, pgoff_t end, int whence) -{ - struct page *page; - struct pagevec pvec; - pgoff_t indices[PAGEVEC_SIZE]; - bool done = false; - int i; - - pagevec_init(&pvec); - pvec.nr = 1; /* start small: we may be there already */ - while (!done) { - pvec.nr = find_get_entries(mapping, index, - pvec.nr, pvec.pages, indices); - if (!pvec.nr) { - if (whence == SEEK_DATA) - index = end; - break; - } - for (i = 0; i < pvec.nr; i++, index++) { - if (index < indices[i]) { - if (whence == SEEK_HOLE) { - done = true; - break; - } - index = indices[i]; - } - page = pvec.pages[i]; - if (page && !xa_is_value(page)) { - if (!PageUptodate(page)) - page = NULL; - } - if (index >= end || - (page && whence == SEEK_DATA) || - (!page && whence == SEEK_HOLE)) { - done = true; - break; - } - } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - pvec.nr = PAGEVEC_SIZE; - cond_resched(); - } - return index; -} - static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - pgoff_t start, end; - loff_t new_offset; if (whence != SEEK_DATA && whence != SEEK_HOLE) return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); + if (offset < 0) + return -ENXIO; + inode_lock(inode); /* We're holding i_mutex so we can access i_size directly */ - - if (offset < 0 || offset >= inode->i_size) - offset = -ENXIO; - else { - start = offset >> PAGE_SHIFT; - end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - new_offset = shmem_seek_hole_data(mapping, start, end, whence); - new_offset <<= PAGE_SHIFT; - if (new_offset > offset) { - if (new_offset < inode->i_size) - offset = new_offset; - else if (whence == SEEK_DATA) - offset = -ENXIO; - else - offset = inode->i_size; - } - } - + offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); if (offset >= 0) offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); inode_unlock(inode); From 54fa39ac2e00b1b8c2a7fe72e648773ffa48f76d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:52 -0800 Subject: [PATCH 008/118] iomap: use mapping_seek_hole_data Enhance mapping_seek_hole_data() to handle partially uptodate pages and convert the iomap seek code to call it. Link: https://lkml.kernel.org/r/20201112212641.27837-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: William Kucharski Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/iomap/seek.c | 125 +++++------------------------------------------- mm/filemap.c | 37 ++++++++++++-- 2 files changed, 43 insertions(+), 119 deletions(-) diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index 107ee80c3568..dab1b02eba5b 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -10,122 +10,17 @@ #include #include -/* - * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. - * Returns true if found and updates @lastoff to the offset in file. 
- */ -static bool -page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff, - int whence) -{ - const struct address_space_operations *ops = inode->i_mapping->a_ops; - unsigned int bsize = i_blocksize(inode), off; - bool seek_data = whence == SEEK_DATA; - loff_t poff = page_offset(page); - - if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE)) - return false; - - if (*lastoff < poff) { - /* - * Last offset smaller than the start of the page means we found - * a hole: - */ - if (whence == SEEK_HOLE) - return true; - *lastoff = poff; - } - - /* - * Just check the page unless we can and should check block ranges: - */ - if (bsize == PAGE_SIZE || !ops->is_partially_uptodate) - return PageUptodate(page) == seek_data; - - lock_page(page); - if (unlikely(page->mapping != inode->i_mapping)) - goto out_unlock_not_found; - - for (off = 0; off < PAGE_SIZE; off += bsize) { - if (offset_in_page(*lastoff) >= off + bsize) - continue; - if (ops->is_partially_uptodate(page, off, bsize) == seek_data) { - unlock_page(page); - return true; - } - *lastoff = poff + off + bsize; - } - -out_unlock_not_found: - unlock_page(page); - return false; -} - -/* - * Seek for SEEK_DATA / SEEK_HOLE in the page cache. - * - * Within unwritten extents, the page cache determines which parts are holes - * and which are data: uptodate buffer heads count as data; everything else - * counts as a hole. - * - * Returns the resulting offset on successs, and -ENOENT otherwise. - */ static loff_t -page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, - int whence) -{ - pgoff_t index = offset >> PAGE_SHIFT; - pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); - loff_t lastoff = offset; - struct pagevec pvec; - - if (length <= 0) - return -ENOENT; - - pagevec_init(&pvec); - - do { - unsigned nr_pages, i; - - nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, - end - 1); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - if (page_seek_hole_data(inode, page, &lastoff, whence)) - goto check_range; - lastoff = page_offset(page) + PAGE_SIZE; - } - pagevec_release(&pvec); - } while (index < end); - - /* When no page at lastoff and we are not done, we found a hole. 
*/ - if (whence != SEEK_HOLE) - goto not_found; - -check_range: - if (lastoff < offset + length) - goto out; -not_found: - lastoff = -ENOENT; -out: - pagevec_release(&pvec); - return lastoff; -} - - -static loff_t -iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, +iomap_seek_hole_actor(struct inode *inode, loff_t start, loff_t length, void *data, struct iomap *iomap, struct iomap *srcmap) { + loff_t offset = start; + switch (iomap->type) { case IOMAP_UNWRITTEN: - offset = page_cache_seek_hole_data(inode, offset, length, - SEEK_HOLE); - if (offset < 0) + offset = mapping_seek_hole_data(inode->i_mapping, start, + start + length, SEEK_HOLE); + if (offset == start + length) return length; fallthrough; case IOMAP_HOLE: @@ -164,15 +59,17 @@ iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops) EXPORT_SYMBOL_GPL(iomap_seek_hole); static loff_t -iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length, +iomap_seek_data_actor(struct inode *inode, loff_t start, loff_t length, void *data, struct iomap *iomap, struct iomap *srcmap) { + loff_t offset = start; + switch (iomap->type) { case IOMAP_HOLE: return length; case IOMAP_UNWRITTEN: - offset = page_cache_seek_hole_data(inode, offset, length, - SEEK_DATA); + offset = mapping_seek_hole_data(inode->i_mapping, start, + start + length, SEEK_DATA); if (offset < 0) return length; fallthrough; diff --git a/mm/filemap.c b/mm/filemap.c index eff3006be12a..6a34f94adf3b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2553,11 +2553,36 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) } EXPORT_SYMBOL(generic_file_read_iter); -static inline bool page_seek_match(struct page *page, bool seek_data) +static inline loff_t page_seek_hole_data(struct xa_state *xas, + struct address_space *mapping, struct page *page, + loff_t start, loff_t end, bool seek_data) { + const struct address_space_operations *ops = mapping->a_ops; + size_t offset, bsz = i_blocksize(mapping->host); + if (xa_is_value(page) || PageUptodate(page)) - return seek_data; - return !seek_data; + return seek_data ? start : end; + if (!ops->is_partially_uptodate) + return seek_data ? end : start; + + xas_pause(xas); + rcu_read_unlock(); + lock_page(page); + if (unlikely(page->mapping != mapping)) + goto unlock; + + offset = offset_in_thp(page, start) & ~(bsz - 1); + + do { + if (ops->is_partially_uptodate(page, offset, bsz) == seek_data) + break; + start = (start + bsz) & ~(bsz - 1); + offset += bsz; + } while (offset < thp_size(page)); +unlock: + unlock_page(page); + rcu_read_lock(); + return start; } static inline @@ -2607,9 +2632,11 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, start = pos; } - if (page_seek_match(page, seek_data)) + pos += seek_page_size(&xas, page); + start = page_seek_hole_data(&xas, mapping, page, start, pos, + seek_data); + if (start < pos) goto unlock; - start = pos + seek_page_size(&xas, page); if (!xa_is_value(page)) put_page(page); } From 5c211ba29deb84e647b3a87207c8714efd9c11d5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:56 -0800 Subject: [PATCH 009/118] mm: add and use find_lock_entries We have three functions (shmem_undo_range(), truncate_inode_pages_range() and invalidate_mapping_pages()) which want exactly this function, so add it to filemap.c. 
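A caller loop then looks roughly like this (a sketch of the converted callers in the hunks below, not a literal excerpt):

	while (index < end && find_lock_entries(mapping, index, end - 1,
						&pvec, indices)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* value entries are returned unlocked, unreferenced */
			if (xa_is_value(page))
				continue;
			/* otherwise: locked, refcounted head page */
			...
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
	}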
Before this patch, shmem_undo_range() would split any compound page which overlaps either end of the range being punched in both the first and second loops through the address space. After this patch, that functionality is left for the second loop, which is arguably more appropriate since the first loop is supposed to run through all the pages quickly, and splitting a page can sleep. [willy@infradead.org: add assertion] Link: https://lkml.kernel.org/r/20201124041507.28996-3-willy@infradead.org Link: https://lkml.kernel.org/r/20201112212641.27837-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 59 +++++++++++++++++++++++++++++++++ mm/internal.h | 3 ++ mm/shmem.c | 22 +++---------- mm/truncate.c | 91 +++++++-------------------------------------------- 4 files changed, 78 insertions(+), 97 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 6a34f94adf3b..61fdcdc75275 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1920,6 +1920,65 @@ unsigned find_get_entries(struct address_space *mapping, return ret; } +/** + * find_lock_entries - Find a batch of pagecache entries. + * @mapping: The address_space to search. + * @start: The starting page cache index. + * @end: The final page index (inclusive). + * @pvec: Where the resulting entries are placed. + * @indices: The cache indices of the entries in @pvec. + * + * find_lock_entries() will return a batch of entries from @mapping. + * Swap, shadow and DAX entries are included. Pages are returned + * locked and with an incremented refcount. Pages which are locked by + * somebody else or under writeback are skipped. Only the head page of + * a THP is returned. Pages which are partially outside the range are + * not returned. + * + * The entries have ascending indexes. The indices may not be consecutive + * due to not-present entries, THP pages, pages which could not be locked + * or pages under writeback. + * + * Return: The number of entries which were found. 
+ */ +unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct pagevec *pvec, pgoff_t *indices) +{ + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; + + rcu_read_lock(); + while ((page = find_get_entry(&xas, end, XA_PRESENT))) { + if (!xa_is_value(page)) { + if (page->index < start) + goto put; + VM_BUG_ON_PAGE(page->index != xas.xa_index, page); + if (page->index + thp_nr_pages(page) - 1 > end) + goto put; + if (!trylock_page(page)) + goto put; + if (page->mapping != mapping || PageWriteback(page)) + goto unlock; + VM_BUG_ON_PAGE(!thp_contains(page, xas.xa_index), + page); + } + indices[pvec->nr] = xas.xa_index; + if (!pagevec_add(pvec, page)) + break; + goto next; +unlock: + unlock_page(page); +put: + put_page(page); +next: + if (!xa_is_value(page) && PageTransHuge(page)) + xas_set(&xas, page->index + thp_nr_pages(page)); + } + rcu_read_unlock(); + + return pagevec_count(pvec); +} + /** * find_get_pages_range - gang pagecache lookup * @mapping: The address_space to search diff --git a/mm/internal.h b/mm/internal.h index eed74f1e6147..9902648f2206 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -60,6 +60,9 @@ static inline void force_page_cache_readahead(struct address_space *mapping, force_page_cache_ra(&ractl, &file->f_ra, nr_to_read); } +unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct pagevec *pvec, pgoff_t *indices); + /** * page_evictable - test whether a page is evictable * @page: the page to test diff --git a/mm/shmem.c b/mm/shmem.c index deb22e128435..86b1f5bc502c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -907,12 +907,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, pagevec_init(&pvec); index = start; - while (index < end) { - pvec.nr = find_get_entries(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); - if (!pvec.nr) - break; + while (index < end && find_lock_entries(mapping, index, end - 1, + &pvec, indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -927,18 +923,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, index, page); continue; } + index += thp_nr_pages(page) - 1; - VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page); - - if (!trylock_page(page)) - continue; - - if ((!unfalloc || !PageUptodate(page)) && - page_mapping(page) == mapping) { - VM_BUG_ON_PAGE(PageWriteback(page), page); - if (shmem_punch_compound(page, start, end)) - truncate_inode_page(mapping, page); - } + if (!unfalloc || !PageUptodate(page)) + truncate_inode_page(mapping, page); unlock_page(page); } pagevec_remove_exceptionals(&pvec); diff --git a/mm/truncate.c b/mm/truncate.c index 8aa4907e06e0..de7f4f47f780 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -326,51 +326,19 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_init(&pvec); index = start; - while (index < end && pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - indices)) { - /* - * Pagevec array has exceptional entries and we may also fail - * to lock some pages. So we store pages that can be deleted - * in a new pagevec. 
- */ - struct pagevec locked_pvec; - - pagevec_init(&locked_pvec); - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - - /* We rely upon deletion not changing page->index */ - index = indices[i]; - if (index >= end) - break; - - if (xa_is_value(page)) - continue; - - if (!trylock_page(page)) - continue; - WARN_ON(page_to_index(page) != index); - if (PageWriteback(page)) { - unlock_page(page); - continue; - } - if (page->mapping != mapping) { - unlock_page(page); - continue; - } - pagevec_add(&locked_pvec, page); - } - for (i = 0; i < pagevec_count(&locked_pvec); i++) - truncate_cleanup_page(mapping, locked_pvec.pages[i]); - delete_from_page_cache_batch(mapping, &locked_pvec); - for (i = 0; i < pagevec_count(&locked_pvec); i++) - unlock_page(locked_pvec.pages[i]); + while (index < end && find_lock_entries(mapping, index, end - 1, + &pvec, indices)) { + index = indices[pagevec_count(&pvec) - 1] + 1; truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); + for (i = 0; i < pagevec_count(&pvec); i++) + truncate_cleanup_page(mapping, pvec.pages[i]); + delete_from_page_cache_batch(mapping, &pvec); + for (i = 0; i < pagevec_count(&pvec); i++) + unlock_page(pvec.pages[i]); pagevec_release(&pvec); cond_resched(); - index++; } + if (partial_start) { struct page *page = find_lock_page(mapping, start - 1); if (page) { @@ -539,9 +507,7 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, int i; pagevec_init(&pvec); - while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, - indices)) { + while (find_lock_entries(mapping, index, end, &pvec, indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -555,39 +521,7 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, page); continue; } - - if (!trylock_page(page)) - continue; - - WARN_ON(page_to_index(page) != index); - - /* Middle of THP: skip */ - if (PageTransTail(page)) { - unlock_page(page); - continue; - } else if (PageTransHuge(page)) { - index += HPAGE_PMD_NR - 1; - i += HPAGE_PMD_NR - 1; - /* - * 'end' is in the middle of THP. Don't - * invalidate the page as the part outside of - * 'end' could be still useful. - */ - if (index > end) { - unlock_page(page); - continue; - } - - /* Take a pin outside pagevec */ - get_page(page); - - /* - * Drop extra pins before trying to invalidate - * the huge page. - */ - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - } + index += thp_nr_pages(page) - 1; ret = invalidate_inode_page(page); unlock_page(page); @@ -601,9 +535,6 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, if (nr_pagevec) (*nr_pagevec)++; } - - if (PageTransHuge(page)) - put_page(page); count += ret; } pagevec_remove_exceptionals(&pvec); From ca122fe40eb463c8c11c3bfc1914f0048ca5c268 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:16:00 -0800 Subject: [PATCH 010/118] mm: add an 'end' parameter to find_get_entries This simplifies the callers and leads to a more efficient implementation since the XArray has this functionality already. Link: https://lkml.kernel.org/r/20201112212641.27837-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 4 ++-- mm/filemap.c | 9 +++++---- mm/shmem.c | 10 ++-------- mm/swap.c | 2 +- 4 files changed, 10 insertions(+), 15 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 3608993428d9..fdb2c4e44851 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -451,8 +451,8 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index) } unsigned find_get_entries(struct address_space *mapping, pgoff_t start, - unsigned int nr_entries, struct page **entries, - pgoff_t *indices); + pgoff_t end, unsigned int nr_entries, struct page **entries, + pgoff_t *indices); unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pgoff_t end, unsigned int nr_pages, struct page **pages); diff --git a/mm/filemap.c b/mm/filemap.c index 61fdcdc75275..65cfdff17ac6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1865,6 +1865,7 @@ reset: * find_get_entries - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page cache index + * @end: The final page index (inclusive). * @nr_entries: The maximum number of entries * @entries: Where the resulting entries are placed * @indices: The cache indices corresponding to the entries in @entries @@ -1888,9 +1889,9 @@ reset: * * Return: the number of pages and shadow entries which were found. */ -unsigned find_get_entries(struct address_space *mapping, - pgoff_t start, unsigned int nr_entries, - struct page **entries, pgoff_t *indices) +unsigned find_get_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, unsigned int nr_entries, struct page **entries, + pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, start); struct page *page; @@ -1900,7 +1901,7 @@ unsigned find_get_entries(struct address_space *mapping, return 0; rcu_read_lock(); - while ((page = find_get_entry(&xas, ULONG_MAX, XA_PRESENT))) { + while ((page = find_get_entry(&xas, end, XA_PRESENT))) { /* * Terminate early on finding a THP, to allow the caller to * handle it all at once; but continue if this is hugetlbfs. 
diff --git a/mm/shmem.c b/mm/shmem.c index 86b1f5bc502c..4aac760aa2d4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -913,8 +913,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, struct page *page = pvec.pages[i]; index = indices[i]; - if (index >= end) - break; if (xa_is_value(page)) { if (unfalloc) @@ -967,9 +965,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, while (index < end) { cond_resched(); - pvec.nr = find_get_entries(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); + pvec.nr = find_get_entries(mapping, index, end - 1, + PAGEVEC_SIZE, pvec.pages, indices); if (!pvec.nr) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) @@ -982,9 +979,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, struct page *page = pvec.pages[i]; index = indices[i]; - if (index >= end) - break; - if (xa_is_value(page)) { if (unfalloc) continue; diff --git a/mm/swap.c b/mm/swap.c index ab3258afcbeb..c5773a84feab 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1046,7 +1046,7 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec, pgoff_t start, unsigned nr_entries, pgoff_t *indices) { - pvec->nr = find_get_entries(mapping, start, nr_entries, + pvec->nr = find_get_entries(mapping, start, ULONG_MAX, nr_entries, pvec->pages, indices); return pagevec_count(pvec); } From 31d270fd98d196578223e5b568a0bd3bc6028b09 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:16:03 -0800 Subject: [PATCH 011/118] mm: add an 'end' parameter to pagevec_lookup_entries Simplifies the callers and uses the existing functionality in find_get_entries(). We can also drop the final argument of truncate_exceptional_pvec_entries() and simplify the logic in that function. Link: https://lkml.kernel.org/r/20201112212641.27837-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 5 ++--- mm/swap.c | 8 ++++---- mm/truncate.c | 41 ++++++++++------------------------------- 3 files changed, 16 insertions(+), 38 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index ad4ddc17d403..f70a9dc81504 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -26,9 +26,8 @@ struct pagevec { void __pagevec_release(struct pagevec *pvec); void __pagevec_lru_add(struct pagevec *pvec); unsigned pagevec_lookup_entries(struct pagevec *pvec, - struct address_space *mapping, - pgoff_t start, unsigned nr_entries, - pgoff_t *indices); + struct address_space *mapping, pgoff_t start, pgoff_t end, + unsigned nr_entries, pgoff_t *indices); void pagevec_remove_exceptionals(struct pagevec *pvec); unsigned pagevec_lookup_range(struct pagevec *pvec, struct address_space *mapping, diff --git a/mm/swap.c b/mm/swap.c index c5773a84feab..db8c354264a5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1022,6 +1022,7 @@ void __pagevec_lru_add(struct pagevec *pvec) * @pvec: Where the resulting entries are placed * @mapping: The address_space to search * @start: The starting entry index + * @end: The highest index to return (inclusive). 
* @nr_entries: The maximum number of pages * @indices: The cache indices corresponding to the entries in @pvec * @@ -1042,11 +1043,10 @@ void __pagevec_lru_add(struct pagevec *pvec) * found. */ unsigned pagevec_lookup_entries(struct pagevec *pvec, - struct address_space *mapping, - pgoff_t start, unsigned nr_entries, - pgoff_t *indices) + struct address_space *mapping, pgoff_t start, pgoff_t end, + unsigned nr_entries, pgoff_t *indices) { - pvec->nr = find_get_entries(mapping, start, ULONG_MAX, nr_entries, + pvec->nr = find_get_entries(mapping, start, end, nr_entries, pvec->pages, indices); return pagevec_count(pvec); } diff --git a/mm/truncate.c b/mm/truncate.c index de7f4f47f780..60df23890c2d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -57,11 +57,10 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, * exceptional entries similar to what pagevec_remove_exceptionals does. */ static void truncate_exceptional_pvec_entries(struct address_space *mapping, - struct pagevec *pvec, pgoff_t *indices, - pgoff_t end) + struct pagevec *pvec, pgoff_t *indices) { int i, j; - bool dax, lock; + bool dax; /* Handled by shmem itself */ if (shmem_mapping(mapping)) @@ -75,8 +74,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, return; dax = dax_mapping(mapping); - lock = !dax && indices[j] < end; - if (lock) + if (!dax) xa_lock_irq(&mapping->i_pages); for (i = j; i < pagevec_count(pvec); i++) { @@ -88,9 +86,6 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, continue; } - if (index >= end) - continue; - if (unlikely(dax)) { dax_delete_mapping_entry(mapping, index); continue; @@ -99,7 +94,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, __clear_shadow_entry(mapping, index, page); } - if (lock) + if (!dax) xa_unlock_irq(&mapping->i_pages); pvec->nr = j; } @@ -329,7 +324,7 @@ void truncate_inode_pages_range(struct address_space *mapping, while (index < end && find_lock_entries(mapping, index, end - 1, &pvec, indices)) { index = indices[pagevec_count(&pvec) - 1] + 1; - truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); + truncate_exceptional_pvec_entries(mapping, &pvec, indices); for (i = 0; i < pagevec_count(&pvec); i++) truncate_cleanup_page(mapping, pvec.pages[i]); delete_from_page_cache_batch(mapping, &pvec); @@ -381,8 +376,8 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { + if (!pagevec_lookup_entries(&pvec, mapping, index, end - 1, + PAGEVEC_SIZE, indices)) { /* If all gone from start onwards, we're done */ if (index == start) break; @@ -390,23 +385,12 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; continue; } - if (index == start && indices[0] >= end) { - /* All gone out of hole to be punched, we're done */ - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - break; - } for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; - if (index >= end) { - /* Restart punch to make sure all gone */ - index = start - 1; - break; - } if (xa_is_value(page)) continue; @@ -417,7 +401,7 @@ void truncate_inode_pages_range(struct address_space *mapping, truncate_inode_page(mapping, page); unlock_page(page); } - truncate_exceptional_pvec_entries(mapping, &pvec, indices, 
end); + truncate_exceptional_pvec_entries(mapping, &pvec, indices); pagevec_release(&pvec); index++; } @@ -513,8 +497,6 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, /* We rely upon deletion not changing page->index */ index = indices[i]; - if (index > end) - break; if (xa_is_value(page)) { invalidate_exceptional_entry(mapping, index, @@ -656,16 +638,13 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pagevec_init(&pvec); index = start; - while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, - indices)) { + while (pagevec_lookup_entries(&pvec, mapping, index, end, + PAGEVEC_SIZE, indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; - if (index > end) - break; if (xa_is_value(page)) { if (!invalidate_exceptional_entry2(mapping, From 38cefeb33749992ceaad6ea40e12f92aa8f8e28f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:16:07 -0800 Subject: [PATCH 012/118] mm: remove nr_entries parameter from pagevec_lookup_entries All callers want to fetch the full size of the pvec. Link: https://lkml.kernel.org/r/20201112212641.27837-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 2 +- mm/swap.c | 4 ++-- mm/truncate.c | 5 ++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index f70a9dc81504..72c5ea2e708d 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -27,7 +27,7 @@ void __pagevec_release(struct pagevec *pvec); void __pagevec_lru_add(struct pagevec *pvec); unsigned pagevec_lookup_entries(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, pgoff_t end, - unsigned nr_entries, pgoff_t *indices); + pgoff_t *indices); void pagevec_remove_exceptionals(struct pagevec *pvec); unsigned pagevec_lookup_range(struct pagevec *pvec, struct address_space *mapping, diff --git a/mm/swap.c b/mm/swap.c index db8c354264a5..cd9e1ed7e78f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1044,9 +1044,9 @@ void __pagevec_lru_add(struct pagevec *pvec) */ unsigned pagevec_lookup_entries(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, pgoff_t end, - unsigned nr_entries, pgoff_t *indices) + pgoff_t *indices) { - pvec->nr = find_get_entries(mapping, start, end, nr_entries, + pvec->nr = find_get_entries(mapping, start, end, PAGEVEC_SIZE, pvec->pages, indices); return pagevec_count(pvec); } diff --git a/mm/truncate.c b/mm/truncate.c index 60df23890c2d..41e7377ad58d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -377,7 +377,7 @@ void truncate_inode_pages_range(struct address_space *mapping, for ( ; ; ) { cond_resched(); if (!pagevec_lookup_entries(&pvec, mapping, index, end - 1, - PAGEVEC_SIZE, indices)) { + indices)) { /* If all gone from start onwards, we're done */ if (index == start) break; @@ -638,8 +638,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pagevec_init(&pvec); index = start; - while (pagevec_lookup_entries(&pvec, mapping, index, end, - PAGEVEC_SIZE, indices)) { + while (pagevec_lookup_entries(&pvec, mapping, index, end, indices)) { for (i = 0; i 
< pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; From cf2039af1a2eee58fdbfa68bc0c9123e77477645 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:16:11 -0800 Subject: [PATCH 013/118] mm: pass pvec directly to find_get_entries All callers of find_get_entries() use a pvec, so pass it directly instead of manipulating it in the caller. Link: https://lkml.kernel.org/r/20201112212641.27837-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Cc: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 +-- mm/filemap.c | 21 +++++++++------------ mm/shmem.c | 5 ++--- mm/swap.c | 4 +--- 4 files changed, 13 insertions(+), 20 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fdb2c4e44851..20225b067583 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -451,8 +451,7 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index) } unsigned find_get_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, unsigned int nr_entries, struct page **entries, - pgoff_t *indices); + pgoff_t end, struct pagevec *pvec, pgoff_t *indices); unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pgoff_t end, unsigned int nr_pages, struct page **pages); diff --git a/mm/filemap.c b/mm/filemap.c index 65cfdff17ac6..43700480d897 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1866,14 +1866,12 @@ reset: * @mapping: The address_space to search * @start: The starting page cache index * @end: The final page index (inclusive). - * @nr_entries: The maximum number of entries - * @entries: Where the resulting entries are placed + * @pvec: Where the resulting entries are placed. * @indices: The cache indices corresponding to the entries in @entries * - * find_get_entries() will search for and return a group of up to - * @nr_entries entries in the mapping. The entries are placed at - * @entries. find_get_entries() takes a reference against any actual - * pages it returns. + * find_get_entries() will search for and return a batch of entries in + * the mapping. The entries are placed in @pvec. find_get_entries() + * takes a reference on any actual pages it returns. * * The search returns a group of mapping-contiguous page cache entries * with ascending indexes. There may be holes in the indices due to @@ -1890,15 +1888,12 @@ reset: * Return: the number of pages and shadow entries which were found. 
*/ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, unsigned int nr_entries, struct page **entries, - pgoff_t *indices) + pgoff_t end, struct pagevec *pvec, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, start); struct page *page; unsigned int ret = 0; - - if (!nr_entries) - return 0; + unsigned nr_entries = PAGEVEC_SIZE; rcu_read_lock(); while ((page = find_get_entry(&xas, end, XA_PRESENT))) { @@ -1913,11 +1908,13 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, } indices[ret] = xas.xa_index; - entries[ret] = page; + pvec->pages[ret] = page; if (++ret == nr_entries) break; } rcu_read_unlock(); + + pvec->nr = ret; return ret; } diff --git a/mm/shmem.c b/mm/shmem.c index 4aac760aa2d4..ee8f21832f98 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -965,9 +965,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, while (index < end) { cond_resched(); - pvec.nr = find_get_entries(mapping, index, end - 1, - PAGEVEC_SIZE, pvec.pages, indices); - if (!pvec.nr) { + if (!find_get_entries(mapping, index, end - 1, &pvec, + indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) break; diff --git a/mm/swap.c b/mm/swap.c index cd9e1ed7e78f..d20a746a831e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1046,9 +1046,7 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, pgoff_t end, pgoff_t *indices) { - pvec->nr = find_get_entries(mapping, start, end, PAGEVEC_SIZE, - pvec->pages, indices); - return pagevec_count(pvec); + return find_get_entries(mapping, start, end, pvec, indices); } /** From a656a20241f08be532539c7d5bd82df741c2d487 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:16:14 -0800 Subject: [PATCH 014/118] mm: remove pagevec_lookup_entries pagevec_lookup_entries() is now just a wrapper around find_get_entries() so remove it and convert all its callers. Link: https://lkml.kernel.org/r/20201112212641.27837-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 3 --- mm/swap.c | 36 ++---------------------------------- mm/truncate.c | 4 ++-- 3 files changed, 4 insertions(+), 39 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 72c5ea2e708d..7f3f19065a9f 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -25,9 +25,6 @@ struct pagevec { void __pagevec_release(struct pagevec *pvec); void __pagevec_lru_add(struct pagevec *pvec); -unsigned pagevec_lookup_entries(struct pagevec *pvec, - struct address_space *mapping, pgoff_t start, pgoff_t end, - pgoff_t *indices); void pagevec_remove_exceptionals(struct pagevec *pvec); unsigned pagevec_lookup_range(struct pagevec *pvec, struct address_space *mapping, diff --git a/mm/swap.c b/mm/swap.c index d20a746a831e..31b844d4ed94 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1017,44 +1017,12 @@ void __pagevec_lru_add(struct pagevec *pvec) pagevec_reinit(pvec); } -/** - * pagevec_lookup_entries - gang pagecache lookup - * @pvec: Where the resulting entries are placed - * @mapping: The address_space to search - * @start: The starting entry index - * @end: The highest index to return (inclusive). 
- * @nr_entries: The maximum number of pages - * @indices: The cache indices corresponding to the entries in @pvec - * - * pagevec_lookup_entries() will search for and return a group of up - * to @nr_pages pages and shadow entries in the mapping. All - * entries are placed in @pvec. pagevec_lookup_entries() takes a - * reference against actual pages in @pvec. - * - * The search returns a group of mapping-contiguous entries with - * ascending indexes. There may be holes in the indices due to - * not-present entries. - * - * Only one subpage of a Transparent Huge Page is returned in one call: - * allowing truncate_inode_pages_range() to evict the whole THP without - * cycling through a pagevec of extra references. - * - * pagevec_lookup_entries() returns the number of entries which were - * found. - */ -unsigned pagevec_lookup_entries(struct pagevec *pvec, - struct address_space *mapping, pgoff_t start, pgoff_t end, - pgoff_t *indices) -{ - return find_get_entries(mapping, start, end, pvec, indices); -} - /** * pagevec_remove_exceptionals - pagevec exceptionals pruning * @pvec: The pagevec to prune * - * pagevec_lookup_entries() fills both pages and exceptional radix - * tree entries into the pagevec. This function prunes all + * find_get_entries() fills both pages and XArray value entries (aka + * exceptional entries) into the pagevec. This function prunes all * exceptionals from @pvec without leaving holes, so that it can be * passed on to page-only pagevec operations. */ diff --git a/mm/truncate.c b/mm/truncate.c index 41e7377ad58d..455944264663 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -376,7 +376,7 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup_entries(&pvec, mapping, index, end - 1, + if (!find_get_entries(mapping, index, end - 1, &pvec, indices)) { /* If all gone from start onwards, we're done */ if (index == start) @@ -638,7 +638,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pagevec_init(&pvec); index = start; - while (pagevec_lookup_entries(&pvec, mapping, index, end, indices)) { + while (find_get_entries(mapping, index, end, &pvec, indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; From 164cc4fef4456727466f8e35bb654c3994748070 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 25 Feb 2021 17:16:18 -0800 Subject: [PATCH 015/118] mm,thp,shmem: limit shmem THP alloc gfp_mask Patch series "mm,thp,shm: limit shmem THP alloc gfp_mask", v6. The allocation flags of anonymous transparent huge pages can be controlled through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can help the system from getting bogged down in the page reclaim and compaction code when many THPs are getting allocated simultaneously. However, the gfp_mask for shmem THP allocations were not limited by those configuration settings, and some workloads ended up with all CPUs stuck on the LRU lock in the page reclaim code, trying to allocate dozens of THPs simultaneously. This patch applies the same configurated limitation of THPs to shmem hugepage allocations, to prevent that from happening. This way a THP defrag setting of "never" or "defer+madvise" will result in quick allocation failures without direct reclaim when no 2MB free pages are available. 
With this patch applied, THP allocations for tmpfs will be a little more aggressive than today for files mmapped with MADV_HUGEPAGE, and a little less aggressive for files that are not mmapped or mapped without that flag. This patch (of 4): The allocation flags of anonymous transparent huge pages can be controlled through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can help the system from getting bogged down in the page reclaim and compaction code when many THPs are getting allocated simultaneously. However, the gfp_mask for shmem THP allocations were not limited by those configuration settings, and some workloads ended up with all CPUs stuck on the LRU lock in the page reclaim code, trying to allocate dozens of THPs simultaneously. This patch applies the same configurated limitation of THPs to shmem hugepage allocations, to prevent that from happening. Controlling the gfp_mask of THP allocations through the knobs in sysfs allows users to determine the balance between how aggressively the system tries to allocate THPs at fault time, and how much the application may end up stalling attempting those allocations. This way a THP defrag setting of "never" or "defer+madvise" will result in quick allocation failures without direct reclaim when no 2MB free pages are available. With this patch applied, THP allocations for tmpfs will be a little more aggressive than today for files mmapped with MADV_HUGEPAGE, and a little less aggressive for files that are not mmapped or mapped without that flag. Link: https://lkml.kernel.org/r/20201124194925.623931-1-riel@surriel.com Link: https://lkml.kernel.org/r/20201124194925.623931-2-riel@surriel.com Signed-off-by: Rik van Riel Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Xu Yu Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Matthew Wilcox (Oracle) Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 ++ mm/huge_memory.c | 6 +++--- mm/shmem.c | 8 +++++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 220cd553a9e7..8572a1474e16 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -634,6 +634,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); extern void pm_restrict_gfp_mask(void); extern void pm_restore_gfp_mask(void); +extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); + #ifdef CONFIG_PM_SLEEP extern bool pm_suspended_storage(void); #else diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d77605c30f2e..395c75111d33 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -668,9 +668,9 @@ release: * available * never: never stall for any thp allocation */ -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) +gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma) { - const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE); /* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) @@ -762,7 +762,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) } return ret; } - gfp = alloc_hugepage_direct_gfpmask(vma); + gfp = vma_thp_gfp_mask(vma); page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); diff --git a/mm/shmem.c b/mm/shmem.c index ee8f21832f98..596009a44431 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1519,8 +1519,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, return NULL; 
shmem_pseudo_vma_init(&pvma, info, hindex); - page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), + true); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); @@ -1776,6 +1776,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page *page; enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; + gfp_t huge_gfp; int error; int once = 0; int alloced = 0; @@ -1862,7 +1863,8 @@ repeat: } alloc_huge: - page = shmem_alloc_and_acct_page(gfp, inode, index, true); + huge_gfp = vma_thp_gfp_mask(vma); + page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true); if (IS_ERR(page)) { alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, From 78cc8cdc54008f54b79711fc027afc3564588a04 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 25 Feb 2021 17:16:22 -0800 Subject: [PATCH 016/118] mm,thp,shm: limit gfp mask to no more than specified Matthew Wilcox pointed out that the i915 driver opportunistically allocates tmpfs memory, but will happily reclaim some of its pool if no memory is available. Make sure the gfp mask used to opportunistically allocate a THP is always at least as restrictive as the original gfp mask. Link: https://lkml.kernel.org/r/20201124194925.623931-3-riel@surriel.com Signed-off-by: Rik van Riel Suggested-by: Matthew Wilcox Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Mel Gorman Cc: Michal Hocko Cc: Vlastimil Babka Cc: Xu Yu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index 596009a44431..06c771d23127 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1505,6 +1505,26 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, return page; } +/* + * Make sure huge_gfp is always more limited than limit_gfp. + * Some of the flags set permissions, while others set limitations. + */ +static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) +{ + gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; + gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; + gfp_t result = huge_gfp & ~allowflags; + + /* + * Minimize the result gfp by taking the union with the deny flags, + * and the intersection of the allow flags. + */ + result |= (limit_gfp & denyflags); + result |= (huge_gfp & limit_gfp) & allowflags; + + return result; +} + static struct page *shmem_alloc_hugepage(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { @@ -1864,6 +1884,7 @@ repeat: alloc_huge: huge_gfp = vma_thp_gfp_mask(vma); + huge_gfp = limit_gfp_mask(huge_gfp, gfp); page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true); if (IS_ERR(page)) { alloc_nohuge: From cd89fb06509903f942a0ffe97ffa63034671ed0c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 25 Feb 2021 17:16:25 -0800 Subject: [PATCH 017/118] mm,thp,shmem: make khugepaged obey tmpfs mount flags Currently if thp enabled=[madvise], mounting a tmpfs filesystem with huge=always and mmapping files from that tmpfs does not result in khugepaged collapsing those mappings, despite the mount flag indicating that it should. Fix that by breaking up the blocks of tests in hugepage_vma_check a little bit, and testing things in the correct order. 
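For illustration, a minimal userspace sketch of the scenario fixed here (hypothetical: the mount point, file name and sizes are made up, error handling is omitted). With transparent_hugepage/enabled set to "madvise" and the tmpfs mounted with huge=always, a mapping like this should now be registered with khugepaged even though it never calls madvise(MADV_HUGEPAGE):

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t len = 4UL << 20;         /* two PMD-sized extents */
        int fd = open("/mnt/huge-tmpfs/file", O_RDWR | O_CREAT, 0600);
        char *p;

        ftruncate(fd, len);
        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        memset(p, 0x5a, len);           /* fault in base pages */
        pause();                        /* leave khugepaged time to collapse */
        return 0;
}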
Link: https://lkml.kernel.org/r/20201124194925.623931-4-riel@surriel.com Fixes: c2231020ea7b ("mm: thp: register mm for khugepaged when merging vma for shmem") Signed-off-by: Rik van Riel Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Vlastimil Babka Cc: Xu Yu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/khugepaged.h | 2 ++ mm/khugepaged.c | 22 ++++++++++++++++------ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index c941b7377321..2fcc01891b47 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -3,6 +3,7 @@ #define _LINUX_KHUGEPAGED_H #include /* MMF_VM_HUGEPAGE */ +#include #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -57,6 +58,7 @@ static inline int khugepaged_enter(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) if ((khugepaged_always() || + (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) || (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) && !(vm_flags & VM_NOHUGEPAGE) && !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 75e246f680f4..a7d6cb912b05 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -442,18 +442,28 @@ static inline int khugepaged_test_exit(struct mm_struct *mm) static bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags) { - if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vm_flags & VM_NOHUGEPAGE) || + /* Explicitly disabled through madvise. */ + if ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; - if (shmem_file(vma->vm_file) || - (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && - vma->vm_file && - (vm_flags & VM_DENYWRITE))) { + /* Enabled via shmem mount options or sysfs settings. */ + if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) { return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR); } + + /* THP settings require madvise. */ + if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + return false; + + /* Read-only file mappings need to be aligned for THP to work. */ + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file && + (vm_flags & VM_DENYWRITE)) { + return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, + HPAGE_PMD_NR); + } + if (!vma->anon_vma || vma->vm_ops) return false; if (vma_is_temporary_stack(vma)) From 187df5dde943ae28f260db7377467ffb3b51a6de Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 25 Feb 2021 17:16:29 -0800 Subject: [PATCH 018/118] mm,shmem,thp: limit shmem THP allocations to requested zones Hugh pointed out that the gma500 driver uses shmem pages, but needs to limit them to the DMA32 zone. Ensure the allocations resulting from the gfp_mask returned by limit_gfp_mask use the zone flags that were originally passed to shmem_getpage_gfp. 
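A small worked sketch of the mask combination described above, using the vma_thp_gfp_mask() and limit_gfp_mask() helpers introduced earlier in this series (the vma and the concrete flag values are illustrative assumptions from the shmem fault path, not taken from the patch):

        gfp_t huge_gfp = vma_thp_gfp_mask(vma);         /* e.g. GFP_TRANSHUGE_LIGHT */
        gfp_t limit_gfp = GFP_KERNEL | __GFP_DMA32;     /* e.g. a gma500-style constraint */
        gfp_t gfp = limit_gfp_mask(huge_gfp, limit_gfp);

        /*
         * gfp now carries the GFP_ZONEMASK bits (here __GFP_DMA32) from
         * limit_gfp, keeps __GFP_NOWARN/__GFP_NORETRY from either mask, and
         * only retains the __GFP_IO/__GFP_FS/__GFP_RECLAIM bits that both
         * masks agree on, so the huge allocation can never be less
         * restrictive than the gfp originally passed to shmem_getpage_gfp().
         */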
Link: https://lkml.kernel.org/r/20210224121016.1314ed6d@imladris.surriel.com Signed-off-by: Rik van Riel Suggested-by: Hugh Dickins Cc: Michal Hocko Cc: Vlastimil Babka Cc: Xu Yu Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 06c771d23127..b2db4ed0fbc7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1513,7 +1513,11 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) { gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; - gfp_t result = huge_gfp & ~allowflags; + gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; + gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); + + /* Allow allocations only from the originally specified zones. */ + result |= zoneflags; /* * Minimize the result gfp by taking the union with the deny flags, From df2ff39e78da74dc23e7187dd58a784d91a876e0 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 25 Feb 2021 17:16:33 -0800 Subject: [PATCH 019/118] mm: cma: allocate cma areas bottom-up Currently cma areas without a fixed base are allocated close to the end of the node. This placement is sub-optimal because of compaction: it brings pages into the cma area. In particular, it can bring in hot executable pages, even if there is plenty of free memory on the machine. This results in cma allocation failures. Instead let's place cma areas close to the beginning of a node. In this case compaction will help to free cma areas, resulting in better cma allocation success rates. If there is enough memory, let's try to allocate bottom-up starting with 4GB to exclude any possible interference with DMA32. On smaller machines, or in case of a failure, stick with the old behavior.
16GB vm, 2GB cma area: With this patch: [ 0.000000] Command line: root=/dev/vda3 rootflags=subvol=/root systemd.unified_cgroup_hierarchy=1 enforcing=0 console=ttyS0,115200 hugetlb_cma=2G [ 0.002928] hugetlb_cma: reserve 2048 MiB, up to 2048 MiB per node [ 0.002930] cma: Reserved 2048 MiB at 0x0000000100000000 [ 0.002931] hugetlb_cma: reserved 2048 MiB on node 0 Without this patch: [ 0.000000] Command line: root=/dev/vda3 rootflags=subvol=/root systemd.unified_cgroup_hierarchy=1 enforcing=0 console=ttyS0,115200 hugetlb_cma=2G [ 0.002930] hugetlb_cma: reserve 2048 MiB, up to 2048 MiB per node [ 0.002933] cma: Reserved 2048 MiB at 0x00000003c0000000 [ 0.002934] hugetlb_cma: reserved 2048 MiB on node 0 v2: - switched to memblock_set_bottom_up(true), by Mike - start with 4GB, by Mike [guro@fb.com: whitespace fix, per Mike] Link: https://lkml.kernel.org/r/20201221170551.GB3428478@carbon.DHCP.thefacebook.com [guro@fb.com: fix 32-bit warnings] Link: https://lkml.kernel.org/r/20201223163537.GA4011967@carbon.DHCP.thefacebook.com [guro@fb.com: fix 32-bit systems] [akpm@linux-foundation.org: build fix] Link: https://lkml.kernel.org/r/20201217201214.3414100-1-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Mike Rapoport Cc: Wonhyuk Yang Cc: Joonsoo Kim Cc: Rik van Riel Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/cma.c b/mm/cma.c index 20c4f6f40037..0ba69cd16aeb 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -336,6 +336,23 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, limit = highmem_start; } + /* + * If there is enough memory, try a bottom-up allocation first. + * It will place the new cma area close to the start of the node + * and guarantee that the compaction is moving pages out of the + * cma area and not into it. + * Avoid using first 4GB to not interfere with constrained zones + * like DMA/DMA32. + */ +#ifdef CONFIG_PHYS_ADDR_T_64BIT + if (!memblock_bottom_up() && memblock_end >= SZ_4G + size) { + memblock_set_bottom_up(true); + addr = memblock_alloc_range_nid(size, alignment, SZ_4G, + limit, nid, true); + memblock_set_bottom_up(false); + } +#endif + if (!addr) { addr = memblock_alloc_range_nid(size, alignment, base, limit, nid, true); From 072355c1cf2d4f37993bcfc5894e17d0b11bb290 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Feb 2021 17:16:37 -0800 Subject: [PATCH 020/118] mm/cma: expose all pages to the buddy if activation of an area fails Right now, if activation fails, we might already have exposed some pages to the buddy for CMA use (although they will never get actually used by CMA), and some pages won't be exposed to the buddy at all. Let's check for "single zone" early and on error, don't expose any pages for CMA use - instead, expose them to the buddy available for any use. Simply call free_reserved_page() on every single page - easier than going via free_reserved_area(), converting back and forth between pfns and virt addresses. In addition, make sure to fixup totalcma_pages properly. Example: 6 GiB QEMU VM with "... hugetlb_cma=2G movablecore=20% ...": [ 0.006891] hugetlb_cma: reserve 2048 MiB, up to 2048 MiB per node [ 0.006893] cma: Reserved 2048 MiB at 0x0000000100000000 [ 0.006893] hugetlb_cma: reserved 2048 MiB on node 0 ... [ 0.175433] cma: CMA area hugetlb0 could not be activated Before this patch: # cat /proc/meminfo MemTotal: 5867348 kB MemFree: 5692808 kB MemAvailable: 5542516 kB ... 
CmaTotal: 2097152 kB CmaFree: 1884160 kB After this patch: # cat /proc/meminfo MemTotal: 6077308 kB MemFree: 5904208 kB MemAvailable: 5747968 kB ... CmaTotal: 0 kB CmaFree: 0 kB Note: cma_init_reserved_mem() makes sure that we always cover full pageblocks / MAX_ORDER - 1 pages. Link: https://lkml.kernel.org/r/20210127101813.6370-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Oscar Salvador Cc: Thomas Gleixner Cc: "Peter Zijlstra (Intel)" Cc: Mike Rapoport Cc: Michal Hocko Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index 0ba69cd16aeb..23d4a97c834a 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -94,34 +94,29 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, static void __init cma_activate_area(struct cma *cma) { - unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; - unsigned i = cma->count >> pageblock_order; + unsigned long base_pfn = cma->base_pfn, pfn; struct zone *zone; cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL); if (!cma->bitmap) goto out_error; - WARN_ON_ONCE(!pfn_valid(pfn)); - zone = page_zone(pfn_to_page(pfn)); + /* + * alloc_contig_range() requires the pfn range specified to be in the + * same zone. Simplify by forcing the entire CMA resv range to be in the + * same zone. + */ + WARN_ON_ONCE(!pfn_valid(base_pfn)); + zone = page_zone(pfn_to_page(base_pfn)); + for (pfn = base_pfn + 1; pfn < base_pfn + cma->count; pfn++) { + WARN_ON_ONCE(!pfn_valid(pfn)); + if (page_zone(pfn_to_page(pfn)) != zone) + goto not_in_zone; + } - do { - unsigned j; - - base_pfn = pfn; - for (j = pageblock_nr_pages; j; --j, pfn++) { - WARN_ON_ONCE(!pfn_valid(pfn)); - /* - * alloc_contig_range requires the pfn range - * specified to be in the same zone. Make this - * simple by forcing the entire CMA resv range - * to be in the same zone. - */ - if (page_zone(pfn_to_page(pfn)) != zone) - goto not_in_zone; - } - init_cma_reserved_pageblock(pfn_to_page(base_pfn)); - } while (--i); + for (pfn = base_pfn; pfn < base_pfn + cma->count; + pfn += pageblock_nr_pages) + init_cma_reserved_pageblock(pfn_to_page(pfn)); mutex_init(&cma->lock); @@ -135,6 +130,10 @@ static void __init cma_activate_area(struct cma *cma) not_in_zone: bitmap_free(cma->bitmap); out_error: + /* Expose all pages to the buddy, they are useless for CMA. */ + for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++) + free_reserved_page(pfn_to_page(pfn)); + totalcma_pages -= cma->count; cma->count = 0; pr_err("CMA area %s could not be activated\n", cma->name); return; From 3c381db1fac80373f2cc0d8c1d0bcfbf8bd4fb57 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Feb 2021 17:16:40 -0800 Subject: [PATCH 021/118] mm/page_alloc: count CMA pages per zone and print them in /proc/zoneinfo Let's count the number of CMA pages per zone and print them in /proc/zoneinfo. Having access to the total number of CMA pages per zone is helpful for debugging purposes to know where exactly the CMA pages ended up, and to figure out how many pages of a zone might behave differently, even after some of these pages might already have been allocated. As one example, CMA pages part of a kernel zone cannot be used for ordinary kernel allocations but instead behave more like ZONE_MOVABLE. For now, we are only able to get the global nr+free cma pages from /proc/meminfo and the free cma pages per zone from /proc/zoneinfo. 
Example after this patch when booting a 6 GiB QEMU VM with "hugetlb_cma=2G": # cat /proc/zoneinfo | grep cma cma 0 nr_free_cma 0 cma 0 nr_free_cma 0 cma 524288 nr_free_cma 493016 cma 0 cma 0 # cat /proc/meminfo | grep Cma CmaTotal: 2097152 kB CmaFree: 1972064 kB Note: We print even without CONFIG_CMA, just like "nr_free_cma"; this way, one can be sure when spotting "cma 0", that there are definetly no CMA pages located in a zone. [david@redhat.com: v2] Link: https://lkml.kernel.org/r/20210128164533.18566-1-david@redhat.com [david@redhat.com: v3] Link: https://lkml.kernel.org/r/20210129113451.22085-1-david@redhat.com Link: https://lkml.kernel.org/r/20210127101813.6370-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Acked-by: David Rientjes Cc: Thomas Gleixner Cc: "Peter Zijlstra (Intel)" Cc: Mike Rapoport Cc: Michal Hocko Cc: Wei Yang Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 15 +++++++++++++++ mm/page_alloc.c | 1 + mm/vmstat.c | 6 ++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9198b7ade85f..5f9c4dad73ed 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -503,6 +503,9 @@ struct zone { * bootmem allocator): * managed_pages = present_pages - reserved_pages; * + * cma pages is present pages that are assigned for CMA use + * (MIGRATE_CMA). + * * So present_pages may be used by memory hotplug or memory power * management logic to figure out unmanaged pages by checking * (present_pages - managed_pages). And managed_pages should be used @@ -527,6 +530,9 @@ struct zone { atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; +#ifdef CONFIG_CMA + unsigned long cma_pages; +#endif const char *name; @@ -624,6 +630,15 @@ static inline unsigned long zone_managed_pages(struct zone *zone) return (unsigned long)atomic_long_read(&zone->managed_pages); } +static inline unsigned long zone_cma_pages(struct zone *zone) +{ +#ifdef CONFIG_CMA + return zone->cma_pages; +#else + return 0; +#endif +} + static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ddccc59f2f72..3e4b29ee2b1e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2168,6 +2168,7 @@ void __init init_cma_reserved_pageblock(struct page *page) } adjust_managed_page_count(page, pageblock_nr_pages); + page_zone(page)->cma_pages += pageblock_nr_pages; } #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index a0e949542204..6cdf789ced5e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1637,14 +1637,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n high %lu" "\n spanned %lu" "\n present %lu" - "\n managed %lu", + "\n managed %lu" + "\n cma %lu", zone_page_state(zone, NR_FREE_PAGES), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), zone->spanned_pages, zone->present_pages, - zone_managed_pages(zone)); + zone_managed_pages(zone), + zone_cma_pages(zone)); seq_printf(m, "\n protection: (%ld", From a052d4d13d88c2073d1339d9dce02cba7b4dc609 Mon Sep 17 00:00:00 2001 From: Patrick Daly Date: Thu, 25 Feb 2021 17:16:44 -0800 Subject: [PATCH 022/118] mm: cma: print region name on failure Print the name of the CMA region for convenience. This is useful information to have when cma_alloc() fails. 
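For illustration, a hedged sketch of a (hypothetical) driver-side allocation and the diagnostic it produces after this change; the region name, request size and error value are made up:

        struct page *page = cma_alloc(cma, 512, 0, false);

        /*
         * On failure the log line now identifies the exhausted region,
         * roughly: "cma_alloc: my-cma-pool: alloc failed, req-size: 512
         * pages, ret: -12", instead of reporting only the size and errno.
         */
        if (!page)
                return -ENOMEM;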
[pdaly@codeaurora.org: print the "count" variable] Link: https://lkml.kernel.org/r/20210209142414.12768-1-georgi.djakov@linaro.org Link: https://lkml.kernel.org/r/20210208115200.20286-1-georgi.djakov@linaro.org Signed-off-by: Patrick Daly Signed-off-by: Georgi Djakov Acked-by: Minchan Kim Reviewed-by: David Hildenbrand Reviewed-by: Randy Dunlap Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index 23d4a97c834a..54eee2119822 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -500,8 +500,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, } if (ret && !no_warn) { - pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", - __func__, count, ret); + pr_err("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n", + __func__, cma->name, count, ret); cma_debug_show_areas(cma); } From 2bbd00aef0671bfe3c2ca5ba67097246257de125 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 25 Feb 2021 17:16:47 -0800 Subject: [PATCH 023/118] mm: vmstat: fix NOHZ wakeups for node stat changes On NOHZ, the periodic vmstat flushers on each CPU can go to sleep and won't wake up until stat changes are detected in the per-cpu deltas of the zone vmstat counters. In commit 75ef71840539 ("mm, vmstat: add infrastructure for per-node vmstats") per-node counters were introduced, and subsequently most stats were moved from the zone to the node level. However, the node counters weren't added to the NOHZ wakeup detection. In theory this can cause per-cpu errors to remain in the user-reported stats indefinitely. In practice this only affects a handful of sub counters (file_mapped, dirty and writeback e.g.) because other page state changes at the node level likely involve a change at the zone level as well (alloc and free, lru ops). Also, nobody has complained. Fix it up for completeness: wake up vmstat refreshing on node changes. Also remove the BUILD_BUG_ONs that assert counter size; we haven't relied on it since we added sizeof() to the range calculation in commit 13c9aaf7fa01 ("mm/vmstat.c: fix NUMA statistics updates"). Link: https://lkml.kernel.org/r/20210202184342.118513-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 6cdf789ced5e..0b0fc3b77789 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1894,16 +1894,12 @@ static void vmstat_update(struct work_struct *w) */ static bool need_update(int cpu) { + pg_data_t *last_pgdat = NULL; struct zone *zone; for_each_populated_zone(zone) { struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); - - BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); -#ifdef CONFIG_NUMA - BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2); -#endif - + struct per_cpu_nodestat *n; /* * The fast way of checking if there are any vmstat diffs. 
*/ @@ -1915,6 +1911,13 @@ static bool need_update(int cpu) sizeof(p->vm_numa_stat_diff[0]))) return true; #endif + if (last_pgdat == zone->zone_pgdat) + continue; + last_pgdat = zone->zone_pgdat; + n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu); + if (memchr_inv(n->vm_node_stat_diff, 0, NR_VM_NODE_STAT_ITEMS * + sizeof(n->vm_node_stat_diff[0]))) + return true; } return false; } From 629484ae73754243917e06d8d5e5f37c26e99399 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 25 Feb 2021 17:16:51 -0800 Subject: [PATCH 024/118] mm: vmstat: add some comments on internal storage of byte items Byte-accounted items are used for slab object accounting at the cgroup level, because the objects in a slab page can belong to different cgroups. At the global level these items always change in multiples of whole slab pages. The vmstat code exploits this and stores these items as pages internally, which allows for more compact per-cpu data. This optimization isn't self-evident from the asserts and the division in the stat update functions. Provide the reader with some context. Link: https://lkml.kernel.org/r/20210202184411.118614-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 6 ++++++ mm/vmstat.c | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 773135fc6e19..506d625163a1 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -313,6 +313,12 @@ static inline void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, int delta) { if (vmstat_item_in_bytes(item)) { + /* + * Only cgroups use subpage accounting right now; at + * the global level, these items still change in + * multiples of whole pages. Store them as pages + * internally to keep the per-cpu counters compact. + */ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); delta >>= PAGE_SHIFT; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 0b0fc3b77789..e60b36f5f0a9 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -342,6 +342,12 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, long t; if (vmstat_item_in_bytes(item)) { + /* + * Only cgroups use subpage accounting right now; at + * the global level, these items still change in + * multiples of whole pages. Store them as pages + * internally to keep the per-cpu counters compact. + */ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); delta >>= PAGE_SHIFT; } @@ -551,6 +557,12 @@ static inline void mod_node_state(struct pglist_data *pgdat, long o, n, t, z; if (vmstat_item_in_bytes(item)) { + /* + * Only cgroups use subpage accounting right now; at + * the global level, these items still change in + * multiples of whole pages. Store them as pages + * internally to keep the per-cpu counters compact. + */ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); delta >>= PAGE_SHIFT; } From fbcc8183a4f815910697237386681153a05d9573 Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Thu, 25 Feb 2021 17:16:54 -0800 Subject: [PATCH 025/118] mm/vmstat.c: erase latency in vmstat_shepherd Many 100us+ latencies have been detected in vmstat_shepherd() on the CPX platform, which has 208 logical CPUs. And vmstat_shepherd is queued every second, which could make the problem worse. Add a schedule point in vmstat_shepherd() to erase the latency.
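The same pattern in isolation, as a hedged sketch (the function name and the per-cpu work are placeholders, not code from the patch): a housekeeping worker that walks every CPU drops a scheduling point into the loop so a single pass cannot monopolize the CPU it runs on:

static void expensive_per_cpu_scan(void)
{
        int cpu;

        get_online_cpus();
        for_each_online_cpu(cpu) {
                /* per-cpu inspection that may take tens of microseconds */
                cond_resched();
        }
        put_online_cpus();
}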
Link: https://lkml.kernel.org/r/20210111035526.1511-1-benbjiang@tencent.com Signed-off-by: Jiang Biao Reported-by: Bin Lai Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmstat.c b/mm/vmstat.c index e60b36f5f0a9..74b2c374b86c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1980,6 +1980,8 @@ static void vmstat_shepherd(struct work_struct *w) if (!delayed_work_pending(dw) && need_update(cpu)) queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); + + cond_resched(); } put_online_cpus(); From 9f605f260594f99b950062fd62244251e85dbd2b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 25 Feb 2021 17:16:57 -0800 Subject: [PATCH 026/118] mm: move pfn_to_online_page() out of line Patch series "mm: Fix pfn_to_online_page() with respect to ZONE_DEVICE", v4. A pfn-walker that uses pfn_to_online_page() may inadvertently translate a pfn as online and in the page allocator, when it is offline managed by a ZONE_DEVICE mapping (details in Patch 3: ("mm: Teach pfn_to_online_page() about ZONE_DEVICE section collisions")). The 2 proposals under consideration are teach pfn_to_online_page() to be precise in the presence of mixed-zone sections, or teach the memory-add code to drop the System RAM associated with ZONE_DEVICE collisions. In order to not regress memory capacity by a few 10s to 100s of MiB the approach taken in this set is to add precision to pfn_to_online_page(). In the course of validating pfn_to_online_page() a couple other fixes fell out: 1/ soft_offline_page() fails to drop the reference taken in the madvise(..., MADV_SOFT_OFFLINE) case. 2/ memory_failure() uses get_dev_pagemap() to lookup ZONE_DEVICE pages, however that mapping may contain data pages and metadata raw pfns. Introduce pgmap_pfn_valid() to delineate the 2 types and fail the handling of raw metadata pfns. This patch (of 4); pfn_to_online_page() is already too large to be a macro or an inline function. In anticipation of further logic changes / growth, move it out of line. No functional change, just code movement. Link: https://lkml.kernel.org/r/161058499000.1840162.702316708443239771.stgit@dwillia2-desk3.amr.corp.intel.com Link: https://lkml.kernel.org/r/161058499608.1840162.10165648147615238793.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Michal Hocko Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Naoya Horiguchi Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 17 +---------------- mm/memory_hotplug.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 15acce5ab106..3d99de0db2dd 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -16,22 +16,7 @@ struct resource; struct vmem_altmap; #ifdef CONFIG_MEMORY_HOTPLUG -/* - * Return page for the valid pfn only if the page is online. 
All pfn - * walkers which rely on the fully initialized page->flags and others - * should use this rather than pfn_valid && pfn_to_page - */ -#define pfn_to_online_page(pfn) \ -({ \ - struct page *___page = NULL; \ - unsigned long ___pfn = pfn; \ - unsigned long ___nr = pfn_to_section_nr(___pfn); \ - \ - if (___nr < NR_MEM_SECTIONS && online_section_nr(___nr) && \ - pfn_valid_within(___pfn)) \ - ___page = pfn_to_page(___pfn); \ - ___page; \ -}) +struct page *pfn_to_online_page(unsigned long pfn); /* * Types for free bootmem stored in page->lru.next. These have to be in diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index abe43c1ae920..fc6cdd99941b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -300,6 +300,22 @@ static int check_hotplug_memory_addressable(unsigned long pfn, return 0; } +/* + * Return page for the valid pfn only if the page is online. All pfn + * walkers which rely on the fully initialized page->flags and others + * should use this rather than pfn_valid && pfn_to_page + */ +struct page *pfn_to_online_page(unsigned long pfn) +{ + unsigned long nr = pfn_to_section_nr(pfn); + + if (nr < NR_MEM_SECTIONS && online_section_nr(nr) && + pfn_valid_within(pfn)) + return pfn_to_page(pfn); + return NULL; +} +EXPORT_SYMBOL_GPL(pfn_to_online_page); + /* * Reasonably generic function for adding memory. It is * expected that archs that support memory hotplug will From 9f9b02e5b3468e665a576a86ceb72f753001710b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 25 Feb 2021 17:17:01 -0800 Subject: [PATCH 027/118] mm: teach pfn_to_online_page() to consider subsection validity pfn_to_online_page is primarily used to filter out offline or fully uninitialized pages. pfn_valid resp. online_section_nr have a coarse per memory section granularity. If a section shared with a partially offline memory (e.g. part of ZONE_DEVICE) then pfn_to_online_page would lead to a false positive on some pfns. Fix this by adding pfn_section_valid check which is subsection aware. [mhocko@kernel.org: changelog rewrite] Link: https://lkml.kernel.org/r/161058500148.1840162.4365921007820501696.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: b13bc35193d9 ("mm/hotplug: invalid PFNs from pfn_to_online_page()") Signed-off-by: Dan Williams Reported-by: David Hildenbrand Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Qian Cai Cc: Oscar Salvador Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fc6cdd99941b..02378f11e2d6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -308,11 +308,26 @@ static int check_hotplug_memory_addressable(unsigned long pfn, struct page *pfn_to_online_page(unsigned long pfn) { unsigned long nr = pfn_to_section_nr(pfn); + struct mem_section *ms; - if (nr < NR_MEM_SECTIONS && online_section_nr(nr) && - pfn_valid_within(pfn)) - return pfn_to_page(pfn); - return NULL; + if (nr >= NR_MEM_SECTIONS) + return NULL; + + ms = __nr_to_section(nr); + if (!online_section(ms)) + return NULL; + + /* + * Save some code text when online_section() + + * pfn_section_valid() are sufficient. 
+ */ + if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn)) + return NULL; + + if (!pfn_section_valid(ms, pfn)) + return NULL; + + return pfn_to_page(pfn); } EXPORT_SYMBOL_GPL(pfn_to_online_page); From 1f90a3477df3ff1a91e064af554cdc887c8f9e5e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 25 Feb 2021 17:17:05 -0800 Subject: [PATCH 028/118] mm: teach pfn_to_online_page() about ZONE_DEVICE section collisions While pfn_to_online_page() is able to determine pfn_valid() at subsection granularity it is not able to reliably determine if a given pfn is also online if the section is mixes ZONE_{NORMAL,MOVABLE} with ZONE_DEVICE. This means that pfn_to_online_page() may return invalid @page objects. For example with a memory map like: 100000000-1fbffffff : System RAM 142000000-143002e16 : Kernel code 143200000-143713fff : Kernel rodata 143800000-143b15b7f : Kernel data 144227000-144ffffff : Kernel bss 1fc000000-2fbffffff : Persistent Memory (legacy) 1fc000000-2fbffffff : namespace0.0 This command: echo 0x1fc000000 > /sys/devices/system/memory/soft_offline_page ...succeeds when it should fail. When it succeeds it touches an uninitialized page and may crash or cause other damage (see dissolve_free_huge_page()). While the memory map above is contrived via the memmap=ss!nn kernel command line option, the collision happens in practice on shipping platforms. The memory controller resources that decode spans of physical address space are a limited resource. One technique platform-firmware uses to conserve those resources is to share a decoder across 2 devices to keep the address range contiguous. Unfortunately the unit of operation of a decoder is 64MiB while the Linux section size is 128MiB. This results in situations where, without subsection hotplug memory mappings with different lifetimes collide into one object that can only express one lifetime. Update move_pfn_range_to_zone() to flag (SECTION_TAINT_ZONE_DEVICE) a section that mixes ZONE_DEVICE pfns with other online pfns. With SECTION_TAINT_ZONE_DEVICE to delineate, pfn_to_online_page() can fall back to a slow-path check for ZONE_DEVICE pfns in an online section. In the fast path online_section() for a full ZONE_DEVICE section returns false. Because the collision case is rare, and for simplicity, the SECTION_TAINT_ZONE_DEVICE flag is never cleared once set. 
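A minimal sketch of the pfn-walker contract this change hardens (the walker itself is hypothetical, not taken from the series): a NULL return now also covers ZONE_DEVICE pfns that share a section with System RAM, so callers simply skip them:

static void scan_online_pages(unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long pfn;

        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                struct page *page = pfn_to_online_page(pfn);

                if (!page)
                        continue;       /* hole, offline, or ZONE_DEVICE */
                /* page->flags and the rest of struct page are valid here */
        }
}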
[dan.j.williams@intel.com: fix CONFIG_ZONE_DEVICE=n build] Link: https://lkml.kernel.org/r/CAPcyv4iX+7LAgAeSqx7Zw-Zd=ZV9gBv8Bo7oTbwCOOqJoZ3+Yg@mail.gmail.com Link: https://lkml.kernel.org/r/161058500675.1840162.7887862152161279354.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: ba72b4c8cf60 ("mm/sparsemem: support sub-section hotplug") Signed-off-by: Dan Williams Reported-by: Michal Hocko Acked-by: Michal Hocko Reported-by: David Hildenbrand Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Naoya Horiguchi Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 34 +++++++++++++++++++++++++++------- mm/memory_hotplug.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 7 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5f9c4dad73ed..47946cec7584 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -918,6 +918,18 @@ static inline int local_memory_node(int node_id) { return node_id; }; */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) +#ifdef CONFIG_ZONE_DEVICE +static inline bool zone_is_zone_device(struct zone *zone) +{ + return zone_idx(zone) == ZONE_DEVICE; +} +#else +static inline bool zone_is_zone_device(struct zone *zone) +{ + return false; +} +#endif + /* * Returns true if a zone has pages managed by the buddy allocator. * All the reclaim decisions have to use this function rather than @@ -1306,13 +1318,14 @@ extern size_t mem_section_usage_size(void); * which results in PFN_SECTION_SHIFT equal 6. * To sum it up, at least 6 bits are available. */ -#define SECTION_MARKED_PRESENT (1UL<<0) -#define SECTION_HAS_MEM_MAP (1UL<<1) -#define SECTION_IS_ONLINE (1UL<<2) -#define SECTION_IS_EARLY (1UL<<3) -#define SECTION_MAP_LAST_BIT (1UL<<4) -#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) -#define SECTION_NID_SHIFT 3 +#define SECTION_MARKED_PRESENT (1UL<<0) +#define SECTION_HAS_MEM_MAP (1UL<<1) +#define SECTION_IS_ONLINE (1UL<<2) +#define SECTION_IS_EARLY (1UL<<3) +#define SECTION_TAINT_ZONE_DEVICE (1UL<<4) +#define SECTION_MAP_LAST_BIT (1UL<<5) +#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) +#define SECTION_NID_SHIFT 3 static inline struct page *__section_mem_map_addr(struct mem_section *section) { @@ -1351,6 +1364,13 @@ static inline int online_section(struct mem_section *section) return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } +static inline int online_device_section(struct mem_section *section) +{ + unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; + + return section && ((section->section_mem_map & flags) == flags); +} + static inline int online_section_nr(unsigned long nr) { return online_section(__nr_to_section(nr)); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 02378f11e2d6..3af4d3851d1a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -308,6 +308,7 @@ static int check_hotplug_memory_addressable(unsigned long pfn, struct page *pfn_to_online_page(unsigned long pfn) { unsigned long nr = pfn_to_section_nr(pfn); + struct dev_pagemap *pgmap; struct mem_section *ms; if (nr >= NR_MEM_SECTIONS) @@ -327,6 +328,22 @@ struct page *pfn_to_online_page(unsigned long pfn) if (!pfn_section_valid(ms, pfn)) return NULL; + if (!online_device_section(ms)) + return pfn_to_page(pfn); + + /* + * Slowpath: when ZONE_DEVICE collides with + * ZONE_{NORMAL,MOVABLE} within the same section some pfns in + * the section may be 'offline' but 'valid'. 
Only + * get_dev_pagemap() can determine sub-section online status. + */ + pgmap = get_dev_pagemap(pfn, NULL); + put_dev_pagemap(pgmap); + + /* The presence of a pgmap indicates ZONE_DEVICE offline pfn */ + if (pgmap) + return NULL; + return pfn_to_page(pfn); } EXPORT_SYMBOL_GPL(pfn_to_online_page); @@ -709,6 +726,14 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; } + +static void section_taint_zone_device(unsigned long pfn) +{ + struct mem_section *ms = __pfn_to_section(pfn); + + ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE; +} + /* * Associate the pfn range with the given zone, initializing the memmaps * and resizing the pgdat/zone data to span the added pages. After this @@ -738,6 +763,19 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, resize_pgdat_range(pgdat, start_pfn, nr_pages); pgdat_resize_unlock(pgdat, &flags); + /* + * Subsection population requires care in pfn_to_online_page(). + * Set the taint to enable the slow path detection of + * ZONE_DEVICE pages in an otherwise ZONE_{NORMAL,MOVABLE} + * section. + */ + if (zone_is_zone_device(zone)) { + if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION)) + section_taint_zone_device(start_pfn); + if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)) + section_taint_zone_device(start_pfn + nr_pages); + } + /* * TODO now we have a visible range of pages which are not associated * with their zone properly. Not nice but set_pfnblock_flags_mask From 34dc45be4563f344d59ba0428416d0d265aa4f4d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 25 Feb 2021 17:17:08 -0800 Subject: [PATCH 029/118] mm: fix memory_failure() handling of dax-namespace metadata Given 'struct dev_pagemap' spans both data pages and metadata pages be careful to consult the altmap if present to delineate metadata. In fact the pfn_first() helper already identifies the first valid data pfn, so export that helper for other code paths via pgmap_pfn_valid(). Other usage of get_dev_pagemap() are not a concern because those are operating on known data pfns having been looked up by get_user_pages(). I.e. metadata pfns are never user mapped. 
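A hedged sketch of how a ZONE_DEVICE-aware error handler can use the helper exported here to separate data pfns from altmap metadata (the surrounding poison-handling context is assumed and elided):

        struct dev_pagemap *pgmap = get_dev_pagemap(pfn, NULL);

        if (pgmap) {
                bool is_data = pgmap_pfn_valid(pgmap, pfn);

                put_dev_pagemap(pgmap);
                if (!is_data)
                        return -ENXIO;  /* metadata space is not recoverable */
        }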
Link: https://lkml.kernel.org/r/161058501758.1840162.4239831989762604527.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: 6100e34b2526 ("mm, memory_failure: Teach memory_failure() about dev_pagemap pages") Signed-off-by: Dan Williams Reported-by: David Hildenbrand Reviewed-by: David Hildenbrand Reviewed-by: Naoya Horiguchi Cc: Michal Hocko Cc: Oscar Salvador Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memremap.h | 6 ++++++ mm/memory-failure.c | 6 ++++++ mm/memremap.c | 15 +++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 79c49e7f5c30..f5b464daeeca 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -137,6 +137,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap); +bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); @@ -165,6 +166,11 @@ static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, return NULL; } +static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn) +{ + return false; +} + static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { return 0; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 55c671904aac..24210c9bd843 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1312,6 +1312,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, */ put_page(page); + /* device metadata space is not recoverable */ + if (!pgmap_pfn_valid(pgmap, pfn)) { + rc = -ENXIO; + goto out; + } + /* * Prevent the inode from being freed while we are interrogating * the address_space, typically this would be handled by diff --git a/mm/memremap.c b/mm/memremap.c index 16b2fb482da1..2455bac89506 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -80,6 +80,21 @@ static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id) return pfn + vmem_altmap_offset(pgmap_altmap(pgmap)); } +bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn) +{ + int i; + + for (i = 0; i < pgmap->nr_range; i++) { + struct range *range = &pgmap->ranges[i]; + + if (pfn >= PHYS_PFN(range->start) && + pfn <= PHYS_PFN(range->end)) + return pfn >= pfn_first(pgmap, i); + } + + return false; +} + static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id) { const struct range *range = &pgmap->ranges[range_id]; From 1adf8b468ff6bc64ba01ce3848da4bcf409215b4 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 25 Feb 2021 17:17:13 -0800 Subject: [PATCH 030/118] mm/memory_hotplug: rename all existing 'memhp' into 'mhp' This renames all 'memhp' instances to 'mhp' except for memhp_default_state for being a kernel command line option. This is just a clean up and should not cause a functional change. Let's make it consistent rater than mixing the two prefixes. In preparation for more users of the 'mhp' terminology. Link: https://lkml.kernel.org/r/1611554093-27316-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 10 +++++----- include/linux/memory_hotplug.h | 4 ++-- mm/memory_hotplug.c | 12 ++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index eef4ffb6122c..901e379676be 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -35,7 +35,7 @@ static const char *const online_type_to_str[] = { [MMOP_ONLINE_MOVABLE] = "online_movable", }; -int memhp_online_type_from_str(const char *str) +int mhp_online_type_from_str(const char *str) { int i; @@ -253,7 +253,7 @@ static int memory_subsys_offline(struct device *dev) static ssize_t state_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - const int online_type = memhp_online_type_from_str(buf); + const int online_type = mhp_online_type_from_str(buf); struct memory_block *mem = to_memory_block(dev); int ret; @@ -387,19 +387,19 @@ static ssize_t auto_online_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", - online_type_to_str[memhp_default_online_type]); + online_type_to_str[mhp_default_online_type]); } static ssize_t auto_online_blocks_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - const int online_type = memhp_online_type_from_str(buf); + const int online_type = mhp_online_type_from_str(buf); if (online_type < 0) return -EINVAL; - memhp_default_online_type = online_type; + mhp_default_online_type = online_type; return count; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 3d99de0db2dd..ca5e8d137726 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -116,10 +116,10 @@ extern int arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params); extern u64 max_mem_size; -extern int memhp_online_type_from_str(const char *str); +extern int mhp_online_type_from_str(const char *str); /* Default online_type (MMOP_*) when new memory blocks are added. 
*/ -extern int memhp_default_online_type; +extern int mhp_default_online_type; /* If movable_node boot option specified */ extern bool movable_node_enabled; static inline bool movable_node_is_enabled(void) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3af4d3851d1a..ac1c686a5989 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -67,17 +67,17 @@ void put_online_mems(void) bool movable_node_enabled = false; #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE -int memhp_default_online_type = MMOP_OFFLINE; +int mhp_default_online_type = MMOP_OFFLINE; #else -int memhp_default_online_type = MMOP_ONLINE; +int mhp_default_online_type = MMOP_ONLINE; #endif static int __init setup_memhp_default_state(char *str) { - const int online_type = memhp_online_type_from_str(str); + const int online_type = mhp_online_type_from_str(str); if (online_type >= 0) - memhp_default_online_type = online_type; + mhp_default_online_type = online_type; return 1; } @@ -1076,7 +1076,7 @@ static int check_hotplug_memory_range(u64 start, u64 size) static int online_memory_block(struct memory_block *mem, void *arg) { - mem->online_type = memhp_default_online_type; + mem->online_type = mhp_default_online_type; return device_online(&mem->dev); } @@ -1157,7 +1157,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) merge_system_ram_resource(res); /* online pages if requested */ - if (memhp_default_online_type != MMOP_OFFLINE) + if (mhp_default_online_type != MMOP_OFFLINE) walk_memory_blocks(start, size, NULL, online_memory_block); return ret; From 26011267e1a7ddaab50b5f81b402ca3e7fc2887c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Feb 2021 17:17:17 -0800 Subject: [PATCH 031/118] mm/memory_hotplug: MEMHP_MERGE_RESOURCE -> MHP_MERGE_RESOURCE Let's make "MEMHP_MERGE_RESOURCE" consistent with "MHP_NONE", "mhp_t" and "mhp_flags". As discussed recently [1], "mhp" is our internal acronym for memory hotplug now. [1] https://lore.kernel.org/linux-mm/c37de2d0-28a1-4f7d-f944-cfd7d81c334d@redhat.com/ Link: https://lkml.kernel.org/r/20210126115829.10909-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Miaohe Lin Acked-by: Michael S. Tsirkin Reviewed-by: Oscar Salvador Acked-by: Wei Liu Reviewed-by: Pankaj Gupta Cc: "K. Y. 
Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Jason Wang Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: Michal Hocko Cc: Anshuman Khandual Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hv/hv_balloon.c | 2 +- drivers/virtio/virtio_mem.c | 2 +- drivers/xen/balloon.c | 2 +- include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 8c471823a5af..2f776d78e3c1 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -726,7 +726,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); ret = add_memory(nid, PFN_PHYS((start_pfn)), - (HA_CHUNK << PAGE_SHIFT), MEMHP_MERGE_RESOURCE); + (HA_CHUNK << PAGE_SHIFT), MHP_MERGE_RESOURCE); if (ret) { pr_err("hot_add memory failed error is %d\n", ret); diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 9fc9ec4a25f5..d44e43869f17 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -623,7 +623,7 @@ static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, /* Memory might get onlined immediately. */ atomic64_add(size, &vm->offline_size); rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name, - MEMHP_MERGE_RESOURCE); + MHP_MERGE_RESOURCE); if (rc) { atomic64_sub(size, &vm->offline_size); dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index b57b2067ecbf..671c71245a7b 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -331,7 +331,7 @@ static enum bp_state reserve_additional_memory(void) mutex_unlock(&balloon_mutex); /* add_memory_resource() requires the device_hotplug lock */ lock_device_hotplug(); - rc = add_memory_resource(nid, resource, MEMHP_MERGE_RESOURCE); + rc = add_memory_resource(nid, resource, MHP_MERGE_RESOURCE); unlock_device_hotplug(); mutex_lock(&balloon_mutex); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ca5e8d137726..08eeef679ab7 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -53,7 +53,7 @@ typedef int __bitwise mhp_t; * with this flag set, the resource pointer must no longer be used as it * might be stale, or the resource might have changed. */ -#define MEMHP_MERGE_RESOURCE ((__force mhp_t)BIT(0)) +#define MHP_MERGE_RESOURCE ((__force mhp_t)BIT(0)) /* * Extended parameters for memory hotplug: diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ac1c686a5989..6a02c3f42717 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1153,7 +1153,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) * In case we're allowed to merge the resource, flag it and trigger * merging now that adding succeeded. */ - if (mhp_flags & MEMHP_MERGE_RESOURCE) + if (mhp_flags & MHP_MERGE_RESOURCE) merge_system_ram_resource(res); /* online pages if requested */ From 6c922cf75115c8b389c091a073209ca45f1af530 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:17:21 -0800 Subject: [PATCH 032/118] mm/memory_hotplug: use helper function zone_end_pfn() to get end_pfn Commit 108bcc96ef70 ("mm: add & use zone_end_pfn() and zone_spans_pfn()") introduced the helper zone_end_pfn() to calculate the zone end pfn. But update_pgdat_span() forgot to use it. 
Use this helper and rename local variable zone_end_pfn to end_pfn to avoid a naming conflict with the existing zone_end_pfn(). Link: https://lkml.kernel.org/r/20210127093211.37714-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6a02c3f42717..a969463bdda4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -493,20 +493,19 @@ static void update_pgdat_span(struct pglist_data *pgdat) for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) { - unsigned long zone_end_pfn = zone->zone_start_pfn + - zone->spanned_pages; + unsigned long end_pfn = zone_end_pfn(zone); /* No need to lock the zones, they can't change. */ if (!zone->spanned_pages) continue; if (!node_end_pfn) { node_start_pfn = zone->zone_start_pfn; - node_end_pfn = zone_end_pfn; + node_end_pfn = end_pfn; continue; } - if (zone_end_pfn > node_end_pfn) - node_end_pfn = zone_end_pfn; + if (end_pfn > node_end_pfn) + node_end_pfn = end_pfn; if (zone->zone_start_pfn < node_start_pfn) node_start_pfn = zone->zone_start_pfn; } From e9a2e48e8704c9d20a625c6f2357147d03ea7b97 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Feb 2021 17:17:24 -0800 Subject: [PATCH 033/118] drivers/base/memory: don't store phys_device in memory blocks No need to store the value for each and every memory block, as we can easily query the value at runtime. Reshuffle the members to optimize the memory layout. Also, let's clarify what the interface once was used for and why it's legacy nowadays. "phys_device" was used on s390x in older versions of lsmem[2]/chmem[3], back when they were still part of s390x-tools. They were later replaced by the variants in linux-utils. For example, RHEL6 and RHEL7 contain lsmem/chmem from s390-utils. RHEL8 switched to versions from util-linux on s390x [4]. "phys_device" was added with sysfs support for memory hotplug in commit 3947be1969a9 ("[PATCH] memory hotplug: sysfs and add/remove functions") in 2005. It always returned 0. s390x started returning something != 0 on some setups (if sclp.rzm is set by HW) in 2010 via commit 57b552ba0b2f ("memory hotplug/s390: set phys_device"). For s390x, it allowed for identifying which memory block devices belong to the same storage increment (RZM). Only if all memory block devices comprising a single storage increment were offline, the memory could actually be removed in the hypervisor. Since commit e5d709bb5fb7 ("s390/memory hotplug: provide memory_block_size_bytes() function") in 2013 a memory block device spans at least one storage increment - which is why the interface isn't really helpful/used anymore (except by old lsmem/chmem tools). There were once RFC patches to make use of "phys_device" in ACPI context; however, the underlying problem could be solved using different interfaces [1]. [1] https://patchwork.kernel.org/patch/2163871/ [2] https://github.com/ibm-s390-tools/s390-tools/blob/v2.1.0/zconf/lsmem [3] https://github.com/ibm-s390-tools/s390-tools/blob/v2.1.0/zconf/chmem [4] https://bugzilla.redhat.com/show_bug.cgi?id=1504134 Link: https://lkml.kernel.org/r/20210201181347.13262-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Gerald Schaefer Cc: Jonathan Corbet Cc: "Rafael J. 
Wysocki" Cc: Mauro Carvalho Chehab Cc: Ilya Dryomov Cc: Vaibhav Jain Cc: Tom Rix Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../ABI/testing/sysfs-devices-memory | 5 ++-- .../admin-guide/mm/memory-hotplug.rst | 4 +-- drivers/base/memory.c | 25 +++++++------------ include/linux/memory.h | 3 +-- 4 files changed, 15 insertions(+), 22 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory index 246a45b96d22..58dbc592bc57 100644 --- a/Documentation/ABI/testing/sysfs-devices-memory +++ b/Documentation/ABI/testing/sysfs-devices-memory @@ -26,8 +26,9 @@ Date: September 2008 Contact: Badari Pulavarty Description: The file /sys/devices/system/memory/memoryX/phys_device - is read-only and is designed to show the name of physical - memory device. Implementation is currently incomplete. + is read-only; it is a legacy interface only ever used on s390x + to expose the covered storage increment. +Users: Legacy s390-tools lsmem/chmem What: /sys/devices/system/memory/memoryX/phys_index Date: September 2008 diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 5c4432c96c4b..245739f55ac7 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -160,8 +160,8 @@ Under each memory block, you can see 5 files: "online_movable", "online", "offline" command which will be performed on all sections in the block. -``phys_device`` read-only: designed to show the name of physical memory - device. This is not well implemented now. +``phys_device`` read-only: legacy interface only ever used on s390x to + expose the covered storage increment. ``removable`` read-only: contains an integer value indicating whether the memory block is removable or not removable. A value of 1 indicates that the memory diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 901e379676be..f35298425575 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -290,20 +290,20 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr, } /* - * phys_device is a bad name for this. What I really want - * is a way to differentiate between memory ranges that - * are part of physical devices that constitute - * a complete removable unit or fru. - * i.e. do these ranges belong to the same physical device, - * s.t. if I offline all of these sections I can then - * remove the physical device? + * Legacy interface that we cannot remove: s390x exposes the storage increment + * covered by a memory block, allowing for identifying which memory blocks + * comprise a storage increment. Since a memory block spans complete + * storage increments nowadays, this interface is basically unused. Other + * archs never exposed != 0. */ static ssize_t phys_device_show(struct device *dev, struct device_attribute *attr, char *buf) { struct memory_block *mem = to_memory_block(dev); + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); - return sysfs_emit(buf, "%d\n", mem->phys_device); + return sysfs_emit(buf, "%d\n", + arch_get_memory_phys_device(start_pfn)); } #ifdef CONFIG_MEMORY_HOTREMOVE @@ -488,11 +488,7 @@ static DEVICE_ATTR_WO(soft_offline_page); static DEVICE_ATTR_WO(hard_offline_page); #endif -/* - * Note that phys_device is optional. It is here to allow for - * differentiation between which *physical* devices each - * section belongs to... - */ +/* See phys_device_show(). 
*/ int __weak arch_get_memory_phys_device(unsigned long start_pfn) { return 0; @@ -574,7 +570,6 @@ int register_memory(struct memory_block *memory) static int init_memory_block(unsigned long block_id, unsigned long state) { struct memory_block *mem; - unsigned long start_pfn; int ret = 0; mem = find_memory_block_by_id(block_id); @@ -588,8 +583,6 @@ static int init_memory_block(unsigned long block_id, unsigned long state) mem->start_section_nr = block_id * sections_per_block; mem->state = state; - start_pfn = section_nr_to_pfn(mem->start_section_nr); - mem->phys_device = arch_get_memory_phys_device(start_pfn); mem->nid = NUMA_NO_NODE; ret = register_memory(mem); diff --git a/include/linux/memory.h b/include/linux/memory.h index 439a89e758d8..4da95e684e20 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -27,9 +27,8 @@ struct memory_block { unsigned long start_section_nr; unsigned long state; /* serialized by the dev->lock */ int online_type; /* for passing data to online routine */ - int phys_device; /* to which fru does this belong? */ - struct device dev; int nid; /* NID for this memory block */ + struct device dev; }; int arch_get_memory_phys_device(unsigned long start_pfn); From a89107c0478137115c6647aa28caef75513b9f40 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Feb 2021 17:17:28 -0800 Subject: [PATCH 034/118] Documentation: sysfs/memory: clarify some memory block device properties In commit 53cdc1cb29e8 ("drivers/base/memory.c: indicate all memory blocks as removable") we changed the output of the "removable" property of memory devices to return "1" if and only if the kernel supports memory offlining. Let's update documentation, stating that the interface is legacy. Also update documentation of the "state" property and "valid_zones" properties. Link: https://lkml.kernel.org/r/20210201181347.13262-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Cc: Dave Hansen Cc: Jonathan Corbet Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Jonathan Cameron Cc: Ilya Dryomov Cc: Mauro Carvalho Chehab Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../ABI/testing/sysfs-devices-memory | 51 ++++++++++++------- .../admin-guide/mm/memory-hotplug.rst | 16 +++--- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory index 58dbc592bc57..d8b0f80b9e33 100644 --- a/Documentation/ABI/testing/sysfs-devices-memory +++ b/Documentation/ABI/testing/sysfs-devices-memory @@ -13,13 +13,13 @@ What: /sys/devices/system/memory/memoryX/removable Date: June 2008 Contact: Badari Pulavarty Description: - The file /sys/devices/system/memory/memoryX/removable - indicates whether this memory block is removable or not. - This is useful for a user-level agent to determine - identify removable sections of the memory before attempting - potentially expensive hot-remove memory operation + The file /sys/devices/system/memory/memoryX/removable is a + legacy interface used to indicated whether a memory block is + likely to be offlineable or not. Newer kernel versions return + "1" if and only if the kernel supports memory offlining. 
Users: hotplug memory remove tools http://www.ibm.com/developerworks/wikis/display/LinuxP/powerpc-utils + lsmem/chmem part of util-linux What: /sys/devices/system/memory/memoryX/phys_device Date: September 2008 @@ -44,23 +44,25 @@ Date: September 2008 Contact: Badari Pulavarty Description: The file /sys/devices/system/memory/memoryX/state - is read-write. When read, its contents show the - online/offline state of the memory section. When written, - root can toggle the the online/offline state of a removable - memory section (see removable file description above) - using the following commands:: + is read-write. When read, it returns the online/offline + state of the memory block. When written, root can toggle + the online/offline state of a memory block using the following + commands:: # echo online > /sys/devices/system/memory/memoryX/state # echo offline > /sys/devices/system/memory/memoryX/state - For example, if /sys/devices/system/memory/memory22/removable - contains a value of 1 and - /sys/devices/system/memory/memory22/state contains the - string "online" the following command can be executed by - by root to offline that section:: - - # echo offline > /sys/devices/system/memory/memory22/state + On newer kernel versions, advanced states can be specified + when onlining to select a target zone: "online_movable" + selects the movable zone. "online_kernel" selects the + applicable kernel zone (DMA, DMA32, or Normal). However, + after successfully setting one of the advanced states, + reading the file will return "online"; the zone information + can be obtained via "valid_zones" instead. + While onlining is unlikely to fail, there are no guarantees + that offlining will succeed. Offlining is more likely to + succeed if "valid_zones" indicates "Movable". Users: hotplug memory remove tools http://www.ibm.com/developerworks/wikis/display/LinuxP/powerpc-utils @@ -70,8 +72,19 @@ Date: July 2014 Contact: Zhang Zhen Description: The file /sys/devices/system/memory/memoryX/valid_zones is - read-only and is designed to show which zone this memory - block can be onlined to. + read-only. + + For online memory blocks, it returns in which zone memory + provided by a memory block is managed. If multiple zones + apply (not applicable for hotplugged memory), "None" is returned + and the memory block cannot be offlined. + + For offline memory blocks, it returns by which zone memory + provided by a memory block can be managed when onlining. + The first returned zone ("default") will be used when setting + the state of an offline memory block to "online". Only one of + the kernel zones (DMA, DMA32, Normal) is applicable for a single + memory block. What: /sys/devices/system/memoryX/nodeY Date: October 2009 diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 245739f55ac7..5307f90738aa 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -162,14 +162,14 @@ Under each memory block, you can see 5 files: which will be performed on all sections in the block. ``phys_device`` read-only: legacy interface only ever used on s390x to expose the covered storage increment. -``removable`` read-only: contains an integer value indicating - whether the memory block is removable or not - removable. A value of 1 indicates that the memory - block is removable and a value of 0 indicates that - it is not removable. A memory block is removable only if - every section in the block is removable. 
-``valid_zones`` read-only: designed to show which zones this memory block - can be onlined to. +``valid_zones`` read-only: designed to show by which zone memory provided by + a memory block is managed, and to show by which zone memory + provided by an offline memory block could be managed when + onlining. The first column shows its default zone. From bca3feaa0764ab5a4cbe6817871601f1d00c059d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 25 Feb 2021 17:17:33 -0800 Subject: [PATCH 035/118] mm/memory_hotplug: prevalidate the address range being added with platform Patch series "mm/memory_hotplug: Pre-validate the address range with platform", v5. This series adds a mechanism allowing platforms to weigh in and prevalidate the incoming address range before proceeding further with the memory hotplug. This helps prevent potential platform errors for the given address range, down the hotplug call chain, which inevitably fails the hotplug itself. This mechanism was suggested by David Hildenbrand during another discussion with respect to a memory hotplug fix on the arm64 platform. https://lore.kernel.org/linux-arm-kernel/1600332402-30123-1-git-send-email-anshuman.khandual@arm.com/ This mechanism focuses on the addressability aspect and not the [sub]section alignment aspect. Hence check_hotplug_memory_range() and check_pfn_span() have been left unchanged. This patch (of 4): This introduces mhp_range_allowed() which can be called in various memory hotplug paths to prevalidate the address range which is being added, with the platform. Then mhp_range_allowed() calls mhp_get_pluggable_range() which provides the applicable address range depending on whether linear mapping is required or not. For ranges that require linear mapping, it calls a new arch callback arch_get_mappable_range() which the platform can override. So the new callback, in turn, provides the platform an opportunity to configure acceptable memory hotplug address ranges in case there are constraints. This mechanism will help prevent platform specific errors deep down during hotplug calls. This drops the now redundant check_hotplug_memory_addressable() check in __add_pages() but instead adds a VM_BUG_ON() check which would ensure that the range has been validated with mhp_range_allowed() earlier in the call chain. Besides, mhp_get_pluggable_range() can also be used by potential memory hotplug callers to obtain the allowed physical range which would go through on a given platform. This does not really add any new range check in generic memory hotplug but instead compensates for lost checks in arch_add_memory() where applicable and check_hotplug_memory_addressable(), with unified mhp_range_allowed(). [akpm@linux-foundation.org: make pagemap_range() return -EINVAL when mhp_range_allowed() fails] Link: https://lkml.kernel.org/r/1612149902-7867-1-git-send-email-anshuman.khandual@arm.com Link: https://lkml.kernel.org/r/1612149902-7867-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Heiko Carstens Cc: Catalin Marinas Cc: Vasily Gorbik # s390 Cc: Will Deacon Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Jason Wang Cc: Jonathan Cameron Cc: "Michael S. 
Tsirkin" Cc: Michal Hocko Cc: Pankaj Gupta Cc: Pankaj Gupta Cc: teawater Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 10 +++++ mm/memory_hotplug.c | 78 +++++++++++++++++++++++++--------- mm/memremap.c | 8 +++- 3 files changed, 76 insertions(+), 20 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 08eeef679ab7..7288aa5ef73b 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -66,6 +66,9 @@ struct mhp_params { pgprot_t pgprot; }; +bool mhp_range_allowed(u64 start, u64 size, bool need_mapping); +struct range mhp_get_pluggable_range(bool need_mapping); + /* * Zone resizing functions * @@ -266,6 +269,13 @@ static inline bool movable_node_is_enabled(void) } #endif /* ! CONFIG_MEMORY_HOTPLUG */ +/* + * Keep this declaration outside CONFIG_MEMORY_HOTPLUG as some + * platforms might override and use arch_get_mappable_range() + * for internal non memory hotplug purposes. + */ +struct range arch_get_mappable_range(void); + #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * pgdat resizing functions diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a969463bdda4..5ba51a8bdaeb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -107,6 +107,9 @@ static struct resource *register_memory_resource(u64 start, u64 size, if (strcmp(resource_name, "System RAM")) flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED; + if (!mhp_range_allowed(start, size, true)) + return ERR_PTR(-E2BIG); + /* * Make sure value parsed from 'mem=' only restricts memory adding * while booting, so that memory hotplug won't be impacted. Please @@ -284,22 +287,6 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, return 0; } -static int check_hotplug_memory_addressable(unsigned long pfn, - unsigned long nr_pages) -{ - const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1; - - if (max_addr >> MAX_PHYSMEM_BITS) { - const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1; - WARN(1, - "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n", - (u64)PFN_PHYS(pfn), max_addr, max_allowed); - return -E2BIG; - } - - return 0; -} - /* * Return page for the valid pfn only if the page is online. All pfn * walkers which rely on the fully initialized page->flags and others @@ -365,9 +352,7 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, if (WARN_ON_ONCE(!params->pgprot.pgprot)) return -EINVAL; - err = check_hotplug_memory_addressable(pfn, nr_pages); - if (err) - return err; + VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false)); if (altmap) { /* @@ -1248,6 +1233,61 @@ out_unlock: } EXPORT_SYMBOL_GPL(add_memory_driver_managed); +/* + * Platforms should define arch_get_mappable_range() that provides + * maximum possible addressable physical memory range for which the + * linear mapping could be created. The platform returned address + * range must adhere to these following semantics. + * + * - range.start <= range.end + * - Range includes both end points [range.start..range.end] + * + * There is also a fallback definition provided here, allowing the + * entire possible physical address range in case any platform does + * not define arch_get_mappable_range(). 
+ */ +struct range __weak arch_get_mappable_range(void) +{ + struct range mhp_range = { + .start = 0UL, + .end = -1ULL, + }; + return mhp_range; +} + +struct range mhp_get_pluggable_range(bool need_mapping) +{ + const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1; + struct range mhp_range; + + if (need_mapping) { + mhp_range = arch_get_mappable_range(); + if (mhp_range.start > max_phys) { + mhp_range.start = 0; + mhp_range.end = 0; + } + mhp_range.end = min_t(u64, mhp_range.end, max_phys); + } else { + mhp_range.start = 0; + mhp_range.end = max_phys; + } + return mhp_range; +} +EXPORT_SYMBOL_GPL(mhp_get_pluggable_range); + +bool mhp_range_allowed(u64 start, u64 size, bool need_mapping) +{ + struct range mhp_range = mhp_get_pluggable_range(need_mapping); + u64 end = start + size; + + if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end) + return true; + + pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n", + start, end, mhp_range.start, mhp_range.end); + return false; +} + #ifdef CONFIG_MEMORY_HOTREMOVE /* * Confirm all pages in a range [start, end) belong to the same zone (skipping diff --git a/mm/memremap.c b/mm/memremap.c index 2455bac89506..7aa7d6e80ee5 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -200,6 +200,7 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref) static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, int range_id, int nid) { + const bool is_private = pgmap->type == MEMORY_DEVICE_PRIVATE; struct range *range = &pgmap->ranges[range_id]; struct dev_pagemap *conflict_pgmap; int error, is_ram; @@ -245,6 +246,11 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, if (error) goto err_pfn_remap; + if (!mhp_range_allowed(range->start, range_len(range), !is_private)) { + error = -EINVAL; + goto err_pfn_remap; + } + mem_hotplug_begin(); /* @@ -258,7 +264,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, * the CPU, we do want the linear mapping and thus use * arch_add_memory(). */ - if (pgmap->type == MEMORY_DEVICE_PRIVATE) { + if (is_private) { error = add_pages(nid, PHYS_PFN(range->start), PHYS_PFN(range_len(range)), params); } else { From 03aaf83fba6e5af08b5dd174c72edee9b7d9ed9b Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 25 Feb 2021 17:17:37 -0800 Subject: [PATCH 036/118] arm64/mm: define arch_get_mappable_range() This overrides arch_get_mappable_range() on arm64 platform which will be used with recently added generic framework. It drops inside_linear_region() and subsequent check in arch_add_memory() which are no longer required. It also adds a VM_BUG_ON() check that would ensure that mhp_range_allowed() has already been called. Link: https://lkml.kernel.org/r/1612149902-7867-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: David Hildenbrand Reviewed-by: Catalin Marinas Cc: Will Deacon Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Heiko Carstens Cc: Jason Wang Cc: Jonathan Cameron Cc: "Michael S. 
Tsirkin" Cc: Michal Hocko Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pankaj Gupta Cc: teawater Cc: Vasily Gorbik Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 25af183e4bed..d0758d24a42d 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1444,16 +1444,19 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size) free_empty_tables(start, end, PAGE_OFFSET, PAGE_END); } -static bool inside_linear_region(u64 start, u64 size) +struct range arch_get_mappable_range(void) { + struct range mhp_range; + /* * Linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)] * accommodating both its ends but excluding PAGE_END. Max physical * range which can be mapped inside this linear mapping range, must * also be derived from its end points. */ - return start >= __pa(_PAGE_OFFSET(vabits_actual)) && - (start + size - 1) <= __pa(PAGE_END - 1); + mhp_range.start = __pa(_PAGE_OFFSET(vabits_actual)); + mhp_range.end = __pa(PAGE_END - 1); + return mhp_range; } int arch_add_memory(int nid, u64 start, u64 size, @@ -1461,11 +1464,7 @@ int arch_add_memory(int nid, u64 start, u64 size, { int ret, flags = 0; - if (!inside_linear_region(start, size)) { - pr_err("[%llx %llx] is outside linear mapping region\n", start, start + size); - return -EINVAL; - } - + VM_BUG_ON(!mhp_range_allowed(start, size, true)); if (rodata_full || debug_pagealloc_enabled()) flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; From 7707248a4727c4e8ee8d84ed578a9807d8994a40 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 25 Feb 2021 17:17:41 -0800 Subject: [PATCH 037/118] s390/mm: define arch_get_mappable_range() This overrides arch_get_mappable_range() on the s390 platform, which will be used with the recently added generic framework. It modifies the existing range check in vmem_add_mapping() using arch_get_mappable_range(). It also adds a VM_BUG_ON() check that would ensure that mhp_range_allowed() has already been called on the hotplug path. Link: https://lkml.kernel.org/r/1612149902-7867-4-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Heiko Carstens Reviewed-by: David Hildenbrand Cc: Vasily Gorbik Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Jason Wang Cc: Jonathan Cameron Cc: Mark Rutland Cc: "Michael S. 
Tsirkin" Cc: Michal Hocko Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pankaj Gupta Cc: teawater Cc: Wei Yang Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/init.c | 1 + arch/s390/mm/vmem.c | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 73a163065b95..0e76b2127dc6 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -297,6 +297,7 @@ int arch_add_memory(int nid, u64 start, u64 size, if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) return -EINVAL; + VM_BUG_ON(!mhp_range_allowed(start, size, true)); rc = vmem_add_mapping(start, size); if (rc) return rc; diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 01f3a5f58e64..82dbf9450105 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -4,6 +4,7 @@ * Author(s): Heiko Carstens */ +#include #include #include #include @@ -532,11 +533,22 @@ void vmem_remove_mapping(unsigned long start, unsigned long size) mutex_unlock(&vmem_mutex); } +struct range arch_get_mappable_range(void) +{ + struct range mhp_range; + + mhp_range.start = 0; + mhp_range.end = VMEM_MAX_PHYS - 1; + return mhp_range; +} + int vmem_add_mapping(unsigned long start, unsigned long size) { + struct range range = arch_get_mappable_range(); int ret; - if (start + size > VMEM_MAX_PHYS || + if (start < range.start || + start + size > range.end + 1 || start + size < start) return -ERANGE; From 94c8945376d44b37aa3ab5b58669a2a86326968e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Feb 2021 17:17:45 -0800 Subject: [PATCH 038/118] virtio-mem: check against mhp_get_pluggable_range() which memory we can hotplug Right now, we only check against MAX_PHYSMEM_BITS - but it turns out there are more restrictions on which memory we can actually hotplug, especially on arm64 or s390x once we support them: we might receive something like -E2BIG or -ERANGE from add_memory_driver_managed(), stopping device operation. So, check right when initializing the device which memory we can add, warning the user. Try only adding actually pluggable ranges: in the worst case, no memory provided by our device is pluggable. In the usual case, we expect all device memory to be pluggable, and in corner cases only some memory at the end of the device-managed memory region to not be pluggable. Link: https://lkml.kernel.org/r/1612149902-7867-5-git-send-email-anshuman.khandual@arm.com Signed-off-by: David Hildenbrand Signed-off-by: Anshuman Khandual Cc: "Michael S. 
Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Cc: Oscar Salvador Cc: Wei Yang Cc: teawater Cc: Anshuman Khandual Cc: Pankaj Gupta Cc: Jonathan Cameron Cc: Vasily Gorbik Cc: Will Deacon Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Heiko Carstens Cc: Michal Hocko Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/virtio/virtio_mem.c | 41 ++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index d44e43869f17..1119e0c6f6c1 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -2222,7 +2222,7 @@ static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm) */ static void virtio_mem_refresh_config(struct virtio_mem *vm) { - const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS; + const struct range pluggable_range = mhp_get_pluggable_range(true); uint64_t new_plugged_size, usable_region_size, end_addr; /* the plugged_size is just a reflection of what _we_ did previously */ @@ -2234,15 +2234,25 @@ static void virtio_mem_refresh_config(struct virtio_mem *vm) /* calculate the last usable memory block id */ virtio_cread_le(vm->vdev, struct virtio_mem_config, usable_region_size, &usable_region_size); - end_addr = vm->addr + usable_region_size; - end_addr = min(end_addr, phys_limit); + end_addr = min(vm->addr + usable_region_size - 1, + pluggable_range.end); - if (vm->in_sbm) - vm->sbm.last_usable_mb_id = - virtio_mem_phys_to_mb_id(end_addr) - 1; - else - vm->bbm.last_usable_bb_id = - virtio_mem_phys_to_bb_id(vm, end_addr) - 1; + if (vm->in_sbm) { + vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr); + if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes())) + vm->sbm.last_usable_mb_id--; + } else { + vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm, + end_addr); + if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size)) + vm->bbm.last_usable_bb_id--; + } + /* + * If we cannot plug any of our device memory (e.g., nothing in the + * usable region is addressable), the last usable memory block id will + * be smaller than the first usable memory block id. We'll stop + * attempting to add memory with -ENOSPC from our main loop. + */ /* see if there is a request to change the size */ virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size, @@ -2364,7 +2374,7 @@ static int virtio_mem_init_vq(struct virtio_mem *vm) static int virtio_mem_init(struct virtio_mem *vm) { - const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS; + const struct range pluggable_range = mhp_get_pluggable_range(true); uint64_t sb_size, addr; uint16_t node_id; @@ -2405,9 +2415,10 @@ static int virtio_mem_init(struct virtio_mem *vm) if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes())) dev_warn(&vm->vdev->dev, "The alignment of the physical end address can make some memory unusable.\n"); - if (vm->addr + vm->region_size > phys_limit) + if (vm->addr < pluggable_range.start || + vm->addr + vm->region_size - 1 > pluggable_range.end) dev_warn(&vm->vdev->dev, - "Some memory is not addressable. This can make some memory unusable.\n"); + "Some device memory is not addressable/pluggable. 
This can make some memory unusable.\n"); /* * We want subblocks to span at least MAX_ORDER_NR_PAGES and @@ -2429,7 +2440,8 @@ static int virtio_mem_init(struct virtio_mem *vm) vm->sbm.sb_size; /* Round up to the next full memory block */ - addr = vm->addr + memory_block_size_bytes() - 1; + addr = max_t(uint64_t, vm->addr, pluggable_range.start) + + memory_block_size_bytes() - 1; vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr); vm->sbm.next_mb_id = vm->sbm.first_mb_id; } else { @@ -2450,7 +2462,8 @@ static int virtio_mem_init(struct virtio_mem *vm) } /* Round up to the next aligned big block */ - addr = vm->addr + vm->bbm.bb_size - 1; + addr = max_t(uint64_t, vm->addr, pluggable_range.start) + + vm->bbm.bb_size - 1; vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr); vm->bbm.next_bb_id = vm->bbm.first_bb_id; } From 48b03eea321c85185d173cb0d112698b79b1c98e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:17:49 -0800 Subject: [PATCH 039/118] mm/mlock: stop counting mlocked pages when no vma is found There will be no vma satisfying addr < vm_end when find_vma() returns NULL. Thus it's meaningless to traverse the vma list below because we can't find any vma to count mlocked pages. Stop counting mlocked pages in this case to save some vma list traversal cycles. Link: https://lkml.kernel.org/r/20210204110705.17586-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mlock.c b/mm/mlock.c index 73960bb3464d..f8f8cc32d03d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -622,7 +622,7 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, vma = find_vma(mm, start); if (vma == NULL) - vma = mm->mmap; + return 0; for (; vma ; vma = vma->vm_next) { if (start >= vma->vm_end) From aaf1f990aee40bc74b425ef8f51201ae21b85ed7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:17:53 -0800 Subject: [PATCH 040/118] mm/rmap: correct some obsolete comments of anon_vma Commit 2b575eb64f7a ("mm: convert anon_vma->lock to a mutex") changed the spinlock used to serialize access to the vma list to a mutex. Further, commit 5a505085f043 ("mm/rmap: Convert the struct anon_vma::mutex to an rwsem") converted the mutex to an rwsem to solve a scalability problem. So replace spinlock with rwsem to make the comments up to date. Link: https://lkml.kernel.org/r/20210123072459.25903-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index e26ae119a131..f6f43620cd97 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -168,7 +168,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * * Anon-vma allocations are very subtle, because we may have * optimistically looked up an anon_vma in page_lock_anon_vma_read() - * and that may actually touch the spinlock even in the newly + * and that may actually touch the rwsem even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). * @@ -359,7 +359,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) goto out_error_free_anon_vma; /* - * The root anon_vma's spinlock is the lock actually used when we + * The root anon_vma's rwsem is the lock actually used when we * lock any of the anon_vmas in this anon_vma tree. 
*/ anon_vma->root = pvma->anon_vma->root; From e0af87ff7afcde2660be44302836d2d5618185af Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:17:56 -0800 Subject: [PATCH 041/118] mm/rmap: remove unneeded semicolon in page_not_mapped() Remove extra semicolon without any functional change intended. Link: https://lkml.kernel.org/r/20210127093425.39640-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index f6f43620cd97..46fdbf541b8e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1784,7 +1784,7 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) static int page_not_mapped(struct page *page) { return !page_mapped(page); -}; +} /** * try_to_munlock - try to munlock a page From 90aaca852ca13a6c962b25964fb6678120f266b1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:17:59 -0800 Subject: [PATCH 042/118] mm/rmap: fix obsolete comment in __page_check_anon_rmap() Commit 21333b2b66b8 ("ksm: no debug in page_dup_rmap()") has reverted page_dup_rmap() to an inline atomic_inc of mapcount. So page_dup_rmap() does not call __page_check_anon_rmap() anymore. Link: https://lkml.kernel.org/r/20210128110209.50857-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 46fdbf541b8e..c3f6e060d73f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1086,8 +1086,7 @@ static void __page_check_anon_rmap(struct page *page, * be set up correctly at this point. * * We have exclusion against page_add_anon_rmap because the caller - * always holds the page locked, except if called from page_dup_rmap, - * in which case the page is already known to be setup. + * always holds the page locked. * * We have exclusion against page_add_new_anon_rmap because those pages * are initially only visible via the pagetables, and the pte is locked From b7e188ec98b1644ff70a6d3624ea16aadc39f5e0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:18:03 -0800 Subject: [PATCH 043/118] mm/rmap: use page_not_mapped in try_to_unmap() page_mapcount_is_zero() calculates accurately how many mappings a hugepage has in order to check against 0 only. This is a waste of cpu time. We can do this via page_not_mapped() to save some possible atomic_read cycles. Remove the function page_mapcount_is_zero() as it's not used anymore and move page_not_mapped() above try_to_unmap() to avoid identifier undeclared compilation error. 
Link: https://lkml.kernel.org/r/20210130084904.35307-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index c3f6e060d73f..b49e85605f8f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1736,9 +1736,9 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return vma_is_temporary_stack(vma); } -static int page_mapcount_is_zero(struct page *page) +static int page_not_mapped(struct page *page) { - return !total_mapcount(page); + return !page_mapped(page); } /** @@ -1756,7 +1756,7 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)flags, - .done = page_mapcount_is_zero, + .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, }; @@ -1780,11 +1780,6 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) return !page_mapcount(page) ? true : false; } -static int page_not_mapped(struct page *page) -{ - return !page_mapped(page); -} - /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked From ad8a20cf6d19a9506b4a554030bafc1ac204ef31 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:18:06 -0800 Subject: [PATCH 044/118] mm/rmap: correct obsolete comment of page_get_anon_vma() Since commit 746b18d421da ("mm: use refcounts for page_lock_anon_vma()"), page_lock_anon_vma() is renamed to page_get_anon_vma() and converted to return a refcount increased anon_vma. But it forgot to change the relevant comment. Link: https://lkml.kernel.org/r/20210203093215.31990-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index b49e85605f8f..b0fc27e77d6d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -462,8 +462,8 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is tricky! * * Since there is no serialization what so ever against page_remove_rmap() - * the best this function can do is return a locked anon_vma that might - * have been relevant to this page. + * the best this function can do is return a refcount increased anon_vma + * that might have been relevant to this page. * * The page might have been remapped to a different anon_vma or the anon_vma * returned may already be freed (and even reused). From 5d5d19eda6b0ee790af89c45e3f678345be6f50f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:18:09 -0800 Subject: [PATCH 045/118] mm/rmap: fix potential pte_unmap on a not mapped pte For a PMD-mapped page (usually THP), pvmw->pte is NULL. For PTE-mapped THP, pvmw->pte is mapped. But for HugeTLB pages, pvmw->pte is not mapped and set to the relevant page table entry. So in page_vma_mapped_walk_done(), we may do pte_unmap() for a HugeTLB pte which is not mapped. Fix this by checking pvmw->page against PageHuge before trying to do pte_unmap(). Link: https://lkml.kernel.org/r/20210127093349.39081-1-linmiaohe@huawei.com Fixes: ace71a19cec5 ("mm: introduce page_vma_mapped_walk()") Signed-off-by: Hongxiang Lou Signed-off-by: Miaohe Lin Tested-by: Sedat Dilek Cc: Kees Cook Cc: Nathan Chancellor Cc: Mike Kravetz Cc: Shakeel Butt Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Michel Lespinasse Cc: Nick Desaulniers Cc: "Kirill A. 
Shutemov" Cc: Wei Yang Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Brian Geffon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 70085ca1a3fc..def5c62c93b3 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -213,7 +213,8 @@ struct page_vma_mapped_walk { static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) { - if (pvmw->pte) + /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */ + if (pvmw->pte && !PageHuge(pvmw->page)) pte_unmap(pvmw->pte); if (pvmw->ptl) spin_unlock(pvmw->ptl); From c0c641d77b9ab0da798ca86d34d2327d6f427f4c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Feb 2021 17:18:13 -0800 Subject: [PATCH 046/118] mm: zswap: clean up confusing comment Correct wording and change one duplicated word (it) to "it is". Link: https://lkml.kernel.org/r/20201221042848.13980-1-rdunlap@infradead.org Fixes: 0ab0abcf5115 ("mm/zswap: refactor the get/put routines") Signed-off-by: Randy Dunlap Cc: Weijie Yang Cc: Seth Jennings Cc: Seth Jennings Cc: Dan Streetman Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 182f6ad5aa69..1e41c2857068 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1022,10 +1022,10 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) /* * if we get here due to ZSWAP_SWAPCACHE_EXIST - * a load may happening concurrently - * it is safe and okay to not free the entry + * a load may be happening concurrently. + * it is safe and okay to not free the entry. * if we free the entry in the following put - * it it either okay to return !0 + * it is also okay to return !0 */ fail: spin_lock(&tree->lock); From fc6697a89f56d9773b2fbff718d4cf2a6d63379d Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Thu, 25 Feb 2021 17:18:17 -0800 Subject: [PATCH 047/118] mm/zswap: add the flag can_sleep_mapped Patch series "Fix the compatibility of zsmalloc and zswap". Patch #1 adds a flag to zpool, which zswap then uses to determine whether zpool drivers such as zbud/z3fold/zsmalloc will enter an atomic context after mapping. The difference between zbud/z3fold and zsmalloc is that zsmalloc requires an atomic context since its map function disables preemption, whereas zbud/z3fold do not require an atomic context. So patch #2 sets the flag sleep_mapped to true, indicating that zbud/z3fold can sleep after mapping. zsmalloc doesn't support sleeping after mapping, so the flag is not set for it. This patch (of 2): Add a flag to zpool, named "can_sleep_mapped", and set it to true for zbud/z3fold; don't set this flag for zsmalloc, so its default value is false. Then zswap can take the current path if the flag is true; if it's false, copy the data from src to a temporary buffer, then unmap the handle, take the mutex, and process the buffer instead of src, to avoid a sleeping function being called from atomic context. 
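For illustration, a minimal sketch of the pattern described above (not the actual zswap code; process_entry(), len and do_sleeping_work() are made-up placeholders): when the backend cannot sleep while a handle is mapped, the data is copied out under GFP_ATOMIC, the handle is unmapped, and the copy is processed instead of the mapped buffer.

static int process_entry(struct zpool *pool, unsigned long handle,
			 unsigned int len)
{
	u8 *src, *tmp = NULL;
	int ret;

	src = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
	if (!zpool_can_sleep_mapped(pool)) {
		tmp = kmalloc(len, GFP_ATOMIC);
		if (!tmp) {
			zpool_unmap_handle(pool, handle);
			return -ENOMEM;
		}
		memcpy(tmp, src, len);
		zpool_unmap_handle(pool, handle);
		src = tmp;	/* safe to sleep from here on */
	}

	ret = do_sleeping_work(src, len);	/* placeholder: take the mutex, decompress, ... */

	if (zpool_can_sleep_mapped(pool))
		zpool_unmap_handle(pool, handle);
	else
		kfree(tmp);
	return ret;
}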
[natechancellor@gmail.com: add return value in zswap_frontswap_load] Link: https://lkml.kernel.org/r/20210121214804.926843-1-natechancellor@gmail.com [tiantao6@hisilicon.com: fix potential memory leak] Link: https://lkml.kernel.org/r/1611538365-51811-1-git-send-email-tiantao6@hisilicon.com [colin.king@canonical.com: fix potential uninitialized pointer read on tmp] Link: https://lkml.kernel.org/r/20210128141728.639030-1-colin.king@canonical.com [tiantao6@hisilicon.com: fix variable 'entry' is uninitialized when used] Link: https://lkml.kernel.org/r/1611223030-58346-1-git-send-email-tiantao6@hisilicon.comLink: https://lkml.kernel.org/r/1611035683-12732-1-git-send-email-tiantao6@hisilicon.com Link: https://lkml.kernel.org/r/1611035683-12732-2-git-send-email-tiantao6@hisilicon.com Signed-off-by: Tian Tao Signed-off-by: Nathan Chancellor Signed-off-by: Colin Ian King Reviewed-by: Vitaly Wool Acked-by: Sebastian Andrzej Siewior Reported-by: Mike Galbraith Cc: Barry Song Cc: Dan Streetman Cc: Seth Jennings Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zpool.h | 3 +++ mm/zpool.c | 13 +++++++++++ mm/zswap.c | 51 ++++++++++++++++++++++++++++++++++++++----- 3 files changed, 62 insertions(+), 5 deletions(-) diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 51bf43076165..e8997010612a 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -73,6 +73,7 @@ u64 zpool_get_total_size(struct zpool *pool); * @malloc: allocate mem from a pool. * @free: free mem from a pool. * @shrink: shrink the pool. + * @sleep_mapped: whether zpool driver can sleep during map. * @map: map a handle. * @unmap: unmap a handle. * @total_size: get total size of a pool. @@ -100,6 +101,7 @@ struct zpool_driver { int (*shrink)(void *pool, unsigned int pages, unsigned int *reclaimed); + bool sleep_mapped; void *(*map)(void *pool, unsigned long handle, enum zpool_mapmode mm); void (*unmap)(void *pool, unsigned long handle); @@ -112,5 +114,6 @@ void zpool_register_driver(struct zpool_driver *driver); int zpool_unregister_driver(struct zpool_driver *driver); bool zpool_evictable(struct zpool *pool); +bool zpool_can_sleep_mapped(struct zpool *pool); #endif diff --git a/mm/zpool.c b/mm/zpool.c index 3744a2d1a624..5ed71207ced7 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -23,6 +23,7 @@ struct zpool { void *pool; const struct zpool_ops *ops; bool evictable; + bool can_sleep_mapped; struct list_head list; }; @@ -183,6 +184,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, zpool->pool = driver->create(name, gfp, ops, zpool); zpool->ops = ops; zpool->evictable = driver->shrink && ops && ops->evict; + zpool->can_sleep_mapped = driver->sleep_mapped; if (!zpool->pool) { pr_err("couldn't create %s pool\n", type); @@ -393,6 +395,17 @@ bool zpool_evictable(struct zpool *zpool) return zpool->evictable; } +/** + * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped. + * @zpool: The zpool to test + * + * Returns: true if zpool can sleep; false otherwise. 
+ */ +bool zpool_can_sleep_mapped(struct zpool *zpool) +{ + return zpool->can_sleep_mapped; +} + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dan Streetman "); MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zswap.c b/mm/zswap.c index 1e41c2857068..578d9f256920 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -935,13 +935,19 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - u8 *src; + u8 *src, *tmp = NULL; unsigned int dlen; int ret; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; + if (!zpool_can_sleep_mapped(pool)) { + tmp = kmalloc(PAGE_SIZE, GFP_ATOMIC); + if (!tmp) + return -ENOMEM; + } + /* extract swpentry from data */ zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); swpentry = zhdr->swpentry; /* here */ @@ -955,6 +961,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) /* entry was invalidated */ spin_unlock(&tree->lock); zpool_unmap_handle(pool, handle); + kfree(tmp); return 0; } spin_unlock(&tree->lock); @@ -979,6 +986,14 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) dlen = PAGE_SIZE; src = (u8 *)zhdr + sizeof(struct zswap_header); + if (!zpool_can_sleep_mapped(pool)) { + + memcpy(tmp, src, entry->length); + src = tmp; + + zpool_unmap_handle(pool, handle); + } + mutex_lock(acomp_ctx->mutex); sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); @@ -1033,7 +1048,11 @@ fail: spin_unlock(&tree->lock); end: - zpool_unmap_handle(pool, handle); + if (zpool_can_sleep_mapped(pool)) + zpool_unmap_handle(pool, handle); + else + kfree(tmp); + return ret; } @@ -1235,7 +1254,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, struct zswap_entry *entry; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - u8 *src, *dst; + u8 *src, *dst, *tmp; unsigned int dlen; int ret; @@ -1253,15 +1272,33 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, dst = kmap_atomic(page); zswap_fill_page(dst, entry->value); kunmap_atomic(dst); + ret = 0; goto freeentry; } + if (!zpool_can_sleep_mapped(entry->pool->zpool)) { + + tmp = kmalloc(entry->length, GFP_ATOMIC); + if (!tmp) { + ret = -ENOMEM; + goto freeentry; + } + } + /* decompress */ dlen = PAGE_SIZE; src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); if (zpool_evictable(entry->pool->zpool)) src += sizeof(struct zswap_header); + if (!zpool_can_sleep_mapped(entry->pool->zpool)) { + + memcpy(tmp, src, entry->length); + src = tmp; + + zpool_unmap_handle(entry->pool->zpool, entry->handle); + } + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); mutex_lock(acomp_ctx->mutex); sg_init_one(&input, src, entry->length); @@ -1271,7 +1308,11 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); mutex_unlock(acomp_ctx->mutex); - zpool_unmap_handle(entry->pool->zpool, entry->handle); + if (zpool_can_sleep_mapped(entry->pool->zpool)) + zpool_unmap_handle(entry->pool->zpool, entry->handle); + else + kfree(tmp); + BUG_ON(ret); freeentry: @@ -1279,7 +1320,7 @@ freeentry: zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - return 0; + return ret; } /* frees an entry in zswap */ From e818e820c6a0e819d239264fc863531bbcd72c30 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Thu, 25 Feb 2021 17:18:22 -0800 Subject: [PATCH 048/118] mm: set the sleep_mapped to true for zbud and z3fold zpool driver adds a flag 
to indicate whether the zpool driver can enter an atomic context after mapping. This patch sets it to true for z3fold and zbud. Link: https://lkml.kernel.org/r/1611035683-12732-3-git-send-email-tiantao6@hisilicon.com Signed-off-by: Tian Tao Reviewed-by: Vitaly Wool Acked-by: Sebastian Andrzej Siewior Reported-by: Mike Galbraith Cc: Seth Jennings Cc: Dan Streetman Cc: Barry Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 1 + mm/zbud.c | 1 + 2 files changed, 2 insertions(+) diff --git a/mm/z3fold.c b/mm/z3fold.c index c1ccf6bb0ffb..b5dafa7e44e4 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1771,6 +1771,7 @@ static u64 z3fold_zpool_total_size(void *pool) static struct zpool_driver z3fold_zpool_driver = { .type = "z3fold", + .sleep_mapped = true, .owner = THIS_MODULE, .create = z3fold_zpool_create, .destroy = z3fold_zpool_destroy, diff --git a/mm/zbud.c b/mm/zbud.c index c49966ece674..7ec5f27a68b0 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -203,6 +203,7 @@ static u64 zbud_zpool_total_size(void *pool) static struct zpool_driver zbud_zpool_driver = { .type = "zbud", + .sleep_mapped = true, .owner = THIS_MODULE, .create = zbud_zpool_create, .destroy = zbud_zpool_destroy, From f0231305acd53375c6cf736971bf5711105dd6bb Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:18:27 -0800 Subject: [PATCH 049/118] mm/zsmalloc.c: convert to use kmem_cache_zalloc in cache_alloc_zspage() We always memset the zspage allocated via cache_alloc_zspage. So it's more convenient to use kmem_cache_zalloc in cache_alloc_zspage than having the caller do it manually. Link: https://lkml.kernel.org/r/20210114120032.25885-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 7289f502ffac..cf0ed0e4e911 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -357,7 +357,7 @@ static void cache_free_handle(struct zs_pool *pool, unsigned long handle) static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) { - return kmem_cache_alloc(pool->zspage_cachep, + return kmem_cache_zalloc(pool->zspage_cachep, flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); } @@ -1064,7 +1064,6 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, if (!zspage) return NULL; - memset(zspage, 0, sizeof(struct zspage)); zspage->magic = ZSPAGE_MAGIC; migrate_lock_init(zspage); From 2395928158059b8f9858365fce7713ce7fef62e4 Mon Sep 17 00:00:00 2001 From: Rokudo Yan Date: Thu, 25 Feb 2021 17:18:31 -0800 Subject: [PATCH 050/118] zsmalloc: account the number of compacted pages correctly There exist multiple paths that may do zram compaction concurrently. 1. auto-compaction triggered during memory reclaim 2. userspace utils writing the zram/compaction node So, multiple threads may call zs_shrinker_scan/zs_compact concurrently. But pages_compacted is a per zsmalloc pool variable, and modification of the variable is not serialized (though it happens under class->lock). There are two issues here: 1. pages_compacted may not equal the total number of pages freed (due to concurrent adds). 2. zs_shrinker_scan may not return the correct number of pages freed (by the current shrinker invocation). The fix is simple: 1. account the number of pages freed in zs_compact locally. 2. use the atomic variable pages_compacted to accumulate the total number. 
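Roughly, the fixed accounting amounts to the following scheme (a simplified sketch, not the actual zsmalloc code; compact_pool() is an illustrative placeholder): each caller accumulates the pages it freed in a local variable and only folds that into the shared atomic counter at the end, so concurrent callers no longer see each other's progress as their own.

static unsigned long compact_pool(struct zs_pool *pool)
{
	unsigned long freed = 0;

	/* ... free zspages, adding to "freed" locally ... */

	atomic_long_add(freed, &pool->stats.pages_compacted);	/* global total */
	return freed;	/* only what this call freed */
}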
Link: https://lkml.kernel.org/r/20210202122235.26885-1-wu-yan@tcl.com Fixes: 860c707dca155a56 ("zsmalloc: account the number of compacted pages") Signed-off-by: Rokudo Yan Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 2 +- include/linux/zsmalloc.h | 2 +- mm/zsmalloc.c | 17 +++++++++++------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d7018543842e..a711a2e2a794 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1081,7 +1081,7 @@ static ssize_t mm_stat_show(struct device *dev, zram->limit_pages << PAGE_SHIFT, max_used << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.same_pages), - pool_stats.pages_compacted, + atomic_long_read(&pool_stats.pages_compacted), (u64)atomic64_read(&zram->stats.huge_pages), (u64)atomic64_read(&zram->stats.huge_pages_since)); up_read(&zram->init_lock); diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 4807ca4d52e0..2a430e713ce5 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -35,7 +35,7 @@ enum zs_mapmode { struct zs_pool_stats { /* How many pages were migrated (freed) */ - unsigned long pages_compacted; + atomic_long_t pages_compacted; }; struct zs_pool; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index cf0ed0e4e911..1518732f95c3 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2212,11 +2212,13 @@ static unsigned long zs_can_compact(struct size_class *class) return obj_wasted * class->pages_per_zspage; } -static void __zs_compact(struct zs_pool *pool, struct size_class *class) +static unsigned long __zs_compact(struct zs_pool *pool, + struct size_class *class) { struct zs_compact_control cc; struct zspage *src_zspage; struct zspage *dst_zspage = NULL; + unsigned long pages_freed = 0; spin_lock(&class->lock); while ((src_zspage = isolate_zspage(class, true))) { @@ -2246,7 +2248,7 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) putback_zspage(class, dst_zspage); if (putback_zspage(class, src_zspage) == ZS_EMPTY) { free_zspage(pool, class, src_zspage); - pool->stats.pages_compacted += class->pages_per_zspage; + pages_freed += class->pages_per_zspage; } spin_unlock(&class->lock); cond_resched(); @@ -2257,12 +2259,15 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) putback_zspage(class, src_zspage); spin_unlock(&class->lock); + + return pages_freed; } unsigned long zs_compact(struct zs_pool *pool) { int i; struct size_class *class; + unsigned long pages_freed = 0; for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; @@ -2270,10 +2275,11 @@ unsigned long zs_compact(struct zs_pool *pool) continue; if (class->index != i) continue; - __zs_compact(pool, class); + pages_freed += __zs_compact(pool, class); } + atomic_long_add(pages_freed, &pool->stats.pages_compacted); - return pool->stats.pages_compacted; + return pages_freed; } EXPORT_SYMBOL_GPL(zs_compact); @@ -2290,13 +2296,12 @@ static unsigned long zs_shrinker_scan(struct shrinker *shrinker, struct zs_pool *pool = container_of(shrinker, struct zs_pool, shrinker); - pages_freed = pool->stats.pages_compacted; /* * Compact classes and calculate compaction delta. * Can run concurrently with a manually triggered * (by user) compaction. */ - pages_freed = zs_compact(pool) - pages_freed; + pages_freed = zs_compact(pool); return pages_freed ? 
pages_freed : SHRINK_STOP; }
From a6c5e0f75b3f7b8ace146f4eaa6398774d39a640 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:18:34 -0800 Subject: [PATCH 051/118] mm/zsmalloc.c: use page_private() to access page->private It's recommended to use the helper macro page_private() to access the private field of a page. Use this helper to eliminate the direct access. Link: https://lkml.kernel.org/r/20210203091857.20017-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 1518732f95c3..30c358b72025 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -816,7 +816,7 @@ static int get_pages_per_zspage(int class_size) static struct zspage *get_zspage(struct page *page) { - struct zspage *zspage = (struct zspage *)page->private; + struct zspage *zspage = (struct zspage *)page_private(page); BUG_ON(zspage->magic != ZSPAGE_MAGIC); return zspage;
From 4be408cec257d1156d35647db57726f5ef977630 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 25 Feb 2021 17:18:38 -0800 Subject: [PATCH 052/118] mm: page-flags.h: Typo fix (It -> If) The "If" was wrongly spelled as "It". Link: https://lkml.kernel.org/r/1608959036-91409-1-git-send-email-guoren@kernel.org Signed-off-by: Guo Ren Cc: Oscar Salvador Cc: Alexander Duyck Cc: David Hildenbrand Cc: Steven Price Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index db914477057b..04a34c08e0a6 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -810,7 +810,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) /* * Flags checked when a page is freed. Pages being freed should not have - * these flags set. It they are, there is a problem. + * these flags set. If they are, there is a problem. */ #define PAGE_FLAGS_CHECK_AT_FREE \ (1UL << PG_lru | 1UL << PG_locked | \ @@ -821,7 +821,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) /* * Flags checked when a page is prepped for return by the page allocator. - * Pages being prepped should not have these flags set. It they are set, + * Pages being prepped should not have these flags set. If they are set, * there has been a kernel bug or struct page corruption. * * __PG_HWPOISON is exceptional because it needs to be kept beyond page's
From 0f2f89b6de32de49373040eb4ee9d6bc1930ae5a Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Thu, 25 Feb 2021 17:18:41 -0800 Subject: [PATCH 053/118] mm/dmapool: use might_alloc() Now that my little helper has landed, use it more. On top of the existing check, this also uses lockdep through the fs_reclaim annotations.
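For context, might_alloc() is the small helper in include/linux/sched/mm.h referred to above; in rough, paraphrased form (a sketch of its intent, not quoted from the tree) it behaves like:

static inline void might_alloc(gfp_t gfp_mask)
{
        /* Tell lockdep this context may enter fs reclaim for this gfp mask... */
        fs_reclaim_acquire(gfp_mask);
        fs_reclaim_release(gfp_mask);

        /* ...and keep the original "may I sleep here?" debug check. */
        might_sleep_if(gfpflags_allow_blocking(gfp_mask));
}

So converting might_sleep_if(gfpflags_allow_blocking(mem_flags)) call sites to might_alloc(mem_flags), as the hunk below does for dma_pool_alloc(), catches reclaim-related deadlocks even on allocations that never actually block.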
Link: https://lkml.kernel.org/r/20210113135009.3606813-1-daniel.vetter@ffwll.ch Signed-off-by: Daniel Vetter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index a97c97232337..f3791532fef2 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -319,7 +320,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, size_t offset; void *retval; - might_sleep_if(gfpflags_allow_blocking(mem_flags)); + might_alloc(mem_flags); spin_lock_irqsave(&pool->lock, flags); list_for_each_entry(page, &pool->page_list, page_list) { From c1ca59a1f21e360b26e26c187a4e42f22bb768d3 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Thu, 25 Feb 2021 17:18:45 -0800 Subject: [PATCH 054/118] mm/backing-dev.c: use might_alloc() Now that my little helper has landed, use it more. On top of the existing check this also uses lockdep through the fs_reclaim annotations. [akpm@linux-foundation.org: include linux/sched/mm.h] Link: https://lkml.kernel.org/r/20210113135009.3606813-2-daniel.vetter@ffwll.ch Signed-off-by: Daniel Vetter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index eca555f658d9..576220acd686 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -578,7 +579,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, { struct bdi_writeback *wb; - might_sleep_if(gfpflags_allow_blocking(gfp)); + might_alloc(gfp); if (!memcg_css->parent) return &bdi->wb; From 87005394e14aa2f886581fb51e5e2022dc77ea05 Mon Sep 17 00:00:00 2001 From: Stephen Zhang Date: Thu, 25 Feb 2021 17:18:48 -0800 Subject: [PATCH 055/118] mm/early_ioremap.c: use __func__ instead of function name It is better to use __func__ instead of function name. Link: https://lkml.kernel.org/r/1611385587-4209-1-git-send-email-stephenzhangzsd@gmail.com Signed-off-by: Stephen Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/early_ioremap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index a0018ad1a1f6..164607c7cdf1 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -181,17 +181,17 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) } } - if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n", - addr, size)) + if (WARN(slot < 0, "%s(%p, %08lx) not found slot\n", + __func__, addr, size)) return; if (WARN(prev_size[slot] != size, - "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", - addr, size, slot, prev_size[slot])) + "%s(%p, %08lx) [%d] size not consistent %08lx\n", + __func__, addr, size, slot, prev_size[slot])) return; - WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n", - addr, size, slot); + WARN(early_ioremap_debug, "%s(%p, %08lx) [%d]\n", + __func__, addr, size, slot); virt_addr = (unsigned long)addr; if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) From 0ce20dd840897b12ae70869c69f1ba34d6d16965 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:18:53 -0800 Subject: [PATCH 056/118] mm: add Kernel Electric-Fence infrastructure Patch series "KFENCE: A low-overhead sampling-based memory safety error detector", v7. 
This adds the Kernel Electric-Fence (KFENCE) infrastructure. KFENCE is a low-overhead sampling-based memory safety error detector of heap use-after-free, invalid-free, and out-of-bounds access errors. This series enables KFENCE for the x86 and arm64 architectures, and adds KFENCE hooks to the SLAB and SLUB allocators. KFENCE is designed to be enabled in production kernels, and has near zero performance overhead. Compared to KASAN, KFENCE trades performance for precision. The main motivation behind KFENCE's design is that with enough total uptime KFENCE will detect bugs in code paths not typically exercised by non-production test workloads. One way to quickly achieve a large enough total uptime is to deploy the tool across a large fleet of machines. KFENCE objects each reside on a dedicated page, at either the left or right page boundary. The pages to the left and right of the object page are "guard pages", whose attributes are changed to a protected state, and cause page faults on any attempted access to them. Such page faults are then intercepted by KFENCE, which handles the fault gracefully by reporting a memory access error. Guarded allocations are set up based on a sample interval (which can be set via kfence.sample_interval). After expiration of the sample interval, the next allocation through the main allocator (SLAB or SLUB) returns a guarded allocation from the KFENCE object pool. At this point, the timer is reset, and the next allocation is set up after the expiration of the interval. To enable/disable a KFENCE allocation through the main allocator's fast-path without overhead, KFENCE relies on static branches via the static keys infrastructure. The static branch is toggled to redirect the allocation to KFENCE. The KFENCE memory pool is of fixed size, and if the pool is exhausted no further KFENCE allocations occur. The default config is conservative with only 255 objects, resulting in a pool size of 2 MiB (with 4 KiB pages). We have verified by running synthetic benchmarks (sysbench I/O, hackbench) and production server-workload benchmarks that a kernel with KFENCE (using sample intervals of 100-500ms) is performance-neutral compared to a non-KFENCE baseline kernel. KFENCE is inspired by GWP-ASan [1], a userspace tool with similar properties. The name "KFENCE" is a homage to the Electric Fence Malloc Debugger [2]. For more details, see Documentation/dev-tools/kfence.rst added in the series -- also viewable here: https://raw.githubusercontent.com/google/kasan/kfence/Documentation/dev-tools/kfence.rst [1] http://llvm.org/docs/GwpAsan.html [2] https://linux.die.net/man/3/efence
This patch (of 9): This adds the Kernel Electric-Fence (KFENCE) infrastructure. KFENCE is a low-overhead sampling-based memory safety error detector of heap use-after-free, invalid-free, and out-of-bounds access errors. KFENCE is designed to be enabled in production kernels, and has near zero performance overhead. Compared to KASAN, KFENCE trades performance for precision. The main motivation behind KFENCE's design is that with enough total uptime KFENCE will detect bugs in code paths not typically exercised by non-production test workloads. One way to quickly achieve a large enough total uptime is to deploy the tool across a large fleet of machines. KFENCE objects each reside on a dedicated page, at either the left or right page boundary.
The pages to the left and right of the object page are "guard pages", whose attributes are changed to a protected state, and cause page faults on any attempted access to them. Such page faults are then intercepted by KFENCE, which handles the fault gracefully by reporting a memory access error. To detect out-of-bounds writes to memory within the object's page itself, KFENCE also uses pattern-based redzones. The following figure illustrates the page layout:

  ---+-----------+-----------+-----------+-----------+-----------+---
     | xxxxxxxxx | O :       | xxxxxxxxx |       : O | xxxxxxxxx |
     | xxxxxxxxx | B :       | xxxxxxxxx |       : B | xxxxxxxxx |
     | x GUARD x | J : RED-  | x GUARD x |  RED- : J | x GUARD x |
     | xxxxxxxxx | E :  ZONE | xxxxxxxxx | ZONE  : E | xxxxxxxxx |
     | xxxxxxxxx | C :       | xxxxxxxxx |       : C | xxxxxxxxx |
     | xxxxxxxxx | T :       | xxxxxxxxx |       : T | xxxxxxxxx |
  ---+-----------+-----------+-----------+-----------+-----------+---

Guarded allocations are set up based on a sample interval (which can be set via kfence.sample_interval). After expiration of the sample interval, the next allocation through the main allocator (SLAB or SLUB) returns a guarded allocation from the KFENCE object pool. At this point, the timer is reset, and the next allocation is set up after the expiration of the interval. To enable/disable a KFENCE allocation through the main allocator's fast-path without overhead, KFENCE relies on static branches via the static keys infrastructure. The static branch is toggled to redirect the allocation to KFENCE. To date, we have verified by running synthetic benchmarks (sysbench I/O, hackbench) that a kernel compiled with KFENCE is performance-neutral compared to the non-KFENCE baseline. For more details, see Documentation/dev-tools/kfence.rst (added later in the series). [elver@google.com: fix parameter description for kfence_object_start()] Link: https://lkml.kernel.org/r/20201106092149.GA2851373@elver.google.com [elver@google.com: avoid stalling work queue task without allocations] Link: https://lkml.kernel.org/r/CADYN=9J0DQhizAGB0-jz4HOBBh+05kMBXb4c0cXMS7Qi5NAJiw@mail.gmail.com Link: https://lkml.kernel.org/r/20201110135320.3309507-1-elver@google.com [elver@google.com: fix potential deadlock due to wake_up()] Link: https://lkml.kernel.org/r/000000000000c0645805b7f982e4@google.com Link: https://lkml.kernel.org/r/20210104130749.1768991-1-elver@google.com [elver@google.com: add option to use KFENCE without static keys] Link: https://lkml.kernel.org/r/20210111091544.3287013-1-elver@google.com [elver@google.com: add missing copyright and description headers] Link: https://lkml.kernel.org/r/20210118092159.145934-1-elver@google.com Link: https://lkml.kernel.org/r/20201103175841.3495947-2-elver@google.com Signed-off-by: Marco Elver Signed-off-by: Alexander Potapenko Reviewed-by: Dmitry Vyukov Reviewed-by: SeongJae Park Co-developed-by: Marco Elver Reviewed-by: Jann Horn Cc: "H. Peter Anvin" Cc: Paul E.
McKenney Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christopher Lameter Cc: Dave Hansen Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Joern Engel Cc: Kees Cook Cc: Mark Rutland Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfence.h | 216 +++++++++++ init/main.c | 3 + lib/Kconfig.debug | 1 + lib/Kconfig.kfence | 67 ++++ mm/Makefile | 1 + mm/kfence/Makefile | 3 + mm/kfence/core.c | 840 +++++++++++++++++++++++++++++++++++++++++ mm/kfence/kfence.h | 113 ++++++ mm/kfence/report.c | 240 ++++++++++++ 9 files changed, 1484 insertions(+) create mode 100644 include/linux/kfence.h create mode 100644 lib/Kconfig.kfence create mode 100644 mm/kfence/Makefile create mode 100644 mm/kfence/core.c create mode 100644 mm/kfence/kfence.h create mode 100644 mm/kfence/report.c diff --git a/include/linux/kfence.h b/include/linux/kfence.h new file mode 100644 index 000000000000..81f3911cb298 --- /dev/null +++ b/include/linux/kfence.h @@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Kernel Electric-Fence (KFENCE). Public interface for allocator and fault + * handler integration. For more info see Documentation/dev-tools/kfence.rst. + * + * Copyright (C) 2020, Google LLC. + */ + +#ifndef _LINUX_KFENCE_H +#define _LINUX_KFENCE_H + +#include +#include + +#ifdef CONFIG_KFENCE + +/* + * We allocate an even number of pages, as it simplifies calculations to map + * address to metadata indices; effectively, the very first page serves as an + * extended guard page, but otherwise has no special purpose. + */ +#define KFENCE_POOL_SIZE ((CONFIG_KFENCE_NUM_OBJECTS + 1) * 2 * PAGE_SIZE) +extern char *__kfence_pool; + +#ifdef CONFIG_KFENCE_STATIC_KEYS +#include +DECLARE_STATIC_KEY_FALSE(kfence_allocation_key); +#else +#include +extern atomic_t kfence_allocation_gate; +#endif + +/** + * is_kfence_address() - check if an address belongs to KFENCE pool + * @addr: address to check + * + * Return: true or false depending on whether the address is within the KFENCE + * object range. + * + * KFENCE objects live in a separate page range and are not to be intermixed + * with regular heap objects (e.g. KFENCE objects must never be added to the + * allocator freelists). Failing to do so may and will result in heap + * corruptions, therefore is_kfence_address() must be used to check whether + * an object requires specific handling. + * + * Note: This function may be used in fast-paths, and is performance critical. + * Future changes should take this into account; for instance, we want to avoid + * introducing another load and therefore need to keep KFENCE_POOL_SIZE a + * constant (until immediate patching support is added to the kernel). + */ +static __always_inline bool is_kfence_address(const void *addr) +{ + /* + * The non-NULL check is required in case the __kfence_pool pointer was + * never initialized; keep it in the slow-path after the range-check. + */ + return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && addr); +} + +/** + * kfence_alloc_pool() - allocate the KFENCE pool via memblock + */ +void __init kfence_alloc_pool(void); + +/** + * kfence_init() - perform KFENCE initialization at boot time + * + * Requires that kfence_alloc_pool() was called before. 
This sets up the + * allocation gate timer, and requires that workqueues are available. + */ +void __init kfence_init(void); + +/** + * kfence_shutdown_cache() - handle shutdown_cache() for KFENCE objects + * @s: cache being shut down + * + * Before shutting down a cache, one must ensure there are no remaining objects + * allocated from it. Because KFENCE objects are not referenced from the cache + * directly, we need to check them here. + * + * Note that shutdown_cache() is internal to SL*B, and kmem_cache_destroy() does + * not return if allocated objects still exist: it prints an error message and + * simply aborts destruction of a cache, leaking memory. + * + * If the only such objects are KFENCE objects, we will not leak the entire + * cache, but instead try to provide more useful debug info by making allocated + * objects "zombie allocations". Objects may then still be used or freed (which + * is handled gracefully), but usage will result in showing KFENCE error reports + * which include stack traces to the user of the object, the original allocation + * site, and caller to shutdown_cache(). + */ +void kfence_shutdown_cache(struct kmem_cache *s); + +/* + * Allocate a KFENCE object. Allocators must not call this function directly, + * use kfence_alloc() instead. + */ +void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags); + +/** + * kfence_alloc() - allocate a KFENCE object with a low probability + * @s: struct kmem_cache with object requirements + * @size: exact size of the object to allocate (can be less than @s->size + * e.g. for kmalloc caches) + * @flags: GFP flags + * + * Return: + * * NULL - must proceed with allocating as usual, + * * non-NULL - pointer to a KFENCE object. + * + * kfence_alloc() should be inserted into the heap allocation fast path, + * allowing it to transparently return KFENCE-allocated objects with a low + * probability using a static branch (the probability is controlled by the + * kfence.sample_interval boot parameter). + */ +static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) +{ +#ifdef CONFIG_KFENCE_STATIC_KEYS + if (static_branch_unlikely(&kfence_allocation_key)) +#else + if (unlikely(!atomic_read(&kfence_allocation_gate))) +#endif + return __kfence_alloc(s, size, flags); + return NULL; +} + +/** + * kfence_ksize() - get actual amount of memory allocated for a KFENCE object + * @addr: pointer to a heap object + * + * Return: + * * 0 - not a KFENCE object, must call __ksize() instead, + * * non-0 - this many bytes can be accessed without causing a memory error. + * + * kfence_ksize() returns the number of bytes requested for a KFENCE object at + * allocation time. This number may be less than the object size of the + * corresponding struct kmem_cache. + */ +size_t kfence_ksize(const void *addr); + +/** + * kfence_object_start() - find the beginning of a KFENCE object + * @addr: address within a KFENCE-allocated object + * + * Return: address of the beginning of the object. + * + * SL[AU]B-allocated objects are laid out within a page one by one, so it is + * easy to calculate the beginning of an object given a pointer inside it and + * the object size. The same is not true for KFENCE, which places a single + * object at either end of the page. This helper function is used to find the + * beginning of a KFENCE-allocated object. 
+ */ +void *kfence_object_start(const void *addr); + +/** + * __kfence_free() - release a KFENCE heap object to KFENCE pool + * @addr: object to be freed + * + * Requires: is_kfence_address(addr) + * + * Release a KFENCE object and mark it as freed. + */ +void __kfence_free(void *addr); + +/** + * kfence_free() - try to release an arbitrary heap object to KFENCE pool + * @addr: object to be freed + * + * Return: + * * false - object doesn't belong to KFENCE pool and was ignored, + * * true - object was released to KFENCE pool. + * + * Release a KFENCE object and mark it as freed. May be called on any object, + * even non-KFENCE objects, to simplify integration of the hooks into the + * allocator's free codepath. The allocator must check the return value to + * determine if it was a KFENCE object or not. + */ +static __always_inline __must_check bool kfence_free(void *addr) +{ + if (!is_kfence_address(addr)) + return false; + __kfence_free(addr); + return true; +} + +/** + * kfence_handle_page_fault() - perform page fault handling for KFENCE pages + * @addr: faulting address + * + * Return: + * * false - address outside KFENCE pool, + * * true - page fault handled by KFENCE, no additional handling required. + * + * A page fault inside KFENCE pool indicates a memory error, such as an + * out-of-bounds access, a use-after-free or an invalid memory access. In these + * cases KFENCE prints an error message and marks the offending page as + * present, so that the kernel can proceed. + */ +bool __must_check kfence_handle_page_fault(unsigned long addr); + +#else /* CONFIG_KFENCE */ + +static inline bool is_kfence_address(const void *addr) { return false; } +static inline void kfence_alloc_pool(void) { } +static inline void kfence_init(void) { } +static inline void kfence_shutdown_cache(struct kmem_cache *s) { } +static inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { return NULL; } +static inline size_t kfence_ksize(const void *addr) { return 0; } +static inline void *kfence_object_start(const void *addr) { return NULL; } +static inline void __kfence_free(void *addr) { } +static inline bool __must_check kfence_free(void *addr) { return false; } +static inline bool __must_check kfence_handle_page_fault(unsigned long addr) { return false; } + +#endif + +#endif /* _LINUX_KFENCE_H */ diff --git a/init/main.c b/init/main.c index e9933cbf60d4..261051070e3c 100644 --- a/init/main.c +++ b/init/main.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -824,6 +825,7 @@ static void __init mm_init(void) */ page_ext_init_flatmem(); init_mem_debugging_and_hardening(); + kfence_alloc_pool(); report_meminit(); mem_init(); /* page_owner must be initialized after buddy is ready */ @@ -955,6 +957,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void) hrtimers_init(); softirq_init(); timekeeping_init(); + kfence_init(); /* * For best initial stack canary entropy, prepare it after: diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f9febffffc21..2779c29d9981 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -938,6 +938,7 @@ config DEBUG_STACKOVERFLOW If in doubt, say "N". 
source "lib/Kconfig.kasan" +source "lib/Kconfig.kfence" endmenu # "Memory Debugging" diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence new file mode 100644 index 000000000000..b88ac9d6b2e6 --- /dev/null +++ b/lib/Kconfig.kfence @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config HAVE_ARCH_KFENCE + bool + +menuconfig KFENCE + bool "KFENCE: low-overhead sampling-based memory safety error detector" + depends on HAVE_ARCH_KFENCE && !KASAN && (SLAB || SLUB) + select STACKTRACE + help + KFENCE is a low-overhead sampling-based detector of heap out-of-bounds + access, use-after-free, and invalid-free errors. KFENCE is designed + to have negligible cost to permit enabling it in production + environments. + + Note that, KFENCE is not a substitute for explicit testing with tools + such as KASAN. KFENCE can detect a subset of bugs that KASAN can + detect, albeit at very different performance profiles. If you can + afford to use KASAN, continue using KASAN, for example in test + environments. If your kernel targets production use, and cannot + enable KASAN due to its cost, consider using KFENCE. + +if KFENCE + +config KFENCE_STATIC_KEYS + bool "Use static keys to set up allocations" + default y + depends on JUMP_LABEL # To ensure performance, require jump labels + help + Use static keys (static branches) to set up KFENCE allocations. Using + static keys is normally recommended, because it avoids a dynamic + branch in the allocator's fast path. However, with very low sample + intervals, or on systems that do not support jump labels, a dynamic + branch may still be an acceptable performance trade-off. + +config KFENCE_SAMPLE_INTERVAL + int "Default sample interval in milliseconds" + default 100 + help + The KFENCE sample interval determines the frequency with which heap + allocations will be guarded by KFENCE. May be overridden via boot + parameter "kfence.sample_interval". + + Set this to 0 to disable KFENCE by default, in which case only + setting "kfence.sample_interval" to a non-zero value enables KFENCE. + +config KFENCE_NUM_OBJECTS + int "Number of guarded objects available" + range 1 65535 + default 255 + help + The number of guarded objects available. For each KFENCE object, 2 + pages are required; with one containing the object and two adjacent + ones used as guard pages. + +config KFENCE_STRESS_TEST_FAULTS + int "Stress testing of fault handling and error reporting" if EXPERT + default 0 + help + The inverse probability with which to randomly protect KFENCE object + pages, resulting in spurious use-after-frees. The main purpose of + this option is to stress test KFENCE with concurrent error reports + and allocations/frees. A value of 0 disables stress testing logic. + + Only for KFENCE testing; set to 0 if you are not a KFENCE developer. 
+ +endif # KFENCE diff --git a/mm/Makefile b/mm/Makefile index 135bbb65511a..72227b24a616 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_PAGE_POISONING) += page_poison.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KASAN) += kasan/ +obj-$(CONFIG_KFENCE) += kfence/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_MEMTEST) += memtest.o diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile new file mode 100644 index 000000000000..d991e9a349f0 --- /dev/null +++ b/mm/kfence/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_KFENCE) := core.o report.o diff --git a/mm/kfence/core.c b/mm/kfence/core.c new file mode 100644 index 000000000000..d6a32c13336b --- /dev/null +++ b/mm/kfence/core.c @@ -0,0 +1,840 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KFENCE guarded object allocator and fault handling. + * + * Copyright (C) 2020, Google LLC. + */ + +#define pr_fmt(fmt) "kfence: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kfence.h" + +/* Disables KFENCE on the first warning assuming an irrecoverable error. */ +#define KFENCE_WARN_ON(cond) \ + ({ \ + const bool __cond = WARN_ON(cond); \ + if (unlikely(__cond)) \ + WRITE_ONCE(kfence_enabled, false); \ + __cond; \ + }) + +/* === Data ================================================================= */ + +static bool kfence_enabled __read_mostly; + +static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "kfence." + +static int param_set_sample_interval(const char *val, const struct kernel_param *kp) +{ + unsigned long num; + int ret = kstrtoul(val, 0, &num); + + if (ret < 0) + return ret; + + if (!num) /* Using 0 to indicate KFENCE is disabled. */ + WRITE_ONCE(kfence_enabled, false); + else if (!READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING) + return -EINVAL; /* Cannot (re-)enable KFENCE on-the-fly. */ + + *((unsigned long *)kp->arg) = num; + return 0; +} + +static int param_get_sample_interval(char *buffer, const struct kernel_param *kp) +{ + if (!READ_ONCE(kfence_enabled)) + return sprintf(buffer, "0\n"); + + return param_get_ulong(buffer, kp); +} + +static const struct kernel_param_ops sample_interval_param_ops = { + .set = param_set_sample_interval, + .get = param_get_sample_interval, +}; +module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600); + +/* The pool of pages used for guard pages and objects. */ +char *__kfence_pool __ro_after_init; +EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */ + +/* + * Per-object metadata, with one-to-one mapping of object metadata to + * backing pages (in __kfence_pool). + */ +static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0); +struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; + +/* Freelist with available objects. */ +static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist); +static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */ + +#ifdef CONFIG_KFENCE_STATIC_KEYS +/* The static key to set up a KFENCE allocation. */ +DEFINE_STATIC_KEY_FALSE(kfence_allocation_key); +#endif + +/* Gates the allocation, ensuring only one succeeds in a given period. 
*/ +atomic_t kfence_allocation_gate = ATOMIC_INIT(1); + +/* Statistics counters for debugfs. */ +enum kfence_counter_id { + KFENCE_COUNTER_ALLOCATED, + KFENCE_COUNTER_ALLOCS, + KFENCE_COUNTER_FREES, + KFENCE_COUNTER_ZOMBIES, + KFENCE_COUNTER_BUGS, + KFENCE_COUNTER_COUNT, +}; +static atomic_long_t counters[KFENCE_COUNTER_COUNT]; +static const char *const counter_names[] = { + [KFENCE_COUNTER_ALLOCATED] = "currently allocated", + [KFENCE_COUNTER_ALLOCS] = "total allocations", + [KFENCE_COUNTER_FREES] = "total frees", + [KFENCE_COUNTER_ZOMBIES] = "zombie allocations", + [KFENCE_COUNTER_BUGS] = "total bugs", +}; +static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT); + +/* === Internals ============================================================ */ + +static bool kfence_protect(unsigned long addr) +{ + return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true)); +} + +static bool kfence_unprotect(unsigned long addr) +{ + return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false)); +} + +static inline struct kfence_metadata *addr_to_metadata(unsigned long addr) +{ + long index; + + /* The checks do not affect performance; only called from slow-paths. */ + + if (!is_kfence_address((void *)addr)) + return NULL; + + /* + * May be an invalid index if called with an address at the edge of + * __kfence_pool, in which case we would report an "invalid access" + * error. + */ + index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1; + if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS) + return NULL; + + return &kfence_metadata[index]; +} + +static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta) +{ + unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2; + unsigned long pageaddr = (unsigned long)&__kfence_pool[offset]; + + /* The checks do not affect performance; only called from slow-paths. */ + + /* Only call with a pointer into kfence_metadata. */ + if (KFENCE_WARN_ON(meta < kfence_metadata || + meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS)) + return 0; + + /* + * This metadata object only ever maps to 1 page; verify that the stored + * address is in the expected range. + */ + if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr)) + return 0; + + return pageaddr; +} + +/* + * Update the object's metadata state, including updating the alloc/free stacks + * depending on the state transition. + */ +static noinline void metadata_update_state(struct kfence_metadata *meta, + enum kfence_object_state next) +{ + struct kfence_track *track = + next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track; + + lockdep_assert_held(&meta->lock); + + /* + * Skip over 1 (this) functions; noinline ensures we do not accidentally + * skip over the caller by never inlining. + */ + track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1); + track->pid = task_pid_nr(current); + + /* + * Pairs with READ_ONCE() in + * kfence_shutdown_cache(), + * kfence_handle_page_fault(). + */ + WRITE_ONCE(meta->state, next); +} + +/* Write canary byte to @addr. */ +static inline bool set_canary_byte(u8 *addr) +{ + *addr = KFENCE_CANARY_PATTERN(addr); + return true; +} + +/* Check canary byte at @addr. 
*/ +static inline bool check_canary_byte(u8 *addr) +{ + if (likely(*addr == KFENCE_CANARY_PATTERN(addr))) + return true; + + atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); + kfence_report_error((unsigned long)addr, addr_to_metadata((unsigned long)addr), + KFENCE_ERROR_CORRUPTION); + return false; +} + +/* __always_inline this to ensure we won't do an indirect call to fn. */ +static __always_inline void for_each_canary(const struct kfence_metadata *meta, bool (*fn)(u8 *)) +{ + const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE); + unsigned long addr; + + lockdep_assert_held(&meta->lock); + + /* + * We'll iterate over each canary byte per-side until fn() returns + * false. However, we'll still iterate over the canary bytes to the + * right of the object even if there was an error in the canary bytes to + * the left of the object. Specifically, if check_canary_byte() + * generates an error, showing both sides might give more clues as to + * what the error is about when displaying which bytes were corrupted. + */ + + /* Apply to left of object. */ + for (addr = pageaddr; addr < meta->addr; addr++) { + if (!fn((u8 *)addr)) + break; + } + + /* Apply to right of object. */ + for (addr = meta->addr + meta->size; addr < pageaddr + PAGE_SIZE; addr++) { + if (!fn((u8 *)addr)) + break; + } +} + +static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp) +{ + struct kfence_metadata *meta = NULL; + unsigned long flags; + struct page *page; + void *addr; + + /* Try to obtain a free object. */ + raw_spin_lock_irqsave(&kfence_freelist_lock, flags); + if (!list_empty(&kfence_freelist)) { + meta = list_entry(kfence_freelist.next, struct kfence_metadata, list); + list_del_init(&meta->list); + } + raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); + if (!meta) + return NULL; + + if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) { + /* + * This is extremely unlikely -- we are reporting on a + * use-after-free, which locked meta->lock, and the reporting + * code via printk calls kmalloc() which ends up in + * kfence_alloc() and tries to grab the same object that we're + * reporting on. While it has never been observed, lockdep does + * report that there is a possibility of deadlock. Fix it by + * using trylock and bailing out gracefully. + */ + raw_spin_lock_irqsave(&kfence_freelist_lock, flags); + /* Put the object back on the freelist. */ + list_add_tail(&meta->list, &kfence_freelist); + raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); + + return NULL; + } + + meta->addr = metadata_to_pageaddr(meta); + /* Unprotect if we're reusing this page. */ + if (meta->state == KFENCE_OBJECT_FREED) + kfence_unprotect(meta->addr); + + /* + * Note: for allocations made before RNG initialization, will always + * return zero. We still benefit from enabling KFENCE as early as + * possible, even when the RNG is not yet available, as this will allow + * KFENCE to detect bugs due to earlier allocations. The only downside + * is that the out-of-bounds accesses detected are deterministic for + * such allocations. + */ + if (prandom_u32_max(2)) { + /* Allocate on the "right" side, re-calculate address. */ + meta->addr += PAGE_SIZE - size; + meta->addr = ALIGN_DOWN(meta->addr, cache->align); + } + + addr = (void *)meta->addr; + + /* Update remaining metadata. */ + metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED); + /* Pairs with READ_ONCE() in kfence_shutdown_cache(). 
*/ + WRITE_ONCE(meta->cache, cache); + meta->size = size; + for_each_canary(meta, set_canary_byte); + + /* Set required struct page fields. */ + page = virt_to_page(meta->addr); + page->slab_cache = cache; + + raw_spin_unlock_irqrestore(&meta->lock, flags); + + /* Memory initialization. */ + + /* + * We check slab_want_init_on_alloc() ourselves, rather than letting + * SL*B do the initialization, as otherwise we might overwrite KFENCE's + * redzone. + */ + if (unlikely(slab_want_init_on_alloc(gfp, cache))) + memzero_explicit(addr, size); + if (cache->ctor) + cache->ctor(addr); + + if (CONFIG_KFENCE_STRESS_TEST_FAULTS && !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS)) + kfence_protect(meta->addr); /* Random "faults" by protecting the object. */ + + atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]); + atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]); + + return addr; +} + +static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie) +{ + struct kcsan_scoped_access assert_page_exclusive; + unsigned long flags; + + raw_spin_lock_irqsave(&meta->lock, flags); + + if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) { + /* Invalid or double-free, bail out. */ + atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); + kfence_report_error((unsigned long)addr, meta, KFENCE_ERROR_INVALID_FREE); + raw_spin_unlock_irqrestore(&meta->lock, flags); + return; + } + + /* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */ + kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE, + KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT, + &assert_page_exclusive); + + if (CONFIG_KFENCE_STRESS_TEST_FAULTS) + kfence_unprotect((unsigned long)addr); /* To check canary bytes. */ + + /* Restore page protection if there was an OOB access. */ + if (meta->unprotected_page) { + kfence_protect(meta->unprotected_page); + meta->unprotected_page = 0; + } + + /* Check canary bytes for memory corruption. */ + for_each_canary(meta, check_canary_byte); + + /* + * Clear memory if init-on-free is set. While we protect the page, the + * data is still there, and after a use-after-free is detected, we + * unprotect the page, so the data is still accessible. + */ + if (!zombie && unlikely(slab_want_init_on_free(meta->cache))) + memzero_explicit(addr, meta->size); + + /* Mark the object as freed. */ + metadata_update_state(meta, KFENCE_OBJECT_FREED); + + raw_spin_unlock_irqrestore(&meta->lock, flags); + + /* Protect to detect use-after-frees. */ + kfence_protect((unsigned long)addr); + + kcsan_end_scoped_access(&assert_page_exclusive); + if (!zombie) { + /* Add it to the tail of the freelist for reuse. */ + raw_spin_lock_irqsave(&kfence_freelist_lock, flags); + KFENCE_WARN_ON(!list_empty(&meta->list)); + list_add_tail(&meta->list, &kfence_freelist); + raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); + + atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]); + atomic_long_inc(&counters[KFENCE_COUNTER_FREES]); + } else { + /* See kfence_shutdown_cache(). 
*/ + atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]); + } +} + +static void rcu_guarded_free(struct rcu_head *h) +{ + struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head); + + kfence_guarded_free((void *)meta->addr, meta, false); +} + +static bool __init kfence_init_pool(void) +{ + unsigned long addr = (unsigned long)__kfence_pool; + struct page *pages; + int i; + + if (!__kfence_pool) + return false; + + if (!arch_kfence_init_pool()) + goto err; + + pages = virt_to_page(addr); + + /* + * Set up object pages: they must have PG_slab set, to avoid freeing + * these as real pages. + * + * We also want to avoid inserting kfence_free() in the kfree() + * fast-path in SLUB, and therefore need to ensure kfree() correctly + * enters __slab_free() slow-path. + */ + for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { + if (!i || (i % 2)) + continue; + + /* Verify we do not have a compound head page. */ + if (WARN_ON(compound_head(&pages[i]) != &pages[i])) + goto err; + + __SetPageSlab(&pages[i]); + } + + /* + * Protect the first 2 pages. The first page is mostly unnecessary, and + * merely serves as an extended guard page. However, adding one + * additional page in the beginning gives us an even number of pages, + * which simplifies the mapping of address to metadata index. + */ + for (i = 0; i < 2; i++) { + if (unlikely(!kfence_protect(addr))) + goto err; + + addr += PAGE_SIZE; + } + + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + struct kfence_metadata *meta = &kfence_metadata[i]; + + /* Initialize metadata. */ + INIT_LIST_HEAD(&meta->list); + raw_spin_lock_init(&meta->lock); + meta->state = KFENCE_OBJECT_UNUSED; + meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */ + list_add_tail(&meta->list, &kfence_freelist); + + /* Protect the right redzone. */ + if (unlikely(!kfence_protect(addr + PAGE_SIZE))) + goto err; + + addr += 2 * PAGE_SIZE; + } + + return true; + +err: + /* + * Only release unprotected pages, and do not try to go back and change + * page attributes due to risk of failing to do so as well. If changing + * page attributes for some pages fails, it is very likely that it also + * fails for the first page, and therefore expect addr==__kfence_pool in + * most failure cases. + */ + memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); + __kfence_pool = NULL; + return false; +} + +/* === DebugFS Interface ==================================================== */ + +static int stats_show(struct seq_file *seq, void *v) +{ + int i; + + seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled)); + for (i = 0; i < KFENCE_COUNTER_COUNT; i++) + seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i])); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(stats); + +/* + * debugfs seq_file operations for /sys/kernel/debug/kfence/objects. + * start_object() and next_object() return the object index + 1, because NULL is used + * to stop iteration. 
+ */ +static void *start_object(struct seq_file *seq, loff_t *pos) +{ + if (*pos < CONFIG_KFENCE_NUM_OBJECTS) + return (void *)((long)*pos + 1); + return NULL; +} + +static void stop_object(struct seq_file *seq, void *v) +{ +} + +static void *next_object(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + if (*pos < CONFIG_KFENCE_NUM_OBJECTS) + return (void *)((long)*pos + 1); + return NULL; +} + +static int show_object(struct seq_file *seq, void *v) +{ + struct kfence_metadata *meta = &kfence_metadata[(long)v - 1]; + unsigned long flags; + + raw_spin_lock_irqsave(&meta->lock, flags); + kfence_print_object(seq, meta); + raw_spin_unlock_irqrestore(&meta->lock, flags); + seq_puts(seq, "---------------------------------\n"); + + return 0; +} + +static const struct seq_operations object_seqops = { + .start = start_object, + .next = next_object, + .stop = stop_object, + .show = show_object, +}; + +static int open_objects(struct inode *inode, struct file *file) +{ + return seq_open(file, &object_seqops); +} + +static const struct file_operations objects_fops = { + .open = open_objects, + .read = seq_read, + .llseek = seq_lseek, +}; + +static int __init kfence_debugfs_init(void) +{ + struct dentry *kfence_dir = debugfs_create_dir("kfence", NULL); + + debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops); + debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops); + return 0; +} + +late_initcall(kfence_debugfs_init); + +/* === Allocation Gate Timer ================================================ */ + +/* + * Set up delayed work, which will enable and disable the static key. We need to + * use a work queue (rather than a simple timer), since enabling and disabling a + * static key cannot be done from an interrupt. + * + * Note: Toggling a static branch currently causes IPIs, and here we'll end up + * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with + * more aggressive sampling intervals), we could get away with a variant that + * avoids IPIs, at the cost of not immediately capturing allocations if the + * instructions remain cached. + */ +static struct delayed_work kfence_timer; +static void toggle_allocation_gate(struct work_struct *work) +{ + if (!READ_ONCE(kfence_enabled)) + return; + + /* Enable static key, and await allocation to happen. */ + atomic_set(&kfence_allocation_gate, 0); +#ifdef CONFIG_KFENCE_STATIC_KEYS + static_branch_enable(&kfence_allocation_key); + /* + * Await an allocation. Timeout after 1 second, in case the kernel stops + * doing allocations, to avoid stalling this worker task for too long. + */ + { + unsigned long end_wait = jiffies + HZ; + + do { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&kfence_allocation_gate) != 0) + break; + schedule_timeout(1); + } while (time_before(jiffies, end_wait)); + __set_current_state(TASK_RUNNING); + } + /* Disable static key and reset timer. */ + static_branch_disable(&kfence_allocation_key); +#endif + schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval)); +} +static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate); + +/* === Public interface ===================================================== */ + +void __init kfence_alloc_pool(void) +{ + if (!kfence_sample_interval) + return; + + __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); + + if (!__kfence_pool) + pr_err("failed to allocate pool\n"); +} + +void __init kfence_init(void) +{ + /* Setting kfence_sample_interval to 0 on boot disables KFENCE. 
*/ + if (!kfence_sample_interval) + return; + + if (!kfence_init_pool()) { + pr_err("%s failed\n", __func__); + return; + } + + WRITE_ONCE(kfence_enabled, true); + schedule_delayed_work(&kfence_timer, 0); + pr_info("initialized - using %lu bytes for %d objects", KFENCE_POOL_SIZE, + CONFIG_KFENCE_NUM_OBJECTS); + if (IS_ENABLED(CONFIG_DEBUG_KERNEL)) + pr_cont(" at 0x%px-0x%px\n", (void *)__kfence_pool, + (void *)(__kfence_pool + KFENCE_POOL_SIZE)); + else + pr_cont("\n"); +} + +void kfence_shutdown_cache(struct kmem_cache *s) +{ + unsigned long flags; + struct kfence_metadata *meta; + int i; + + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + bool in_use; + + meta = &kfence_metadata[i]; + + /* + * If we observe some inconsistent cache and state pair where we + * should have returned false here, cache destruction is racing + * with either kmem_cache_alloc() or kmem_cache_free(). Taking + * the lock will not help, as different critical section + * serialization will have the same outcome. + */ + if (READ_ONCE(meta->cache) != s || + READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED) + continue; + + raw_spin_lock_irqsave(&meta->lock, flags); + in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED; + raw_spin_unlock_irqrestore(&meta->lock, flags); + + if (in_use) { + /* + * This cache still has allocations, and we should not + * release them back into the freelist so they can still + * safely be used and retain the kernel's default + * behaviour of keeping the allocations alive (leak the + * cache); however, they effectively become "zombie + * allocations" as the KFENCE objects are the only ones + * still in use and the owning cache is being destroyed. + * + * We mark them freed, so that any subsequent use shows + * more useful error messages that will include stack + * traces of the user of the object, the original + * allocation, and caller to shutdown_cache(). + */ + kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true); + } + } + + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + meta = &kfence_metadata[i]; + + /* See above. */ + if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED) + continue; + + raw_spin_lock_irqsave(&meta->lock, flags); + if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED) + meta->cache = NULL; + raw_spin_unlock_irqrestore(&meta->lock, flags); + } +} + +void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) +{ + /* + * allocation_gate only needs to become non-zero, so it doesn't make + * sense to continue writing to it and pay the associated contention + * cost, in case we have a large number of concurrent allocations. + */ + if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1) + return NULL; + + if (!READ_ONCE(kfence_enabled)) + return NULL; + + if (size > PAGE_SIZE) + return NULL; + + return kfence_guarded_alloc(s, size, flags); +} + +size_t kfence_ksize(const void *addr) +{ + const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); + + /* + * Read locklessly -- if there is a race with __kfence_alloc(), this is + * either a use-after-free or invalid access. + */ + return meta ? meta->size : 0; +} + +void *kfence_object_start(const void *addr) +{ + const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); + + /* + * Read locklessly -- if there is a race with __kfence_alloc(), this is + * either a use-after-free or invalid access. + */ + return meta ? 
(void *)meta->addr : NULL; +} + +void __kfence_free(void *addr) +{ + struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); + + /* + * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing + * the object, as the object page may be recycled for other-typed + * objects once it has been freed. meta->cache may be NULL if the cache + * was destroyed. + */ + if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) + call_rcu(&meta->rcu_head, rcu_guarded_free); + else + kfence_guarded_free(addr, meta, false); +} + +bool kfence_handle_page_fault(unsigned long addr) +{ + const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE; + struct kfence_metadata *to_report = NULL; + enum kfence_error_type error_type; + unsigned long flags; + + if (!is_kfence_address((void *)addr)) + return false; + + if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */ + return kfence_unprotect(addr); /* ... unprotect and proceed. */ + + atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); + + if (page_index % 2) { + /* This is a redzone, report a buffer overflow. */ + struct kfence_metadata *meta; + int distance = 0; + + meta = addr_to_metadata(addr - PAGE_SIZE); + if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { + to_report = meta; + /* Data race ok; distance calculation approximate. */ + distance = addr - data_race(meta->addr + meta->size); + } + + meta = addr_to_metadata(addr + PAGE_SIZE); + if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { + /* Data race ok; distance calculation approximate. */ + if (!to_report || distance > data_race(meta->addr) - addr) + to_report = meta; + } + + if (!to_report) + goto out; + + raw_spin_lock_irqsave(&to_report->lock, flags); + to_report->unprotected_page = addr; + error_type = KFENCE_ERROR_OOB; + + /* + * If the object was freed before we took the look we can still + * report this as an OOB -- the report will simply show the + * stacktrace of the free as well. + */ + } else { + to_report = addr_to_metadata(addr); + if (!to_report) + goto out; + + raw_spin_lock_irqsave(&to_report->lock, flags); + error_type = KFENCE_ERROR_UAF; + /* + * We may race with __kfence_alloc(), and it is possible that a + * freed object may be reallocated. We simply report this as a + * use-after-free, with the stack trace showing the place where + * the object was re-allocated. + */ + } + +out: + if (to_report) { + kfence_report_error(addr, to_report, error_type); + raw_spin_unlock_irqrestore(&to_report->lock, flags); + } else { + /* This may be a UAF or OOB access, but we can't be sure. */ + kfence_report_error(addr, NULL, KFENCE_ERROR_INVALID); + } + + return kfence_unprotect(addr); /* Unprotect and let access proceed. */ +} diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h new file mode 100644 index 000000000000..1014060f9707 --- /dev/null +++ b/mm/kfence/kfence.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Kernel Electric-Fence (KFENCE). For more info please see + * Documentation/dev-tools/kfence.rst. + * + * Copyright (C) 2020, Google LLC. + */ + +#ifndef MM_KFENCE_KFENCE_H +#define MM_KFENCE_KFENCE_H + +#include +#include +#include +#include + +#include "../slab.h" /* for struct kmem_cache */ + +/* For non-debug builds, avoid leaking kernel pointers into dmesg. */ +#ifdef CONFIG_DEBUG_KERNEL +#define PTR_FMT "%px" +#else +#define PTR_FMT "%p" +#endif + +/* + * Get the canary byte pattern for @addr. 
Use a pattern that varies based on the + * lower 3 bits of the address, to detect memory corruptions with higher + * probability, where similar constants are used. + */ +#define KFENCE_CANARY_PATTERN(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7)) + +/* Maximum stack depth for reports. */ +#define KFENCE_STACK_DEPTH 64 + +/* KFENCE object states. */ +enum kfence_object_state { + KFENCE_OBJECT_UNUSED, /* Object is unused. */ + KFENCE_OBJECT_ALLOCATED, /* Object is currently allocated. */ + KFENCE_OBJECT_FREED, /* Object was allocated, and then freed. */ +}; + +/* Alloc/free tracking information. */ +struct kfence_track { + pid_t pid; + int num_stack_entries; + unsigned long stack_entries[KFENCE_STACK_DEPTH]; +}; + +/* KFENCE metadata per guarded allocation. */ +struct kfence_metadata { + struct list_head list; /* Freelist node; access under kfence_freelist_lock. */ + struct rcu_head rcu_head; /* For delayed freeing. */ + + /* + * Lock protecting below data; to ensure consistency of the below data, + * since the following may execute concurrently: __kfence_alloc(), + * __kfence_free(), kfence_handle_page_fault(). However, note that we + * cannot grab the same metadata off the freelist twice, and multiple + * __kfence_alloc() cannot run concurrently on the same metadata. + */ + raw_spinlock_t lock; + + /* The current state of the object; see above. */ + enum kfence_object_state state; + + /* + * Allocated object address; cannot be calculated from size, because of + * alignment requirements. + * + * Invariant: ALIGN_DOWN(addr, PAGE_SIZE) is constant. + */ + unsigned long addr; + + /* + * The size of the original allocation. + */ + size_t size; + + /* + * The kmem_cache cache of the last allocation; NULL if never allocated + * or the cache has already been destroyed. + */ + struct kmem_cache *cache; + + /* + * In case of an invalid access, the page that was unprotected; we + * optimistically only store one address. + */ + unsigned long unprotected_page; + + /* Allocation and free stack information. */ + struct kfence_track alloc_track; + struct kfence_track free_track; +}; + +extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; + +/* KFENCE error types for report generation. */ +enum kfence_error_type { + KFENCE_ERROR_OOB, /* Detected a out-of-bounds access. */ + KFENCE_ERROR_UAF, /* Detected a use-after-free access. */ + KFENCE_ERROR_CORRUPTION, /* Detected a memory corruption on free. */ + KFENCE_ERROR_INVALID, /* Invalid access of unknown type. */ + KFENCE_ERROR_INVALID_FREE, /* Invalid free. */ +}; + +void kfence_report_error(unsigned long address, const struct kfence_metadata *meta, + enum kfence_error_type type); + +void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta); + +#endif /* MM_KFENCE_KFENCE_H */ diff --git a/mm/kfence/report.c b/mm/kfence/report.c new file mode 100644 index 000000000000..64f27c8d46a3 --- /dev/null +++ b/mm/kfence/report.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KFENCE reporting. + * + * Copyright (C) 2020, Google LLC. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "kfence.h" + +/* Helper function to either print to a seq_file or to console. */ +__printf(2, 3) +static void seq_con_printf(struct seq_file *seq, const char *fmt, ...) 
+{ + va_list args; + + va_start(args, fmt); + if (seq) + seq_vprintf(seq, fmt, args); + else + vprintk(fmt, args); + va_end(args); +} + +/* + * Get the number of stack entries to skip to get out of MM internals. @type is + * optional, and if set to NULL, assumes an allocation or free stack. + */ +static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries, + const enum kfence_error_type *type) +{ + char buf[64]; + int skipnr, fallback = 0; + bool is_access_fault = false; + + if (type) { + /* Depending on error type, find different stack entries. */ + switch (*type) { + case KFENCE_ERROR_UAF: + case KFENCE_ERROR_OOB: + case KFENCE_ERROR_INVALID: + is_access_fault = true; + break; + case KFENCE_ERROR_CORRUPTION: + case KFENCE_ERROR_INVALID_FREE: + break; + } + } + + for (skipnr = 0; skipnr < num_entries; skipnr++) { + int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]); + + if (is_access_fault) { + if (!strncmp(buf, KFENCE_SKIP_ARCH_FAULT_HANDLER, len)) + goto found; + } else { + if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") || + !strncmp(buf, "__slab_free", len)) { + /* + * In case of tail calls from any of the below + * to any of the above. + */ + fallback = skipnr + 1; + } + + /* Also the *_bulk() variants by only checking prefixes. */ + if (str_has_prefix(buf, "kfree") || + str_has_prefix(buf, "kmem_cache_free") || + str_has_prefix(buf, "__kmalloc") || + str_has_prefix(buf, "kmem_cache_alloc")) + goto found; + } + } + if (fallback < num_entries) + return fallback; +found: + skipnr++; + return skipnr < num_entries ? skipnr : 0; +} + +static void kfence_print_stack(struct seq_file *seq, const struct kfence_metadata *meta, + bool show_alloc) +{ + const struct kfence_track *track = show_alloc ? &meta->alloc_track : &meta->free_track; + + if (track->num_stack_entries) { + /* Skip allocation/free internals stack. */ + int i = get_stack_skipnr(track->stack_entries, track->num_stack_entries, NULL); + + /* stack_trace_seq_print() does not exist; open code our own. */ + for (; i < track->num_stack_entries; i++) + seq_con_printf(seq, " %pS\n", (void *)track->stack_entries[i]); + } else { + seq_con_printf(seq, " no %s stack\n", show_alloc ? "allocation" : "deallocation"); + } +} + +void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta) +{ + const int size = abs(meta->size); + const unsigned long start = meta->addr; + const struct kmem_cache *const cache = meta->cache; + + lockdep_assert_held(&meta->lock); + + if (meta->state == KFENCE_OBJECT_UNUSED) { + seq_con_printf(seq, "kfence-#%zd unused\n", meta - kfence_metadata); + return; + } + + seq_con_printf(seq, + "kfence-#%zd [0x" PTR_FMT "-0x" PTR_FMT + ", size=%d, cache=%s] allocated by task %d:\n", + meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size, + (cache && cache->name) ? cache->name : "", meta->alloc_track.pid); + kfence_print_stack(seq, meta, true); + + if (meta->state == KFENCE_OBJECT_FREED) { + seq_con_printf(seq, "\nfreed by task %d:\n", meta->free_track.pid); + kfence_print_stack(seq, meta, false); + } +} + +/* + * Show bytes at @addr that are different from the expected canary values, up to + * @max_bytes. + */ +static void print_diff_canary(unsigned long address, size_t bytes_to_show, + const struct kfence_metadata *meta) +{ + const unsigned long show_until_addr = address + bytes_to_show; + const u8 *cur, *end; + + /* Do not show contents of object nor read into following guard page. 
*/ + end = (const u8 *)(address < meta->addr ? min(show_until_addr, meta->addr) + : min(show_until_addr, PAGE_ALIGN(address))); + + pr_cont("["); + for (cur = (const u8 *)address; cur < end; cur++) { + if (*cur == KFENCE_CANARY_PATTERN(cur)) + pr_cont(" ."); + else if (IS_ENABLED(CONFIG_DEBUG_KERNEL)) + pr_cont(" 0x%02x", *cur); + else /* Do not leak kernel memory in non-debug builds. */ + pr_cont(" !"); + } + pr_cont(" ]"); +} + +void kfence_report_error(unsigned long address, const struct kfence_metadata *meta, + enum kfence_error_type type) +{ + unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 }; + int num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1); + int skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type); + const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1; + + /* Require non-NULL meta, except if KFENCE_ERROR_INVALID. */ + if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta)) + return; + + if (meta) + lockdep_assert_held(&meta->lock); + /* + * Because we may generate reports in printk-unfriendly parts of the + * kernel, such as scheduler code, the use of printk() could deadlock. + * Until such time that all printing code here is safe in all parts of + * the kernel, accept the risk, and just get our message out (given the + * system might already behave unpredictably due to the memory error). + * As such, also disable lockdep to hide warnings, and avoid disabling + * lockdep for the rest of the kernel. + */ + lockdep_off(); + + pr_err("==================================================================\n"); + /* Print report header. */ + switch (type) { + case KFENCE_ERROR_OOB: { + const bool left_of_object = address < meta->addr; + + pr_err("BUG: KFENCE: out-of-bounds in %pS\n\n", (void *)stack_entries[skipnr]); + pr_err("Out-of-bounds access at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n", + (void *)address, + left_of_object ? meta->addr - address : address - meta->addr, + left_of_object ? "left" : "right", object_index); + break; + } + case KFENCE_ERROR_UAF: + pr_err("BUG: KFENCE: use-after-free in %pS\n\n", (void *)stack_entries[skipnr]); + pr_err("Use-after-free access at 0x" PTR_FMT " (in kfence-#%zd):\n", + (void *)address, object_index); + break; + case KFENCE_ERROR_CORRUPTION: + pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]); + pr_err("Corrupted memory at 0x" PTR_FMT " ", (void *)address); + print_diff_canary(address, 16, meta); + pr_cont(" (in kfence-#%zd):\n", object_index); + break; + case KFENCE_ERROR_INVALID: + pr_err("BUG: KFENCE: invalid access in %pS\n\n", (void *)stack_entries[skipnr]); + pr_err("Invalid access at 0x" PTR_FMT ":\n", (void *)address); + break; + case KFENCE_ERROR_INVALID_FREE: + pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]); + pr_err("Invalid free of 0x" PTR_FMT " (in kfence-#%zd):\n", (void *)address, + object_index); + break; + } + + /* Print stack trace and object info. */ + stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, 0); + + if (meta) { + pr_err("\n"); + kfence_print_object(NULL, meta); + } + + /* Print report footer. */ + pr_err("\n"); + dump_stack_print_info(KERN_ERR); + pr_err("==================================================================\n"); + + lockdep_on(); + + if (panic_on_warn) + panic("panic_on_warn set ...\n"); + + /* We encountered a memory unsafety error, taint the kernel! 
*/ + add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK); +} From 1dc0da6e9ec0f8d735756374697912cd50f402cf Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:18:57 -0800 Subject: [PATCH 057/118] x86, kfence: enable KFENCE for x86 Add architecture specific implementation details for KFENCE and enable KFENCE for the x86 architecture. In particular, this implements the required interface in for setting up the pool and providing helper functions for protecting and unprotecting pages. For x86, we need to ensure that the pool uses 4K pages, which is done using the set_memory_4k() helper function. [elver@google.com: add missing copyright and description header] Link: https://lkml.kernel.org/r/20210118092159.145934-2-elver@google.com Link: https://lkml.kernel.org/r/20201103175841.3495947-3-elver@google.com Signed-off-by: Marco Elver Signed-off-by: Alexander Potapenko Reviewed-by: Dmitry Vyukov Co-developed-by: Marco Elver Reviewed-by: Jann Horn Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christopher Lameter Cc: Dave Hansen Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joern Engel Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Paul E. McKenney Cc: Pekka Enberg Cc: Peter Zijlstra Cc: SeongJae Park Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/kfence.h | 70 +++++++++++++++++++++++++++++++++++ arch/x86/mm/fault.c | 5 +++ 3 files changed, 76 insertions(+) create mode 100644 arch/x86/include/asm/kfence.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cd4b9b1204a8..2792879d398e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -151,6 +151,7 @@ config X86 select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KASAN_VMALLOC if X86_64 + select HAVE_ARCH_KFENCE select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h new file mode 100644 index 000000000000..a0659dbd93ea --- /dev/null +++ b/arch/x86/include/asm/kfence.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * x86 KFENCE support. + * + * Copyright (C) 2020, Google LLC. + */ + +#ifndef _ASM_X86_KFENCE_H +#define _ASM_X86_KFENCE_H + +#include +#include + +#include +#include +#include +#include + +/* + * The page fault handler entry function, up to which the stack trace is + * truncated in reports. + */ +#define KFENCE_SKIP_ARCH_FAULT_HANDLER "asm_exc_page_fault" + +/* Force 4K pages for __kfence_pool. */ +static inline bool arch_kfence_init_pool(void) +{ + unsigned long addr; + + for (addr = (unsigned long)__kfence_pool; is_kfence_address((void *)addr); + addr += PAGE_SIZE) { + unsigned int level; + + if (!lookup_address(addr, &level)) + return false; + + if (level != PG_LEVEL_4K) + set_memory_4k(addr, 1); + } + + return true; +} + +/* Protect the given page and flush TLB. */ +static inline bool kfence_protect_page(unsigned long addr, bool protect) +{ + unsigned int level; + pte_t *pte = lookup_address(addr, &level); + + if (WARN_ON(!pte || level != PG_LEVEL_4K)) + return false; + + /* + * We need to avoid IPIs, as we may get KFENCE allocations or faults + * with interrupts disabled. Therefore, the below is best-effort, and + * does not flush TLBs on all CPUs. 
We can tolerate some inaccuracy; + * lazy fault handling takes care of faults after the page is PRESENT. + */ + + if (protect) + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + else + set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); + + /* Flush this CPU's TLB. */ + flush_tlb_one_kernel(addr); + return true; +} + +#endif /* _ASM_X86_KFENCE_H */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 525197381baa..99fe6d3e690d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -9,6 +9,7 @@ #include /* oops_begin/end, ... */ #include /* search_exception_tables */ #include /* max_low_pfn */ +#include /* kfence_handle_page_fault */ #include /* NOKPROBE_SYMBOL, ... */ #include /* kmmio_handler, ... */ #include /* perf_sw_event */ @@ -680,6 +681,10 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code, if (IS_ENABLED(CONFIG_EFI)) efi_crash_gracefully_on_page_fault(address); + /* Only not-present faults should be handled by KFENCE. */ + if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address)) + return; + oops: /* * Oops. The kernel tried to access some bad page. We'll have to From 840b239863449f27bf7522deb81e6746fbfbfeaf Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 25 Feb 2021 17:19:03 -0800 Subject: [PATCH 058/118] arm64, kfence: enable KFENCE for ARM64 Add architecture specific implementation details for KFENCE and enable KFENCE for the arm64 architecture. In particular, this implements the required interface in . KFENCE requires that attributes for pages from its memory pool can individually be set. Therefore, force the entire linear map to be mapped at page granularity. Doing so may result in extra memory allocated for page tables in case rodata=full is not set; however, currently CONFIG_RODATA_FULL_DEFAULT_ENABLED=y is the default, and the common case is therefore not affected by this change. [elver@google.com: add missing copyright and description header] Link: https://lkml.kernel.org/r/20210118092159.145934-3-elver@google.com Link: https://lkml.kernel.org/r/20201103175841.3495947-4-elver@google.com Signed-off-by: Alexander Potapenko Signed-off-by: Marco Elver Reviewed-by: Dmitry Vyukov Co-developed-by: Alexander Potapenko Reviewed-by: Jann Horn Reviewed-by: Mark Rutland Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christopher Lameter Cc: Dave Hansen Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joern Engel Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kees Cook Cc: Paul E. 
McKenney Cc: Pekka Enberg Cc: Peter Zijlstra Cc: SeongJae Park Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/kfence.h | 24 ++++++++++++++++++++++++ arch/arm64/mm/fault.c | 4 ++++ arch/arm64/mm/mmu.c | 8 +++++++- 4 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/include/asm/kfence.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fc0ce2a1f3bf..a254f4871683 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -140,6 +140,7 @@ config ARM64 select HAVE_ARCH_KASAN if !(ARM64_16K_PAGES && ARM64_VA_BITS_48) select HAVE_ARCH_KASAN_SW_TAGS if HAVE_ARCH_KASAN select HAVE_ARCH_KASAN_HW_TAGS if (HAVE_ARCH_KASAN && ARM64_MTE) + select HAVE_ARCH_KFENCE select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h new file mode 100644 index 000000000000..42a06f83850a --- /dev/null +++ b/arch/arm64/include/asm/kfence.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * arm64 KFENCE support. + * + * Copyright (C) 2020, Google LLC. + */ + +#ifndef __ASM_KFENCE_H +#define __ASM_KFENCE_H + +#include + +#define KFENCE_SKIP_ARCH_FAULT_HANDLER "el1_sync" + +static inline bool arch_kfence_init_pool(void) { return true; } + +static inline bool kfence_protect_page(unsigned long addr, bool protect) +{ + set_memory_valid(addr, 1, !protect); + + return true; +} + +#endif /* __ASM_KFENCE_H */ diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index dc9f96442edc..42515900ab2e 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -389,6 +390,9 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr, } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; } else { + if (kfence_handle_page_fault(addr)) + return; + msg = "paging request"; } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d0758d24a42d..ef7698c4e2f0 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1465,7 +1465,13 @@ int arch_add_memory(int nid, u64 start, u64 size, int ret, flags = 0; VM_BUG_ON(!mhp_range_allowed(start, size, true)); - if (rodata_full || debug_pagealloc_enabled()) + + /* + * KFENCE requires linear map to be mapped at page granularity, so that + * it is possible to protect/unprotect single pages in the KFENCE pool. + */ + if (rodata_full || debug_pagealloc_enabled() || + IS_ENABLED(CONFIG_KFENCE)) flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), From d438fabce7860df3cb9337776be6f90b59ced8ed Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 25 Feb 2021 17:19:08 -0800 Subject: [PATCH 059/118] kfence: use pt_regs to generate stack trace on faults Instead of removing the fault handling portion of the stack trace based on the fault handler's name, just use struct pt_regs directly. Change kfence_handle_page_fault() to take a struct pt_regs, and plumb it through to kfence_report_error() for out-of-bounds, use-after-free, or invalid access errors, where pt_regs is used to generate the stack trace. If the kernel is a DEBUG_KERNEL, also show registers for more information. 
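
For reference, the effect of the report.c changes below can be condensed into
the following sketch (the helper name is invented for illustration;
stack_trace_save_regs() and stack_trace_save() are the existing kernel helpers
the patch relies on):

	/* Illustrative only: how the report picks its stack capture strategy. */
	static int capture_report_stack(struct pt_regs *regs,
					enum kfence_error_type type,
					unsigned long entries[], int max,
					int *skipnr)
	{
		int n;

		*skipnr = 0;
		if (regs)
			/* Fault path: the trace starts at the faulting instruction. */
			return stack_trace_save_regs(regs, entries, max, 0);

		/* Alloc/free path: keep skipping allocator internals by name. */
		n = stack_trace_save(entries, max, 1);
		*skipnr = get_stack_skipnr(entries, n, &type);
		return n;
	}
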
Link: https://lkml.kernel.org/r/20201105092133.2075331-1-elver@google.com Signed-off-by: Marco Elver Suggested-by: Mark Rutland Acked-by: Mark Rutland Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/kfence.h | 2 -- arch/arm64/mm/fault.c | 2 +- arch/x86/include/asm/kfence.h | 6 ---- arch/x86/mm/fault.c | 2 +- include/linux/kfence.h | 5 +-- mm/kfence/core.c | 10 +++--- mm/kfence/kfence.h | 4 +-- mm/kfence/report.c | 63 +++++++++++++++++++-------------- 8 files changed, 48 insertions(+), 46 deletions(-) diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h index 42a06f83850a..d061176d57ea 100644 --- a/arch/arm64/include/asm/kfence.h +++ b/arch/arm64/include/asm/kfence.h @@ -10,8 +10,6 @@ #include -#define KFENCE_SKIP_ARCH_FAULT_HANDLER "el1_sync" - static inline bool arch_kfence_init_pool(void) { return true; } static inline bool kfence_protect_page(unsigned long addr, bool protect) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 42515900ab2e..56d9423ca59c 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -390,7 +390,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr, } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; } else { - if (kfence_handle_page_fault(addr)) + if (kfence_handle_page_fault(addr, regs)) return; msg = "paging request"; diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h index a0659dbd93ea..97bbb4a9083a 100644 --- a/arch/x86/include/asm/kfence.h +++ b/arch/x86/include/asm/kfence.h @@ -16,12 +16,6 @@ #include #include -/* - * The page fault handler entry function, up to which the stack trace is - * truncated in reports. - */ -#define KFENCE_SKIP_ARCH_FAULT_HANDLER "asm_exc_page_fault" - /* Force 4K pages for __kfence_pool. */ static inline bool arch_kfence_init_pool(void) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 99fe6d3e690d..38868b4ce8b0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -682,7 +682,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code, efi_crash_gracefully_on_page_fault(address); /* Only not-present faults should be handled by KFENCE. */ - if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address)) + if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address, regs)) return; oops: diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 81f3911cb298..5a56bcf5606c 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -186,6 +186,7 @@ static __always_inline __must_check bool kfence_free(void *addr) /** * kfence_handle_page_fault() - perform page fault handling for KFENCE pages * @addr: faulting address + * @regs: current struct pt_regs (can be NULL, but shows full stack trace) * * Return: * * false - address outside KFENCE pool, @@ -196,7 +197,7 @@ static __always_inline __must_check bool kfence_free(void *addr) * cases KFENCE prints an error message and marks the offending page as * present, so that the kernel can proceed. 
*/ -bool __must_check kfence_handle_page_fault(unsigned long addr); +bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs); #else /* CONFIG_KFENCE */ @@ -209,7 +210,7 @@ static inline size_t kfence_ksize(const void *addr) { return 0; } static inline void *kfence_object_start(const void *addr) { return NULL; } static inline void __kfence_free(void *addr) { } static inline bool __must_check kfence_free(void *addr) { return false; } -static inline bool __must_check kfence_handle_page_fault(unsigned long addr) { return false; } +static inline bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs) { return false; } #endif diff --git a/mm/kfence/core.c b/mm/kfence/core.c index d6a32c13336b..61c76670a7a9 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -216,7 +216,7 @@ static inline bool check_canary_byte(u8 *addr) return true; atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); - kfence_report_error((unsigned long)addr, addr_to_metadata((unsigned long)addr), + kfence_report_error((unsigned long)addr, NULL, addr_to_metadata((unsigned long)addr), KFENCE_ERROR_CORRUPTION); return false; } @@ -351,7 +351,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) { /* Invalid or double-free, bail out. */ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); - kfence_report_error((unsigned long)addr, meta, KFENCE_ERROR_INVALID_FREE); + kfence_report_error((unsigned long)addr, NULL, meta, KFENCE_ERROR_INVALID_FREE); raw_spin_unlock_irqrestore(&meta->lock, flags); return; } @@ -766,7 +766,7 @@ void __kfence_free(void *addr) kfence_guarded_free(addr, meta, false); } -bool kfence_handle_page_fault(unsigned long addr) +bool kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs) { const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE; struct kfence_metadata *to_report = NULL; @@ -829,11 +829,11 @@ bool kfence_handle_page_fault(unsigned long addr) out: if (to_report) { - kfence_report_error(addr, to_report, error_type); + kfence_report_error(addr, regs, to_report, error_type); raw_spin_unlock_irqrestore(&to_report->lock, flags); } else { /* This may be a UAF or OOB access, but we can't be sure. */ - kfence_report_error(addr, NULL, KFENCE_ERROR_INVALID); + kfence_report_error(addr, regs, NULL, KFENCE_ERROR_INVALID); } return kfence_unprotect(addr); /* Unprotect and let access proceed. */ diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 1014060f9707..0d83e628a97d 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -105,8 +105,8 @@ enum kfence_error_type { KFENCE_ERROR_INVALID_FREE, /* Invalid free. */ }; -void kfence_report_error(unsigned long address, const struct kfence_metadata *meta, - enum kfence_error_type type); +void kfence_report_error(unsigned long address, struct pt_regs *regs, + const struct kfence_metadata *meta, enum kfence_error_type type); void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta); diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 64f27c8d46a3..4dbfa9a382e4 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -41,7 +42,6 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries { char buf[64]; int skipnr, fallback = 0; - bool is_access_fault = false; if (type) { /* Depending on error type, find different stack entries. 
*/ @@ -49,8 +49,12 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries case KFENCE_ERROR_UAF: case KFENCE_ERROR_OOB: case KFENCE_ERROR_INVALID: - is_access_fault = true; - break; + /* + * kfence_handle_page_fault() may be called with pt_regs + * set to NULL; in that case we'll simply show the full + * stack trace. + */ + return 0; case KFENCE_ERROR_CORRUPTION: case KFENCE_ERROR_INVALID_FREE: break; @@ -60,26 +64,21 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries for (skipnr = 0; skipnr < num_entries; skipnr++) { int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]); - if (is_access_fault) { - if (!strncmp(buf, KFENCE_SKIP_ARCH_FAULT_HANDLER, len)) - goto found; - } else { - if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") || - !strncmp(buf, "__slab_free", len)) { - /* - * In case of tail calls from any of the below - * to any of the above. - */ - fallback = skipnr + 1; - } - - /* Also the *_bulk() variants by only checking prefixes. */ - if (str_has_prefix(buf, "kfree") || - str_has_prefix(buf, "kmem_cache_free") || - str_has_prefix(buf, "__kmalloc") || - str_has_prefix(buf, "kmem_cache_alloc")) - goto found; + if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") || + !strncmp(buf, "__slab_free", len)) { + /* + * In case of tail calls from any of the below + * to any of the above. + */ + fallback = skipnr + 1; } + + /* Also the *_bulk() variants by only checking prefixes. */ + if (str_has_prefix(buf, "kfree") || + str_has_prefix(buf, "kmem_cache_free") || + str_has_prefix(buf, "__kmalloc") || + str_has_prefix(buf, "kmem_cache_alloc")) + goto found; } if (fallback < num_entries) return fallback; @@ -157,13 +156,20 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show, pr_cont(" ]"); } -void kfence_report_error(unsigned long address, const struct kfence_metadata *meta, - enum kfence_error_type type) +void kfence_report_error(unsigned long address, struct pt_regs *regs, + const struct kfence_metadata *meta, enum kfence_error_type type) { unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 }; - int num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1); - int skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type); const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1; + int num_stack_entries; + int skipnr = 0; + + if (regs) { + num_stack_entries = stack_trace_save_regs(regs, stack_entries, KFENCE_STACK_DEPTH, 0); + } else { + num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1); + skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type); + } /* Require non-NULL meta, except if KFENCE_ERROR_INVALID. */ if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta)) @@ -227,7 +233,10 @@ void kfence_report_error(unsigned long address, const struct kfence_metadata *me /* Print report footer. */ pr_err("\n"); - dump_stack_print_info(KERN_ERR); + if (IS_ENABLED(CONFIG_DEBUG_KERNEL) && regs) + show_regs(regs); + else + dump_stack_print_info(KERN_ERR); pr_err("==================================================================\n"); lockdep_on(); From d3fb45f370d927224af35d22d34ea465884afec8 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:19:11 -0800 Subject: [PATCH 060/118] mm, kfence: insert KFENCE hooks for SLAB Inserts KFENCE hooks into the SLAB allocator. To pass the originally requested size to KFENCE, add an argument 'orig_size' to slab_alloc*(). 
The additional argument is required to preserve the requested original size for kmalloc() allocations, which uses size classes (e.g. an allocation of 272 bytes will return an object of size 512). Therefore, kmem_cache::size does not represent the kmalloc-caller's requested size, and we must introduce the argument 'orig_size' to propagate the originally requested size to KFENCE. Without the originally requested size, we would not be able to detect out-of-bounds accesses for objects placed at the end of a KFENCE object page if that object is not equal to the kmalloc-size class it was bucketed into. When KFENCE is disabled, there is no additional overhead, since slab_alloc*() functions are __always_inline. Link: https://lkml.kernel.org/r/20201103175841.3495947-5-elver@google.com Signed-off-by: Marco Elver Signed-off-by: Alexander Potapenko Reviewed-by: Dmitry Vyukov Co-developed-by: Marco Elver Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Joern Engel Cc: Jonathan Corbet Cc: Kees Cook Cc: Mark Rutland Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: SeongJae Park Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab_def.h | 3 +++ mm/kfence/core.c | 2 ++ mm/slab.c | 38 +++++++++++++++++++++++++++++--------- mm/slab_common.c | 5 ++++- 4 files changed, 38 insertions(+), 10 deletions(-) diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 9eb430c163c2..3aa5e1e73ab6 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -2,6 +2,7 @@ #ifndef _LINUX_SLAB_DEF_H #define _LINUX_SLAB_DEF_H +#include #include /* @@ -114,6 +115,8 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache, static inline int objs_per_slab_page(const struct kmem_cache *cache, const struct page *page) { + if (is_kfence_address(page_address(page))) + return 1; return cache->num; } diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 61c76670a7a9..05c18aa11851 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -317,6 +317,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g /* Set required struct page fields. 
*/ page = virt_to_page(meta->addr); page->slab_cache = cache; + if (IS_ENABLED(CONFIG_SLAB)) + page->s_mem = addr; raw_spin_unlock_irqrestore(&meta->lock, flags); diff --git a/mm/slab.c b/mm/slab.c index 35c68d99d460..51fd424e0d6d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -100,6 +100,7 @@ #include #include #include +#include #include #include #include @@ -3208,7 +3209,7 @@ must_grow: } static __always_inline void * -slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, +slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size, unsigned long caller) { unsigned long save_flags; @@ -3221,6 +3222,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (unlikely(!cachep)) return NULL; + ptr = kfence_alloc(cachep, orig_size, flags); + if (unlikely(ptr)) + goto out_hooks; + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3253,6 +3258,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr) memset(ptr, 0, cachep->object_size); +out_hooks: slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr); return ptr; } @@ -3290,7 +3296,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) #endif /* CONFIG_NUMA */ static __always_inline void * -slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) +slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller) { unsigned long save_flags; void *objp; @@ -3301,6 +3307,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) if (unlikely(!cachep)) return NULL; + objp = kfence_alloc(cachep, orig_size, flags); + if (unlikely(objp)) + goto out; + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); @@ -3311,6 +3321,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp) memset(objp, 0, cachep->object_size); +out: slab_post_alloc_hook(cachep, objcg, flags, 1, &objp); return objp; } @@ -3416,6 +3427,12 @@ free_done: static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller) { + if (is_kfence_address(objp)) { + kmemleak_free_recursive(objp, cachep->flags); + __kfence_free(objp); + return; + } + if (unlikely(slab_want_init_on_free(cachep))) memset(objp, 0, cachep->object_size); @@ -3482,7 +3499,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - void *ret = slab_alloc(cachep, flags, _RET_IP_); + void *ret = slab_alloc(cachep, flags, cachep->object_size, _RET_IP_); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3515,7 +3532,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_irq_disable(); for (i = 0; i < size; i++) { - void *objp = __do_cache_alloc(s, flags); + void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags); if (unlikely(!objp)) goto error; @@ -3548,7 +3565,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) { void *ret; - ret = slab_alloc(cachep, flags, _RET_IP_); + ret = slab_alloc(cachep, flags, size, _RET_IP_); ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(_RET_IP_, ret, @@ -3574,7 +3591,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); */ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - 
void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, @@ -3592,7 +3609,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, { void *ret; - ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_); ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc_node(_RET_IP_, ret, @@ -3673,7 +3690,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - ret = slab_alloc(cachep, flags, caller); + ret = slab_alloc(cachep, flags, size, caller); ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(caller, ret, @@ -4172,7 +4189,10 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, BUG_ON(objnr >= cachep->num); /* Find offset within object. */ - offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep); + if (is_kfence_address(ptr)) + offset = ptr - kfence_object_start(ptr); + else + offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep); /* Allow address range falling entirely within usercopy region. */ if (offset >= cachep->useroffset && diff --git a/mm/slab_common.c b/mm/slab_common.c index 7c8298c17145..284954ef1da5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -430,6 +431,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) rcu_barrier(); list_for_each_entry_safe(s, s2, &to_destroy, list) { + kfence_shutdown_cache(s); #ifdef SLAB_SUPPORTS_SYSFS sysfs_slab_release(s); #else @@ -455,6 +457,7 @@ static int shutdown_cache(struct kmem_cache *s) list_add_tail(&s->list, &slab_caches_to_rcu_destroy); schedule_work(&slab_caches_to_rcu_destroy_work); } else { + kfence_shutdown_cache(s); #ifdef SLAB_SUPPORTS_SYSFS sysfs_slab_unlink(s); sysfs_slab_release(s); @@ -1235,7 +1238,7 @@ size_t ksize(const void *objp) if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) return 0; - size = __ksize(objp); + size = kfence_ksize(objp) ?: __ksize(objp); /* * We assume that ksize callers could use whole allocated area, * so we need to unpoison this area. From b89fb5ef0ce611b5db8eb9d3a5a7fcaab2cbe9e4 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:19:16 -0800 Subject: [PATCH 061/118] mm, kfence: insert KFENCE hooks for SLUB Inserts KFENCE hooks into the SLUB allocator. To pass the originally requested size to KFENCE, add an argument 'orig_size' to slab_alloc*(). The additional argument is required to preserve the requested original size for kmalloc() allocations, which uses size classes (e.g. an allocation of 272 bytes will return an object of size 512). Therefore, kmem_cache::size does not represent the kmalloc-caller's requested size, and we must introduce the argument 'orig_size' to propagate the originally requested size to KFENCE. Without the originally requested size, we would not be able to detect out-of-bounds accesses for objects placed at the end of a KFENCE object page if that object is not equal to the kmalloc-size class it was bucketed into. When KFENCE is disabled, there is no additional overhead, since slab_alloc*() functions are __always_inline. 
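
To make the size-class argument concrete, the following stand-alone sketch
(plain user-space C, hypothetical addresses, cache alignment constraints
ignored) shows the out-of-bounds window that would be lost if the object were
right-aligned by the kmalloc size class instead of the requested size:

	#include <stdio.h>

	#define PAGE_SIZE 4096UL

	/* Place an object of 'size' bytes so that it ends at the guard page. */
	static unsigned long place_right_aligned(unsigned long page, unsigned long size)
	{
		return page + PAGE_SIZE - size;
	}

	int main(void)
	{
		unsigned long page = 0x1000UL;	/* hypothetical KFENCE object page */

		/* kmalloc(272) is served from the kmalloc-512 size class. */
		printf("start with orig_size=272:   0x%lx\n", place_right_aligned(page, 272));
		printf("start with cache->size=512: 0x%lx\n", place_right_aligned(page, 512));
		/* The 240-byte difference is the OOB window that would go undetected. */
		return 0;
	}
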
Link: https://lkml.kernel.org/r/20201103175841.3495947-6-elver@google.com Signed-off-by: Marco Elver Signed-off-by: Alexander Potapenko Reviewed-by: Dmitry Vyukov Reviewed-by: Jann Horn Co-developed-by: Marco Elver Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christopher Lameter Cc: Dave Hansen Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joern Engel Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Paul E. McKenney Cc: Pekka Enberg Cc: Peter Zijlstra Cc: SeongJae Park Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 3 ++ mm/kfence/core.c | 2 ++ mm/slub.c | 60 ++++++++++++++++++++++++++++++---------- 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 1be0ed5befa1..dcde82a4434c 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -7,6 +7,7 @@ * * (C) 2007 SGI, Christoph Lameter */ +#include #include #include @@ -185,6 +186,8 @@ static inline unsigned int __obj_to_index(const struct kmem_cache *cache, static inline unsigned int obj_to_index(const struct kmem_cache *cache, const struct page *page, void *obj) { + if (is_kfence_address(obj)) + return 0; return __obj_to_index(cache, page_address(page), obj); } diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 05c18aa11851..7692af715fdb 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -317,6 +317,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g /* Set required struct page fields. */ page = virt_to_page(meta->addr); page->slab_cache = cache; + if (IS_ENABLED(CONFIG_SLUB)) + page->objects = 1; if (IS_ENABLED(CONFIG_SLAB)) page->s_mem = addr; diff --git a/mm/slub.c b/mm/slub.c index b2833ce85c92..383616af28c4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1570,6 +1571,11 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, void *old_tail = *tail ? *tail : *head; int rsize; + if (is_kfence_address(next)) { + slab_free_hook(s, next); + return true; + } + /* Head and tail of the reconstructed freelist */ *head = NULL; *tail = NULL; @@ -2809,7 +2815,7 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, * Otherwise we can simply pick the next object from the lockless free list. */ static __always_inline void *slab_alloc_node(struct kmem_cache *s, - gfp_t gfpflags, int node, unsigned long addr) + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { void *object; struct kmem_cache_cpu *c; @@ -2820,6 +2826,11 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags); if (!s) return NULL; + + object = kfence_alloc(s, orig_size, gfpflags); + if (unlikely(object)) + goto out; + redo: /* * Must read kmem_cache cpu data via this cpu ptr. 
Preemption is @@ -2892,20 +2903,21 @@ redo: if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) memset(kasan_reset_tag(object), 0, s->object_size); +out: slab_post_alloc_hook(s, objcg, gfpflags, 1, &object); return object; } static __always_inline void *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, unsigned long addr) + gfp_t gfpflags, unsigned long addr, size_t orig_size) { - return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); + return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size); } void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - void *ret = slab_alloc(s, gfpflags, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size); trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); @@ -2917,7 +2929,7 @@ EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { - void *ret = slab_alloc(s, gfpflags, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, _RET_IP_, size); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; @@ -2928,7 +2940,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, s->object_size); trace_kmem_cache_alloc_node(_RET_IP_, ret, s->object_size, s->size, gfpflags, node); @@ -2942,7 +2954,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) { - void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, size); trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); @@ -2976,6 +2988,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_SLOWPATH); + if (kfence_free(head)) + return; + if (kmem_cache_debug(s) && !free_debug_processing(s, page, head, tail, cnt, addr)) return; @@ -3220,6 +3235,13 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, df->s = cache_from_obj(s, object); /* Support for memcg */ } + if (is_kfence_address(object)) { + slab_free_hook(df->s, object); + __kfence_free(object); + p[size] = NULL; /* mark object processed */ + return size; + } + /* Start new detached freelist */ df->page = page; set_freepointer(df->s, object, NULL); @@ -3295,8 +3317,14 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, c = this_cpu_ptr(s->cpu_slab); for (i = 0; i < size; i++) { - void *object = c->freelist; + void *object = kfence_alloc(s, s->object_size, flags); + if (unlikely(object)) { + p[i] = object; + continue; + } + + object = c->freelist; if (unlikely(!object)) { /* * We may have removed an object from c->freelist using @@ -4021,7 +4049,7 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, flags, _RET_IP_); + ret = slab_alloc(s, flags, _RET_IP_, size); trace_kmalloc(_RET_IP_, ret, size, s->size, flags); @@ -4069,7 +4097,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc_node(s, flags, node, _RET_IP_); + ret = slab_alloc_node(s, flags, node, _RET_IP_, size); trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); @@ -4095,6 +4123,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, struct kmem_cache *s; 
unsigned int offset; size_t object_size; + bool is_kfence = is_kfence_address(ptr); ptr = kasan_reset_tag(ptr); @@ -4107,10 +4136,13 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, to_user, 0, n); /* Find offset within object. */ - offset = (ptr - page_address(page)) % s->size; + if (is_kfence) + offset = ptr - kfence_object_start(ptr); + else + offset = (ptr - page_address(page)) % s->size; /* Adjust for redzone and reject if within the redzone. */ - if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) { + if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) { if (offset < s->red_left_pad) usercopy_abort("SLUB object in left red zone", s->name, to_user, offset, n); @@ -4527,7 +4559,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, gfpflags, caller); + ret = slab_alloc(s, gfpflags, caller, size); /* Honor the call site pointer we received. */ trace_kmalloc(caller, ret, size, s->size, gfpflags); @@ -4558,7 +4590,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc_node(s, gfpflags, node, caller); + ret = slab_alloc_node(s, gfpflags, node, caller, size); /* Honor the call site pointer we received. */ trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); From 2b8305260fb37fc20e13f71e13073304d0a031c8 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:19:21 -0800 Subject: [PATCH 062/118] kfence, kasan: make KFENCE compatible with KASAN Make KFENCE compatible with KASAN. Currently this helps test KFENCE itself, where KASAN can catch potential corruptions to KFENCE state, or other corruptions that may be a result of freepointer corruptions in the main allocators. [akpm@linux-foundation.org: merge fixup] [andreyknvl@google.com: untag addresses for KFENCE] Link: https://lkml.kernel.org/r/9dc196006921b191d25d10f6e611316db7da2efc.1611946152.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/20201103175841.3495947-7-elver@google.com Signed-off-by: Marco Elver Signed-off-by: Alexander Potapenko Signed-off-by: Andrey Konovalov Reviewed-by: Dmitry Vyukov Reviewed-by: Jann Horn Co-developed-by: Marco Elver Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christopher Lameter Cc: Dave Hansen Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joern Engel Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Paul E. 
McKenney Cc: Pekka Enberg Cc: Peter Zijlstra Cc: SeongJae Park Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.kfence | 2 +- mm/kasan/common.c | 6 ++++++ mm/kasan/generic.c | 3 ++- mm/kasan/kasan.h | 21 ++++++++++++++++++--- mm/kasan/shadow.c | 13 +++++++++++++ 5 files changed, 40 insertions(+), 5 deletions(-) diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence index b88ac9d6b2e6..edfecb5d6165 100644 --- a/lib/Kconfig.kfence +++ b/lib/Kconfig.kfence @@ -5,7 +5,7 @@ config HAVE_ARCH_KFENCE menuconfig KFENCE bool "KFENCE: low-overhead sampling-based memory safety error detector" - depends on HAVE_ARCH_KFENCE && !KASAN && (SLAB || SLUB) + depends on HAVE_ARCH_KFENCE && (SLAB || SLUB) select STACKTRACE help KFENCE is a low-overhead sampling-based detector of heap out-of-bounds diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b18189ef3a92..af1768c4fee5 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -335,6 +335,9 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, tagged_object = object; object = kasan_reset_tag(object); + if (is_kfence_address(object)) + return false; + if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) != object)) { kasan_report_invalid_free(tagged_object, ip); @@ -413,6 +416,9 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, if (unlikely(object == NULL)) return NULL; + if (is_kfence_address(kasan_reset_tag(object))) + return (void *)object; + redzone_start = round_up((unsigned long)(object + size), KASAN_GRANULE_SIZE); redzone_end = round_up((unsigned long)object + cache->object_size, diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 3f17a1218055..2e55e0f82f39 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -331,7 +332,7 @@ void kasan_record_aux_stack(void *addr) struct kasan_alloc_meta *alloc_meta; void *object; - if (!(page && PageSlab(page))) + if (is_kfence_address(addr) || !(page && PageSlab(page))) return; cache = page->slab_cache; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cc14b6e6c14c..fb883740fd27 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -3,6 +3,7 @@ #define __MM_KASAN_KASAN_H #include +#include #include #ifdef CONFIG_KASAN_HW_TAGS @@ -331,14 +332,28 @@ static inline u8 kasan_random_tag(void) { return 0; } static inline void kasan_poison(const void *address, size_t size, u8 value) { - hw_set_mem_tag_range(kasan_reset_tag(address), + address = kasan_reset_tag(address); + + /* Skip KFENCE memory if called explicitly outside of sl*b. */ + if (is_kfence_address(address)) + return; + + hw_set_mem_tag_range((void *)address, round_up(size, KASAN_GRANULE_SIZE), value); } static inline void kasan_unpoison(const void *address, size_t size) { - hw_set_mem_tag_range(kasan_reset_tag(address), - round_up(size, KASAN_GRANULE_SIZE), get_tag(address)); + u8 tag = get_tag(address); + + address = kasan_reset_tag(address); + + /* Skip KFENCE memory if called explicitly outside of sl*b. 
*/ + if (is_kfence_address(address)) + return; + + hw_set_mem_tag_range((void *)address, + round_up(size, KASAN_GRANULE_SIZE), tag); } static inline bool kasan_byte_accessible(const void *addr) diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 80adc85d0393..1372a2fc0ca9 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -84,6 +85,10 @@ void kasan_poison(const void *address, size_t size, u8 value) address = kasan_reset_tag(address); size = round_up(size, KASAN_GRANULE_SIZE); + /* Skip KFENCE memory if called explicitly outside of sl*b. */ + if (is_kfence_address(address)) + return; + shadow_start = kasan_mem_to_shadow(address); shadow_end = kasan_mem_to_shadow(address + size); @@ -102,6 +107,14 @@ void kasan_unpoison(const void *address, size_t size) */ address = kasan_reset_tag(address); + /* + * Skip KFENCE memory if called explicitly outside of sl*b. Also note + * that calls to ksize(), where size is not a multiple of machine-word + * size, would otherwise poison the invalid portion of the word. + */ + if (is_kfence_address(address)) + return; + kasan_poison(address, size, tag); if (size & KASAN_GRANULE_MASK) { From 10efe55f883f2396a0024891ad1d7d5d040364b3 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 25 Feb 2021 17:19:26 -0800 Subject: [PATCH 063/118] kfence, Documentation: add KFENCE documentation Add KFENCE documentation in dev-tools/kfence.rst, and add to index. [elver@google.com: add missing copyright header to documentation] Link: https://lkml.kernel.org/r/20210118092159.145934-4-elver@google.com Link: https://lkml.kernel.org/r/20201103175841.3495947-8-elver@google.com Signed-off-by: Alexander Potapenko Signed-off-by: Marco Elver Reviewed-by: Dmitry Vyukov Co-developed-by: Alexander Potapenko Reviewed-by: Jann Horn Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christopher Lameter Cc: Dave Hansen Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joern Engel Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Paul E. McKenney Cc: Pekka Enberg Cc: Peter Zijlstra Cc: SeongJae Park Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/index.rst | 1 + Documentation/dev-tools/kfence.rst | 298 +++++++++++++++++++++++++++++ lib/Kconfig.kfence | 2 + 3 files changed, 301 insertions(+) create mode 100644 Documentation/dev-tools/kfence.rst diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst index f7809c7b1ba9..1b1cf4f5c9d9 100644 --- a/Documentation/dev-tools/index.rst +++ b/Documentation/dev-tools/index.rst @@ -22,6 +22,7 @@ whole; patches welcome! ubsan kmemleak kcsan + kfence gdb-kernel-debugging kgdb kselftest diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst new file mode 100644 index 000000000000..0e2fb6ef3016 --- /dev/null +++ b/Documentation/dev-tools/kfence.rst @@ -0,0 +1,298 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. Copyright (C) 2020, Google LLC. + +Kernel Electric-Fence (KFENCE) +============================== + +Kernel Electric-Fence (KFENCE) is a low-overhead sampling-based memory safety +error detector. KFENCE detects heap out-of-bounds access, use-after-free, and +invalid-free errors. 
+ +KFENCE is designed to be enabled in production kernels, and has near zero +performance overhead. Compared to KASAN, KFENCE trades performance for +precision. The main motivation behind KFENCE's design, is that with enough +total uptime KFENCE will detect bugs in code paths not typically exercised by +non-production test workloads. One way to quickly achieve a large enough total +uptime is when the tool is deployed across a large fleet of machines. + +Usage +----- + +To enable KFENCE, configure the kernel with:: + + CONFIG_KFENCE=y + +To build a kernel with KFENCE support, but disabled by default (to enable, set +``kfence.sample_interval`` to non-zero value), configure the kernel with:: + + CONFIG_KFENCE=y + CONFIG_KFENCE_SAMPLE_INTERVAL=0 + +KFENCE provides several other configuration options to customize behaviour (see +the respective help text in ``lib/Kconfig.kfence`` for more info). + +Tuning performance +~~~~~~~~~~~~~~~~~~ + +The most important parameter is KFENCE's sample interval, which can be set via +the kernel boot parameter ``kfence.sample_interval`` in milliseconds. The +sample interval determines the frequency with which heap allocations will be +guarded by KFENCE. The default is configurable via the Kconfig option +``CONFIG_KFENCE_SAMPLE_INTERVAL``. Setting ``kfence.sample_interval=0`` +disables KFENCE. + +The KFENCE memory pool is of fixed size, and if the pool is exhausted, no +further KFENCE allocations occur. With ``CONFIG_KFENCE_NUM_OBJECTS`` (default +255), the number of available guarded objects can be controlled. Each object +requires 2 pages, one for the object itself and the other one used as a guard +page; object pages are interleaved with guard pages, and every object page is +therefore surrounded by two guard pages. + +The total memory dedicated to the KFENCE memory pool can be computed as:: + + ( #objects + 1 ) * 2 * PAGE_SIZE + +Using the default config, and assuming a page size of 4 KiB, results in +dedicating 2 MiB to the KFENCE memory pool. + +Note: On architectures that support huge pages, KFENCE will ensure that the +pool is using pages of size ``PAGE_SIZE``. This will result in additional page +tables being allocated. + +Error reports +~~~~~~~~~~~~~ + +A typical out-of-bounds access looks like this:: + + ================================================================== + BUG: KFENCE: out-of-bounds in test_out_of_bounds_read+0xa3/0x22b + + Out-of-bounds access at 0xffffffffb672efff (1B left of kfence-#17): + test_out_of_bounds_read+0xa3/0x22b + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + kfence-#17 [0xffffffffb672f000-0xffffffffb672f01f, size=32, cache=kmalloc-32] allocated by task 507: + test_alloc+0xf3/0x25b + test_out_of_bounds_read+0x98/0x22b + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + CPU: 4 PID: 107 Comm: kunit_try_catch Not tainted 5.8.0-rc6+ #7 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 + ================================================================== + +The header of the report provides a short summary of the function involved in +the access. It is followed by more detailed information about the access and +its origin. Note that, real kernel addresses are only shown for +``CONFIG_DEBUG_KERNEL=y`` builds. 
+ +Use-after-free accesses are reported as:: + + ================================================================== + BUG: KFENCE: use-after-free in test_use_after_free_read+0xb3/0x143 + + Use-after-free access at 0xffffffffb673dfe0 (in kfence-#24): + test_use_after_free_read+0xb3/0x143 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + kfence-#24 [0xffffffffb673dfe0-0xffffffffb673dfff, size=32, cache=kmalloc-32] allocated by task 507: + test_alloc+0xf3/0x25b + test_use_after_free_read+0x76/0x143 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + freed by task 507: + test_use_after_free_read+0xa8/0x143 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + CPU: 4 PID: 109 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 + ================================================================== + +KFENCE also reports on invalid frees, such as double-frees:: + + ================================================================== + BUG: KFENCE: invalid free in test_double_free+0xdc/0x171 + + Invalid free of 0xffffffffb6741000: + test_double_free+0xdc/0x171 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + kfence-#26 [0xffffffffb6741000-0xffffffffb674101f, size=32, cache=kmalloc-32] allocated by task 507: + test_alloc+0xf3/0x25b + test_double_free+0x76/0x171 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + freed by task 507: + test_double_free+0xa8/0x171 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + CPU: 4 PID: 111 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 + ================================================================== + +KFENCE also uses pattern-based redzones on the other side of an object's guard +page, to detect out-of-bounds writes on the unprotected side of the object. +These are reported on frees:: + + ================================================================== + BUG: KFENCE: memory corruption in test_kmalloc_aligned_oob_write+0xef/0x184 + + Corrupted memory at 0xffffffffb6797ff9 [ 0xac . . . . . . ] (in kfence-#69): + test_kmalloc_aligned_oob_write+0xef/0x184 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + kfence-#69 [0xffffffffb6797fb0-0xffffffffb6797ff8, size=73, cache=kmalloc-96] allocated by task 507: + test_alloc+0xf3/0x25b + test_kmalloc_aligned_oob_write+0x57/0x184 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + CPU: 4 PID: 120 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 + ================================================================== + +For such errors, the address where the corruption occurred as well as the +invalidly written bytes (offset from the address) are shown; in this +representation, '.' denote untouched bytes. 
In the example above ``0xac`` is +the value written to the invalid address at offset 0, and the remaining '.' +denote that no following bytes have been touched. Note that, real values are +only shown for ``CONFIG_DEBUG_KERNEL=y`` builds; to avoid information +disclosure for non-debug builds, '!' is used instead to denote invalidly +written bytes. + +And finally, KFENCE may also report on invalid accesses to any protected page +where it was not possible to determine an associated object, e.g. if adjacent +object pages had not yet been allocated:: + + ================================================================== + BUG: KFENCE: invalid access in test_invalid_access+0x26/0xe0 + + Invalid access at 0xffffffffb670b00a: + test_invalid_access+0x26/0xe0 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + CPU: 4 PID: 124 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 + ================================================================== + +DebugFS interface +~~~~~~~~~~~~~~~~~ + +Some debugging information is exposed via debugfs: + +* The file ``/sys/kernel/debug/kfence/stats`` provides runtime statistics. + +* The file ``/sys/kernel/debug/kfence/objects`` provides a list of objects + allocated via KFENCE, including those already freed but protected. + +Implementation Details +---------------------- + +Guarded allocations are set up based on the sample interval. After expiration +of the sample interval, the next allocation through the main allocator (SLAB or +SLUB) returns a guarded allocation from the KFENCE object pool (allocation +sizes up to PAGE_SIZE are supported). At this point, the timer is reset, and +the next allocation is set up after the expiration of the interval. To "gate" a +KFENCE allocation through the main allocator's fast-path without overhead, +KFENCE relies on static branches via the static keys infrastructure. The static +branch is toggled to redirect the allocation to KFENCE. + +KFENCE objects each reside on a dedicated page, at either the left or right +page boundaries selected at random. The pages to the left and right of the +object page are "guard pages", whose attributes are changed to a protected +state, and cause page faults on any attempted access. Such page faults are then +intercepted by KFENCE, which handles the fault gracefully by reporting an +out-of-bounds access, and marking the page as accessible so that the faulting +code can (wrongly) continue executing (set ``panic_on_warn`` to panic instead). + +To detect out-of-bounds writes to memory within the object's page itself, +KFENCE also uses pattern-based redzones. For each object page, a redzone is set +up for all non-object memory. For typical alignments, the redzone is only +required on the unguarded side of an object. Because KFENCE must honor the +cache's requested alignment, special alignments may result in unprotected gaps +on either side of an object, all of which are redzoned. 
+ +The following figure illustrates the page layout:: + + ---+-----------+-----------+-----------+-----------+-----------+--- + | xxxxxxxxx | O : | xxxxxxxxx | : O | xxxxxxxxx | + | xxxxxxxxx | B : | xxxxxxxxx | : B | xxxxxxxxx | + | x GUARD x | J : RED- | x GUARD x | RED- : J | x GUARD x | + | xxxxxxxxx | E : ZONE | xxxxxxxxx | ZONE : E | xxxxxxxxx | + | xxxxxxxxx | C : | xxxxxxxxx | : C | xxxxxxxxx | + | xxxxxxxxx | T : | xxxxxxxxx | : T | xxxxxxxxx | + ---+-----------+-----------+-----------+-----------+-----------+--- + +Upon deallocation of a KFENCE object, the object's page is again protected and +the object is marked as freed. Any further access to the object causes a fault +and KFENCE reports a use-after-free access. Freed objects are inserted at the +tail of KFENCE's freelist, so that the least recently freed objects are reused +first, and the chances of detecting use-after-frees of recently freed objects +is increased. + +Interface +--------- + +The following describes the functions which are used by allocators as well as +page handling code to set up and deal with KFENCE allocations. + +.. kernel-doc:: include/linux/kfence.h + :functions: is_kfence_address + kfence_shutdown_cache + kfence_alloc kfence_free __kfence_free + kfence_ksize kfence_object_start + kfence_handle_page_fault + +Related Tools +------------- + +In userspace, a similar approach is taken by `GWP-ASan +`_. GWP-ASan also relies on guard pages and +a sampling strategy to detect memory unsafety bugs at scale. KFENCE's design is +directly influenced by GWP-ASan, and can be seen as its kernel sibling. Another +similar but non-sampling approach, that also inspired the name "KFENCE", can be +found in the userspace `Electric Fence Malloc Debugger +`_. + +In the kernel, several tools exist to debug memory access errors, and in +particular KASAN can detect all bug classes that KFENCE can detect. While KASAN +is more precise, relying on compiler instrumentation, this comes at a +performance cost. + +It is worth highlighting that KASAN and KFENCE are complementary, with +different target environments. For instance, KASAN is the better debugging-aid, +where test cases or reproducers exists: due to the lower chance to detect the +error, it would require more effort using KFENCE to debug. Deployments at scale +that cannot afford to enable KASAN, however, would benefit from using KFENCE to +discover bugs due to code paths not exercised by test cases or fuzzers. diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence index edfecb5d6165..605125ac2ae0 100644 --- a/lib/Kconfig.kfence +++ b/lib/Kconfig.kfence @@ -13,6 +13,8 @@ menuconfig KFENCE to have negligible cost to permit enabling it in production environments. + See for more details. + Note that, KFENCE is not a substitute for explicit testing with tools such as KASAN. KFENCE can detect a subset of bugs that KASAN can detect, albeit at very different performance profiles. If you can From bc8fbc5f305aecf63423da91e5faf4c0ce40bf38 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 25 Feb 2021 17:19:31 -0800 Subject: [PATCH 064/118] kfence: add test suite Add KFENCE test suite, testing various error detection scenarios. Makes use of KUnit for test organization. Since KFENCE's interface to obtain error reports is via the console, the test verifies that KFENCE outputs expected reports to the console. 
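As a rough illustration, a minimal configuration fragment for running the suite
could look like this (option names are the ones introduced by this series;
everything else depends on the target tree):

    # Illustrative .kunitconfig-style fragment. KFENCE_KUNIT_TEST additionally
    # depends on TRACEPOINTS, which is usually already enabled when tracing is.
    CONFIG_KUNIT=y
    CONFIG_KFENCE=y
    CONFIG_KFENCE_KUNIT_TEST=y

Built in (=y), the tests run during boot; built as a module (=m), they run when
the module is loaded.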
[elver@google.com: fix typo in test] Link: https://lkml.kernel.org/r/X9lHQExmHGvETxY4@elver.google.com [elver@google.com: show access type in report] Link: https://lkml.kernel.org/r/20210111091544.3287013-2-elver@google.com Link: https://lkml.kernel.org/r/20201103175841.3495947-9-elver@google.com Signed-off-by: Alexander Potapenko Signed-off-by: Marco Elver Reviewed-by: Dmitry Vyukov Co-developed-by: Alexander Potapenko Reviewed-by: Jann Horn Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christopher Lameter Cc: Dave Hansen Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joern Engel Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Paul E. McKenney Cc: Pekka Enberg Cc: Peter Zijlstra Cc: SeongJae Park Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kfence.rst | 12 +- arch/arm64/mm/fault.c | 2 +- arch/x86/mm/fault.c | 3 +- include/linux/kfence.h | 9 +- lib/Kconfig.kfence | 13 + mm/kfence/Makefile | 3 + mm/kfence/core.c | 11 +- mm/kfence/kfence.h | 2 +- mm/kfence/kfence_test.c | 858 +++++++++++++++++++++++++++++ mm/kfence/report.c | 27 +- 10 files changed, 915 insertions(+), 25 deletions(-) create mode 100644 mm/kfence/kfence_test.c diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst index 0e2fb6ef3016..58a0a5fa1ddc 100644 --- a/Documentation/dev-tools/kfence.rst +++ b/Documentation/dev-tools/kfence.rst @@ -65,9 +65,9 @@ Error reports A typical out-of-bounds access looks like this:: ================================================================== - BUG: KFENCE: out-of-bounds in test_out_of_bounds_read+0xa3/0x22b + BUG: KFENCE: out-of-bounds read in test_out_of_bounds_read+0xa3/0x22b - Out-of-bounds access at 0xffffffffb672efff (1B left of kfence-#17): + Out-of-bounds read at 0xffffffffb672efff (1B left of kfence-#17): test_out_of_bounds_read+0xa3/0x22b kunit_try_run_case+0x51/0x85 kunit_generic_run_threadfn_adapter+0x16/0x30 @@ -94,9 +94,9 @@ its origin. Note that, real kernel addresses are only shown for Use-after-free accesses are reported as:: ================================================================== - BUG: KFENCE: use-after-free in test_use_after_free_read+0xb3/0x143 + BUG: KFENCE: use-after-free read in test_use_after_free_read+0xb3/0x143 - Use-after-free access at 0xffffffffb673dfe0 (in kfence-#24): + Use-after-free read at 0xffffffffb673dfe0 (in kfence-#24): test_use_after_free_read+0xb3/0x143 kunit_try_run_case+0x51/0x85 kunit_generic_run_threadfn_adapter+0x16/0x30 @@ -193,9 +193,9 @@ where it was not possible to determine an associated object, e.g. 
if adjacent object pages had not yet been allocated:: ================================================================== - BUG: KFENCE: invalid access in test_invalid_access+0x26/0xe0 + BUG: KFENCE: invalid read in test_invalid_access+0x26/0xe0 - Invalid access at 0xffffffffb670b00a: + Invalid read at 0xffffffffb670b00a: test_invalid_access+0x26/0xe0 kunit_try_run_case+0x51/0x85 kunit_generic_run_threadfn_adapter+0x16/0x30 diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 56d9423ca59c..f37d4e3830b7 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -390,7 +390,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr, } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; } else { - if (kfence_handle_page_fault(addr, regs)) + if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) return; msg = "paging request"; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 38868b4ce8b0..a73347e2cdfc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -682,7 +682,8 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code, efi_crash_gracefully_on_page_fault(address); /* Only not-present faults should be handled by KFENCE. */ - if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address, regs)) + if (!(error_code & X86_PF_PROT) && + kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs)) return; oops: diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 5a56bcf5606c..a70d1ea03532 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -186,6 +186,7 @@ static __always_inline __must_check bool kfence_free(void *addr) /** * kfence_handle_page_fault() - perform page fault handling for KFENCE pages * @addr: faulting address + * @is_write: is access a write * @regs: current struct pt_regs (can be NULL, but shows full stack trace) * * Return: @@ -197,7 +198,7 @@ static __always_inline __must_check bool kfence_free(void *addr) * cases KFENCE prints an error message and marks the offending page as * present, so that the kernel can proceed. */ -bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs); +bool __must_check kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs); #else /* CONFIG_KFENCE */ @@ -210,7 +211,11 @@ static inline size_t kfence_ksize(const void *addr) { return 0; } static inline void *kfence_object_start(const void *addr) { return NULL; } static inline void __kfence_free(void *addr) { } static inline bool __must_check kfence_free(void *addr) { return false; } -static inline bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs) { return false; } +static inline bool __must_check kfence_handle_page_fault(unsigned long addr, bool is_write, + struct pt_regs *regs) +{ + return false; +} #endif diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence index 605125ac2ae0..78f50ccb3b45 100644 --- a/lib/Kconfig.kfence +++ b/lib/Kconfig.kfence @@ -66,4 +66,17 @@ config KFENCE_STRESS_TEST_FAULTS Only for KFENCE testing; set to 0 if you are not a KFENCE developer. +config KFENCE_KUNIT_TEST + tristate "KFENCE integration test suite" if !KUNIT_ALL_TESTS + default KUNIT_ALL_TESTS + depends on TRACEPOINTS && KUNIT + help + Test suite for KFENCE, testing various error detection scenarios with + various allocation types, and checking that reports are correctly + output to console. 
+ + Say Y here if you want the test to be built into the kernel and run + during boot; say M if you want the test to build as a module; say N + if you are unsure. + endif # KFENCE diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile index d991e9a349f0..6872cd5e5390 100644 --- a/mm/kfence/Makefile +++ b/mm/kfence/Makefile @@ -1,3 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_KFENCE) := core.o report.o + +CFLAGS_kfence_test.o := -g -fno-omit-frame-pointer -fno-optimize-sibling-calls +obj-$(CONFIG_KFENCE_KUNIT_TEST) += kfence_test.o diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 7692af715fdb..cfe3d32ac5b7 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -216,7 +216,7 @@ static inline bool check_canary_byte(u8 *addr) return true; atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); - kfence_report_error((unsigned long)addr, NULL, addr_to_metadata((unsigned long)addr), + kfence_report_error((unsigned long)addr, false, NULL, addr_to_metadata((unsigned long)addr), KFENCE_ERROR_CORRUPTION); return false; } @@ -355,7 +355,8 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) { /* Invalid or double-free, bail out. */ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); - kfence_report_error((unsigned long)addr, NULL, meta, KFENCE_ERROR_INVALID_FREE); + kfence_report_error((unsigned long)addr, false, NULL, meta, + KFENCE_ERROR_INVALID_FREE); raw_spin_unlock_irqrestore(&meta->lock, flags); return; } @@ -770,7 +771,7 @@ void __kfence_free(void *addr) kfence_guarded_free(addr, meta, false); } -bool kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs) +bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs) { const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE; struct kfence_metadata *to_report = NULL; @@ -833,11 +834,11 @@ bool kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs) out: if (to_report) { - kfence_report_error(addr, regs, to_report, error_type); + kfence_report_error(addr, is_write, regs, to_report, error_type); raw_spin_unlock_irqrestore(&to_report->lock, flags); } else { /* This may be a UAF or OOB access, but we can't be sure. */ - kfence_report_error(addr, regs, NULL, KFENCE_ERROR_INVALID); + kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID); } return kfence_unprotect(addr); /* Unprotect and let access proceed. */ diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 0d83e628a97d..1accc840dbbe 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -105,7 +105,7 @@ enum kfence_error_type { KFENCE_ERROR_INVALID_FREE, /* Invalid free. */ }; -void kfence_report_error(unsigned long address, struct pt_regs *regs, +void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, const struct kfence_metadata *meta, enum kfence_error_type type); void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta); diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c new file mode 100644 index 000000000000..db1bb596acaf --- /dev/null +++ b/mm/kfence/kfence_test.c @@ -0,0 +1,858 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test cases for KFENCE memory safety error detector. Since the interface with + * which KFENCE's reports are obtained is via the console, this is the output we + * should verify. For each test case checks the presence (or absence) of + * generated reports. 
Relies on 'console' tracepoint to capture reports as they + * appear in the kernel log. + * + * Copyright (C) 2020, Google LLC. + * Author: Alexander Potapenko + * Marco Elver + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kfence.h" + +/* Report as observed from console. */ +static struct { + spinlock_t lock; + int nlines; + char lines[2][256]; +} observed = { + .lock = __SPIN_LOCK_UNLOCKED(observed.lock), +}; + +/* Probe for console output: obtains observed lines of interest. */ +static void probe_console(void *ignore, const char *buf, size_t len) +{ + unsigned long flags; + int nlines; + + spin_lock_irqsave(&observed.lock, flags); + nlines = observed.nlines; + + if (strnstr(buf, "BUG: KFENCE: ", len) && strnstr(buf, "test_", len)) { + /* + * KFENCE report and related to the test. + * + * The provided @buf is not NUL-terminated; copy no more than + * @len bytes and let strscpy() add the missing NUL-terminator. + */ + strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0]))); + nlines = 1; + } else if (nlines == 1 && (strnstr(buf, "at 0x", len) || strnstr(buf, "of 0x", len))) { + strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0]))); + } + + WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */ + spin_unlock_irqrestore(&observed.lock, flags); +} + +/* Check if a report related to the test exists. */ +static bool report_available(void) +{ + return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines); +} + +/* Information we expect in a report. */ +struct expect_report { + enum kfence_error_type type; /* The type or error. */ + void *fn; /* Function pointer to expected function where access occurred. */ + char *addr; /* Address at which the bad access occurred. */ + bool is_write; /* Is access a write. */ +}; + +static const char *get_access_type(const struct expect_report *r) +{ + return r->is_write ? "write" : "read"; +} + +/* Check observed report matches information in @r. */ +static bool report_matches(const struct expect_report *r) +{ + bool ret = false; + unsigned long flags; + typeof(observed.lines) expect; + const char *end; + char *cur; + + /* Doubled-checked locking. */ + if (!report_available()) + return false; + + /* Generate expected report contents. */ + + /* Title */ + cur = expect[0]; + end = &expect[0][sizeof(expect[0]) - 1]; + switch (r->type) { + case KFENCE_ERROR_OOB: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: out-of-bounds %s", + get_access_type(r)); + break; + case KFENCE_ERROR_UAF: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: use-after-free %s", + get_access_type(r)); + break; + case KFENCE_ERROR_CORRUPTION: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: memory corruption"); + break; + case KFENCE_ERROR_INVALID: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid %s", + get_access_type(r)); + break; + case KFENCE_ERROR_INVALID_FREE: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid free"); + break; + } + + scnprintf(cur, end - cur, " in %pS", r->fn); + /* The exact offset won't match, remove it; also strip module name. 
*/ + cur = strchr(expect[0], '+'); + if (cur) + *cur = '\0'; + + /* Access information */ + cur = expect[1]; + end = &expect[1][sizeof(expect[1]) - 1]; + + switch (r->type) { + case KFENCE_ERROR_OOB: + cur += scnprintf(cur, end - cur, "Out-of-bounds %s at", get_access_type(r)); + break; + case KFENCE_ERROR_UAF: + cur += scnprintf(cur, end - cur, "Use-after-free %s at", get_access_type(r)); + break; + case KFENCE_ERROR_CORRUPTION: + cur += scnprintf(cur, end - cur, "Corrupted memory at"); + break; + case KFENCE_ERROR_INVALID: + cur += scnprintf(cur, end - cur, "Invalid %s at", get_access_type(r)); + break; + case KFENCE_ERROR_INVALID_FREE: + cur += scnprintf(cur, end - cur, "Invalid free of"); + break; + } + + cur += scnprintf(cur, end - cur, " 0x" PTR_FMT, (void *)r->addr); + + spin_lock_irqsave(&observed.lock, flags); + if (!report_available()) + goto out; /* A new report is being captured. */ + + /* Finally match expected output to what we actually observed. */ + ret = strstr(observed.lines[0], expect[0]) && strstr(observed.lines[1], expect[1]); +out: + spin_unlock_irqrestore(&observed.lock, flags); + return ret; +} + +/* ===== Test cases ===== */ + +#define TEST_PRIV_WANT_MEMCACHE ((void *)1) + +/* Cache used by tests; if NULL, allocate from kmalloc instead. */ +static struct kmem_cache *test_cache; + +static size_t setup_test_cache(struct kunit *test, size_t size, slab_flags_t flags, + void (*ctor)(void *)) +{ + if (test->priv != TEST_PRIV_WANT_MEMCACHE) + return size; + + kunit_info(test, "%s: size=%zu, ctor=%ps\n", __func__, size, ctor); + + /* + * Use SLAB_NOLEAKTRACE to prevent merging with existing caches. Any + * other flag in SLAB_NEVER_MERGE also works. Use SLAB_ACCOUNT to + * allocate via memcg, if enabled. + */ + flags |= SLAB_NOLEAKTRACE | SLAB_ACCOUNT; + test_cache = kmem_cache_create("test", size, 1, flags, ctor); + KUNIT_ASSERT_TRUE_MSG(test, test_cache, "could not create cache"); + + return size; +} + +static void test_cache_destroy(void) +{ + if (!test_cache) + return; + + kmem_cache_destroy(test_cache); + test_cache = NULL; +} + +static inline size_t kmalloc_cache_alignment(size_t size) +{ + return kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)]->align; +} + +/* Must always inline to match stack trace against caller. */ +static __always_inline void test_free(void *ptr) +{ + if (test_cache) + kmem_cache_free(test_cache, ptr); + else + kfree(ptr); +} + +/* + * If this should be a KFENCE allocation, and on which side the allocation and + * the closest guard page should be. + */ +enum allocation_policy { + ALLOCATE_ANY, /* KFENCE, any side. */ + ALLOCATE_LEFT, /* KFENCE, left side of page. */ + ALLOCATE_RIGHT, /* KFENCE, right side of page. */ + ALLOCATE_NONE, /* No KFENCE allocation. */ +}; + +/* + * Try to get a guarded allocation from KFENCE. Uses either kmalloc() or the + * current test_cache if set up. 
+ */ +static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocation_policy policy) +{ + void *alloc; + unsigned long timeout, resched_after; + const char *policy_name; + + switch (policy) { + case ALLOCATE_ANY: + policy_name = "any"; + break; + case ALLOCATE_LEFT: + policy_name = "left"; + break; + case ALLOCATE_RIGHT: + policy_name = "right"; + break; + case ALLOCATE_NONE: + policy_name = "none"; + break; + } + + kunit_info(test, "%s: size=%zu, gfp=%x, policy=%s, cache=%i\n", __func__, size, gfp, + policy_name, !!test_cache); + + /* + * 100x the sample interval should be more than enough to ensure we get + * a KFENCE allocation eventually. + */ + timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL); + /* + * Especially for non-preemption kernels, ensure the allocation-gate + * timer can catch up: after @resched_after, every failed allocation + * attempt yields, to ensure the allocation-gate timer is scheduled. + */ + resched_after = jiffies + msecs_to_jiffies(CONFIG_KFENCE_SAMPLE_INTERVAL); + do { + if (test_cache) + alloc = kmem_cache_alloc(test_cache, gfp); + else + alloc = kmalloc(size, gfp); + + if (is_kfence_address(alloc)) { + struct page *page = virt_to_head_page(alloc); + struct kmem_cache *s = test_cache ?: kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)]; + + /* + * Verify that various helpers return the right values + * even for KFENCE objects; these are required so that + * memcg accounting works correctly. + */ + KUNIT_EXPECT_EQ(test, obj_to_index(s, page, alloc), 0U); + KUNIT_EXPECT_EQ(test, objs_per_slab_page(s, page), 1); + + if (policy == ALLOCATE_ANY) + return alloc; + if (policy == ALLOCATE_LEFT && IS_ALIGNED((unsigned long)alloc, PAGE_SIZE)) + return alloc; + if (policy == ALLOCATE_RIGHT && + !IS_ALIGNED((unsigned long)alloc, PAGE_SIZE)) + return alloc; + } else if (policy == ALLOCATE_NONE) + return alloc; + + test_free(alloc); + + if (time_after(jiffies, resched_after)) + cond_resched(); + } while (time_before(jiffies, timeout)); + + KUNIT_ASSERT_TRUE_MSG(test, false, "failed to allocate from KFENCE"); + return NULL; /* Unreachable. */ +} + +static void test_out_of_bounds_read(struct kunit *test) +{ + size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_OOB, + .fn = test_out_of_bounds_read, + .is_write = false, + }; + char *buf; + + setup_test_cache(test, size, 0, NULL); + + /* + * If we don't have our own cache, adjust based on alignment, so that we + * actually access guard pages on either side. + */ + if (!test_cache) + size = kmalloc_cache_alignment(size); + + /* Test both sides. 
*/ + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT); + expect.addr = buf - 1; + READ_ONCE(*expect.addr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + test_free(buf); + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); + expect.addr = buf + size; + READ_ONCE(*expect.addr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + test_free(buf); +} + +static void test_out_of_bounds_write(struct kunit *test) +{ + size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_OOB, + .fn = test_out_of_bounds_write, + .is_write = true, + }; + char *buf; + + setup_test_cache(test, size, 0, NULL); + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT); + expect.addr = buf - 1; + WRITE_ONCE(*expect.addr, 42); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + test_free(buf); +} + +static void test_use_after_free_read(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_UAF, + .fn = test_use_after_free_read, + .is_write = false, + }; + + setup_test_cache(test, size, 0, NULL); + expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + test_free(expect.addr); + READ_ONCE(*expect.addr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static void test_double_free(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_INVALID_FREE, + .fn = test_double_free, + }; + + setup_test_cache(test, size, 0, NULL); + expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + test_free(expect.addr); + test_free(expect.addr); /* Double-free. */ + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static void test_invalid_addr_free(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_INVALID_FREE, + .fn = test_invalid_addr_free, + }; + char *buf; + + setup_test_cache(test, size, 0, NULL); + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + expect.addr = buf + 1; /* Free on invalid address. */ + test_free(expect.addr); /* Invalid address free. */ + test_free(buf); /* No error. */ + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static void test_corruption(struct kunit *test) +{ + size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_CORRUPTION, + .fn = test_corruption, + }; + char *buf; + + setup_test_cache(test, size, 0, NULL); + + /* Test both sides. */ + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT); + expect.addr = buf + size; + WRITE_ONCE(*expect.addr, 42); + test_free(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); + expect.addr = buf - 1; + WRITE_ONCE(*expect.addr, 42); + test_free(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * KFENCE is unable to detect an OOB if the allocation's alignment requirements + * leave a gap between the object and the guard page. Specifically, an + * allocation of e.g. 73 bytes is aligned on 8 and 128 bytes for SLUB or SLAB + * respectively. Therefore it is impossible for the allocated object to + * contiguously line up with the right guard page. + * + * However, we test that an access to memory beyond the gap results in KFENCE + * detecting an OOB access. 
+ */ +static void test_kmalloc_aligned_oob_read(struct kunit *test) +{ + const size_t size = 73; + const size_t align = kmalloc_cache_alignment(size); + struct expect_report expect = { + .type = KFENCE_ERROR_OOB, + .fn = test_kmalloc_aligned_oob_read, + .is_write = false, + }; + char *buf; + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); + + /* + * The object is offset to the right, so there won't be an OOB to the + * left of it. + */ + READ_ONCE(*(buf - 1)); + KUNIT_EXPECT_FALSE(test, report_available()); + + /* + * @buf must be aligned on @align, therefore buf + size belongs to the + * same page -> no OOB. + */ + READ_ONCE(*(buf + size)); + KUNIT_EXPECT_FALSE(test, report_available()); + + /* Overflowing by @align bytes will result in an OOB. */ + expect.addr = buf + size + align; + READ_ONCE(*expect.addr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + + test_free(buf); +} + +static void test_kmalloc_aligned_oob_write(struct kunit *test) +{ + const size_t size = 73; + struct expect_report expect = { + .type = KFENCE_ERROR_CORRUPTION, + .fn = test_kmalloc_aligned_oob_write, + }; + char *buf; + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); + /* + * The object is offset to the right, so we won't get a page + * fault immediately after it. + */ + expect.addr = buf + size; + WRITE_ONCE(*expect.addr, READ_ONCE(*expect.addr) + 1); + KUNIT_EXPECT_FALSE(test, report_available()); + test_free(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test cache shrinking and destroying with KFENCE. */ +static void test_shrink_memcache(struct kunit *test) +{ + const size_t size = 32; + void *buf; + + setup_test_cache(test, size, 0, NULL); + KUNIT_EXPECT_TRUE(test, test_cache); + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + kmem_cache_shrink(test_cache); + test_free(buf); + + KUNIT_EXPECT_FALSE(test, report_available()); +} + +static void ctor_set_x(void *obj) +{ + /* Every object has at least 8 bytes. */ + memset(obj, 'x', 8); +} + +/* Ensure that SL*B does not modify KFENCE objects on bulk free. */ +static void test_free_bulk(struct kunit *test) +{ + int iter; + + for (iter = 0; iter < 5; iter++) { + const size_t size = setup_test_cache(test, 8 + prandom_u32_max(300), 0, + (iter & 1) ? ctor_set_x : NULL); + void *objects[] = { + test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT), + test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE), + test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT), + test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE), + test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE), + }; + + kmem_cache_free_bulk(test_cache, ARRAY_SIZE(objects), objects); + KUNIT_ASSERT_FALSE(test, report_available()); + test_cache_destroy(); + } +} + +/* Test init-on-free works. */ +static void test_init_on_free(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_UAF, + .fn = test_init_on_free, + .is_write = false, + }; + int i; + + if (!IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON)) + return; + /* Assume it hasn't been disabled on command line. */ + + setup_test_cache(test, size, 0, NULL); + expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + for (i = 0; i < size; i++) + expect.addr[i] = i + 1; + test_free(expect.addr); + + for (i = 0; i < size; i++) { + /* + * This may fail if the page was recycled by KFENCE and then + * written to again -- this however, is near impossible with a + * default config. 
+ */ + KUNIT_EXPECT_EQ(test, expect.addr[i], (char)0); + + if (!i) /* Only check first access to not fail test if page is ever re-protected. */ + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + } +} + +/* Ensure that constructors work properly. */ +static void test_memcache_ctor(struct kunit *test) +{ + const size_t size = 32; + char *buf; + int i; + + setup_test_cache(test, size, 0, ctor_set_x); + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + + for (i = 0; i < 8; i++) + KUNIT_EXPECT_EQ(test, buf[i], (char)'x'); + + test_free(buf); + + KUNIT_EXPECT_FALSE(test, report_available()); +} + +/* Test that memory is zeroed if requested. */ +static void test_gfpzero(struct kunit *test) +{ + const size_t size = PAGE_SIZE; /* PAGE_SIZE so we can use ALLOCATE_ANY. */ + char *buf1, *buf2; + int i; + + if (CONFIG_KFENCE_SAMPLE_INTERVAL > 100) { + kunit_warn(test, "skipping ... would take too long\n"); + return; + } + + setup_test_cache(test, size, 0, NULL); + buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + for (i = 0; i < size; i++) + buf1[i] = i + 1; + test_free(buf1); + + /* Try to get same address again -- this can take a while. */ + for (i = 0;; i++) { + buf2 = test_alloc(test, size, GFP_KERNEL | __GFP_ZERO, ALLOCATE_ANY); + if (buf1 == buf2) + break; + test_free(buf2); + + if (i == CONFIG_KFENCE_NUM_OBJECTS) { + kunit_warn(test, "giving up ... cannot get same object back\n"); + return; + } + } + + for (i = 0; i < size; i++) + KUNIT_EXPECT_EQ(test, buf2[i], (char)0); + + test_free(buf2); + + KUNIT_EXPECT_FALSE(test, report_available()); +} + +static void test_invalid_access(struct kunit *test) +{ + const struct expect_report expect = { + .type = KFENCE_ERROR_INVALID, + .fn = test_invalid_access, + .addr = &__kfence_pool[10], + .is_write = false, + }; + + READ_ONCE(__kfence_pool[10]); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test SLAB_TYPESAFE_BY_RCU works. */ +static void test_memcache_typesafe_by_rcu(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_UAF, + .fn = test_memcache_typesafe_by_rcu, + .is_write = false, + }; + + setup_test_cache(test, size, SLAB_TYPESAFE_BY_RCU, NULL); + KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */ + + expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + *expect.addr = 42; + + rcu_read_lock(); + test_free(expect.addr); + KUNIT_EXPECT_EQ(test, *expect.addr, (char)42); + /* + * Up to this point, memory should not have been freed yet, and + * therefore there should be no KFENCE report from the above access. + */ + rcu_read_unlock(); + + /* Above access to @expect.addr should not have generated a report! */ + KUNIT_EXPECT_FALSE(test, report_available()); + + /* Only after rcu_barrier() is the memory guaranteed to be freed. */ + rcu_barrier(); + + /* Expect use-after-free. */ + KUNIT_EXPECT_EQ(test, *expect.addr, (char)42); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test krealloc(). */ +static void test_krealloc(struct kunit *test) +{ + const size_t size = 32; + const struct expect_report expect = { + .type = KFENCE_ERROR_UAF, + .fn = test_krealloc, + .addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY), + .is_write = false, + }; + char *buf = expect.addr; + int i; + + KUNIT_EXPECT_FALSE(test, test_cache); + KUNIT_EXPECT_EQ(test, ksize(buf), size); /* Precise size match after KFENCE alloc. */ + for (i = 0; i < size; i++) + buf[i] = i + 1; + + /* Check that we successfully change the size. 
*/ + buf = krealloc(buf, size * 3, GFP_KERNEL); /* Grow. */ + /* Note: Might no longer be a KFENCE alloc. */ + KUNIT_EXPECT_GE(test, ksize(buf), size * 3); + for (i = 0; i < size; i++) + KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1)); + for (; i < size * 3; i++) /* Fill to extra bytes. */ + buf[i] = i + 1; + + buf = krealloc(buf, size * 2, GFP_KERNEL); /* Shrink. */ + KUNIT_EXPECT_GE(test, ksize(buf), size * 2); + for (i = 0; i < size * 2; i++) + KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1)); + + buf = krealloc(buf, 0, GFP_KERNEL); /* Free. */ + KUNIT_EXPECT_EQ(test, (unsigned long)buf, (unsigned long)ZERO_SIZE_PTR); + KUNIT_ASSERT_FALSE(test, report_available()); /* No reports yet! */ + + READ_ONCE(*expect.addr); /* Ensure krealloc() actually freed earlier KFENCE object. */ + KUNIT_ASSERT_TRUE(test, report_matches(&expect)); +} + +/* Test that some objects from a bulk allocation belong to KFENCE pool. */ +static void test_memcache_alloc_bulk(struct kunit *test) +{ + const size_t size = 32; + bool pass = false; + unsigned long timeout; + + setup_test_cache(test, size, 0, NULL); + KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */ + /* + * 100x the sample interval should be more than enough to ensure we get + * a KFENCE allocation eventually. + */ + timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL); + do { + void *objects[100]; + int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects), + objects); + if (!num) + continue; + for (i = 0; i < ARRAY_SIZE(objects); i++) { + if (is_kfence_address(objects[i])) { + pass = true; + break; + } + } + kmem_cache_free_bulk(test_cache, num, objects); + /* + * kmem_cache_alloc_bulk() disables interrupts, and calling it + * in a tight loop may not give KFENCE a chance to switch the + * static branch. Call cond_resched() to let KFENCE chime in. + */ + cond_resched(); + } while (!pass && time_before(jiffies, timeout)); + + KUNIT_EXPECT_TRUE(test, pass); + KUNIT_EXPECT_FALSE(test, report_available()); +} + +/* + * KUnit does not provide a way to provide arguments to tests, and we encode + * additional info in the name. Set up 2 tests per test case, one using the + * default allocator, and another using a custom memcache (suffix '-memcache'). + */ +#define KFENCE_KUNIT_CASE(test_name) \ + { .run_case = test_name, .name = #test_name }, \ + { .run_case = test_name, .name = #test_name "-memcache" } + +static struct kunit_case kfence_test_cases[] = { + KFENCE_KUNIT_CASE(test_out_of_bounds_read), + KFENCE_KUNIT_CASE(test_out_of_bounds_write), + KFENCE_KUNIT_CASE(test_use_after_free_read), + KFENCE_KUNIT_CASE(test_double_free), + KFENCE_KUNIT_CASE(test_invalid_addr_free), + KFENCE_KUNIT_CASE(test_corruption), + KFENCE_KUNIT_CASE(test_free_bulk), + KFENCE_KUNIT_CASE(test_init_on_free), + KUNIT_CASE(test_kmalloc_aligned_oob_read), + KUNIT_CASE(test_kmalloc_aligned_oob_write), + KUNIT_CASE(test_shrink_memcache), + KUNIT_CASE(test_memcache_ctor), + KUNIT_CASE(test_invalid_access), + KUNIT_CASE(test_gfpzero), + KUNIT_CASE(test_memcache_typesafe_by_rcu), + KUNIT_CASE(test_krealloc), + KUNIT_CASE(test_memcache_alloc_bulk), + {}, +}; + +/* ===== End test cases ===== */ + +static int test_init(struct kunit *test) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&observed.lock, flags); + for (i = 0; i < ARRAY_SIZE(observed.lines); i++) + observed.lines[i][0] = '\0'; + observed.nlines = 0; + spin_unlock_irqrestore(&observed.lock, flags); + + /* Any test with 'memcache' in its name will want a memcache. 
*/ + if (strstr(test->name, "memcache")) + test->priv = TEST_PRIV_WANT_MEMCACHE; + else + test->priv = NULL; + + return 0; +} + +static void test_exit(struct kunit *test) +{ + test_cache_destroy(); +} + +static struct kunit_suite kfence_test_suite = { + .name = "kfence", + .test_cases = kfence_test_cases, + .init = test_init, + .exit = test_exit, +}; +static struct kunit_suite *kfence_test_suites[] = { &kfence_test_suite, NULL }; + +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +/* + * We only want to do tracepoints setup and teardown once, therefore we have to + * customize the init and exit functions and cannot rely on kunit_test_suite(). + */ +static int __init kfence_test_init(void) +{ + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. + */ + for_each_kernel_tracepoint(register_tracepoints, NULL); + return __kunit_test_suites_init(kfence_test_suites); +} + +static void kfence_test_exit(void) +{ + __kunit_test_suites_exit(kfence_test_suites); + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +late_initcall(kfence_test_init); +module_exit(kfence_test_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Alexander Potapenko , Marco Elver "); diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 4dbfa9a382e4..901bd7ee83d8 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -156,7 +156,12 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show, pr_cont(" ]"); } -void kfence_report_error(unsigned long address, struct pt_regs *regs, +static const char *get_access_type(bool is_write) +{ + return is_write ? "write" : "read"; +} + +void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, const struct kfence_metadata *meta, enum kfence_error_type type) { unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 }; @@ -194,17 +199,19 @@ void kfence_report_error(unsigned long address, struct pt_regs *regs, case KFENCE_ERROR_OOB: { const bool left_of_object = address < meta->addr; - pr_err("BUG: KFENCE: out-of-bounds in %pS\n\n", (void *)stack_entries[skipnr]); - pr_err("Out-of-bounds access at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n", - (void *)address, + pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write), + (void *)stack_entries[skipnr]); + pr_err("Out-of-bounds %s at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n", + get_access_type(is_write), (void *)address, left_of_object ? meta->addr - address : address - meta->addr, left_of_object ? 
"left" : "right", object_index); break; } case KFENCE_ERROR_UAF: - pr_err("BUG: KFENCE: use-after-free in %pS\n\n", (void *)stack_entries[skipnr]); - pr_err("Use-after-free access at 0x" PTR_FMT " (in kfence-#%zd):\n", - (void *)address, object_index); + pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write), + (void *)stack_entries[skipnr]); + pr_err("Use-after-free %s at 0x" PTR_FMT " (in kfence-#%zd):\n", + get_access_type(is_write), (void *)address, object_index); break; case KFENCE_ERROR_CORRUPTION: pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]); @@ -213,8 +220,10 @@ void kfence_report_error(unsigned long address, struct pt_regs *regs, pr_cont(" (in kfence-#%zd):\n", object_index); break; case KFENCE_ERROR_INVALID: - pr_err("BUG: KFENCE: invalid access in %pS\n\n", (void *)stack_entries[skipnr]); - pr_err("Invalid access at 0x" PTR_FMT ":\n", (void *)address); + pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write), + (void *)stack_entries[skipnr]); + pr_err("Invalid %s at 0x" PTR_FMT ":\n", get_access_type(is_write), + (void *)address); break; case KFENCE_ERROR_INVALID_FREE: pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]); From 0825c1d57f02e3fb228bbecad827956d4c796d3a Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 25 Feb 2021 17:19:35 -0800 Subject: [PATCH 065/118] MAINTAINERS: add entry for KFENCE Add entry for KFENCE maintainers. Link: https://lkml.kernel.org/r/20201103175841.3495947-10-elver@google.com Signed-off-by: Alexander Potapenko Signed-off-by: Marco Elver Reviewed-by: Dmitry Vyukov Reviewed-by: SeongJae Park Co-developed-by: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christopher Lameter Cc: Dave Hansen Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Joern Engel Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Paul E. McKenney Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index c71664ca8bfd..40040db747fc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9867,6 +9867,18 @@ F: include/linux/keyctl.h F: include/uapi/linux/keyctl.h F: security/keys/ +KFENCE +M: Alexander Potapenko +M: Marco Elver +R: Dmitry Vyukov +L: kasan-dev@googlegroups.com +S: Maintained +F: Documentation/dev-tools/kfence.rst +F: arch/*/include/asm/kfence.h +F: include/linux/kfence.h +F: lib/Kconfig.kfence +F: mm/kfence/ + KFIFO M: Stefani Seibold S: Maintained From 35beccf0926d42ee0d56e41979ec8cdf814c4769 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 25 Feb 2021 17:19:40 -0800 Subject: [PATCH 066/118] kfence: report sensitive information based on no_hash_pointers We cannot rely on CONFIG_DEBUG_KERNEL to decide if we're running a "debug kernel" where we can safely show potentially sensitive information in the kernel log. Instead, simply rely on the newly introduced "no_hash_pointers" to print unhashed kernel pointers, as well as decide if our reports can include other potentially sensitive information such as registers and corrupted bytes. 
Link: https://lkml.kernel.org/r/20210223082043.1972742-1-elver@google.com Signed-off-by: Marco Elver Cc: Timur Tabi Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kfence.rst | 8 ++++---- mm/kfence/core.c | 10 +++------- mm/kfence/kfence.h | 7 ------- mm/kfence/kfence_test.c | 2 +- mm/kfence/report.c | 18 ++++++++++-------- 5 files changed, 18 insertions(+), 27 deletions(-) diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst index 58a0a5fa1ddc..fdf04e741ea5 100644 --- a/Documentation/dev-tools/kfence.rst +++ b/Documentation/dev-tools/kfence.rst @@ -88,8 +88,8 @@ A typical out-of-bounds access looks like this:: The header of the report provides a short summary of the function involved in the access. It is followed by more detailed information about the access and -its origin. Note that, real kernel addresses are only shown for -``CONFIG_DEBUG_KERNEL=y`` builds. +its origin. Note that, real kernel addresses are only shown when using the +kernel command line option ``no_hash_pointers``. Use-after-free accesses are reported as:: @@ -184,8 +184,8 @@ invalidly written bytes (offset from the address) are shown; in this representation, '.' denote untouched bytes. In the example above ``0xac`` is the value written to the invalid address at offset 0, and the remaining '.' denote that no following bytes have been touched. Note that, real values are -only shown for ``CONFIG_DEBUG_KERNEL=y`` builds; to avoid information -disclosure for non-debug builds, '!' is used instead to denote invalidly +only shown if the kernel was booted with ``no_hash_pointers``; to avoid +information disclosure otherwise, '!' is used instead to denote invalidly written bytes. And finally, KFENCE may also report on invalid accesses to any protected page diff --git a/mm/kfence/core.c b/mm/kfence/core.c index cfe3d32ac5b7..3b8ec938470a 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -646,13 +646,9 @@ void __init kfence_init(void) WRITE_ONCE(kfence_enabled, true); schedule_delayed_work(&kfence_timer, 0); - pr_info("initialized - using %lu bytes for %d objects", KFENCE_POOL_SIZE, - CONFIG_KFENCE_NUM_OBJECTS); - if (IS_ENABLED(CONFIG_DEBUG_KERNEL)) - pr_cont(" at 0x%px-0x%px\n", (void *)__kfence_pool, - (void *)(__kfence_pool + KFENCE_POOL_SIZE)); - else - pr_cont("\n"); + pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, + CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, + (void *)(__kfence_pool + KFENCE_POOL_SIZE)); } void kfence_shutdown_cache(struct kmem_cache *s) diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 1accc840dbbe..24065321ff8a 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -16,13 +16,6 @@ #include "../slab.h" /* for struct kmem_cache */ -/* For non-debug builds, avoid leaking kernel pointers into dmesg. */ -#ifdef CONFIG_DEBUG_KERNEL -#define PTR_FMT "%px" -#else -#define PTR_FMT "%p" -#endif - /* * Get the canary byte pattern for @addr. 
Use a pattern that varies based on the * lower 3 bits of the address, to detect memory corruptions with higher diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index db1bb596acaf..4acf4251ee04 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -146,7 +146,7 @@ static bool report_matches(const struct expect_report *r) break; } - cur += scnprintf(cur, end - cur, " 0x" PTR_FMT, (void *)r->addr); + cur += scnprintf(cur, end - cur, " 0x%p", (void *)r->addr); spin_lock_irqsave(&observed.lock, flags); if (!report_available()) diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 901bd7ee83d8..4a424de44e2d 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -19,6 +19,8 @@ #include "kfence.h" +extern bool no_hash_pointers; + /* Helper function to either print to a seq_file or to console. */ __printf(2, 3) static void seq_con_printf(struct seq_file *seq, const char *fmt, ...) @@ -118,7 +120,7 @@ void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *met } seq_con_printf(seq, - "kfence-#%zd [0x" PTR_FMT "-0x" PTR_FMT + "kfence-#%zd [0x%p-0x%p" ", size=%d, cache=%s] allocated by task %d:\n", meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size, (cache && cache->name) ? cache->name : "", meta->alloc_track.pid); @@ -148,7 +150,7 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show, for (cur = (const u8 *)address; cur < end; cur++) { if (*cur == KFENCE_CANARY_PATTERN(cur)) pr_cont(" ."); - else if (IS_ENABLED(CONFIG_DEBUG_KERNEL)) + else if (no_hash_pointers) pr_cont(" 0x%02x", *cur); else /* Do not leak kernel memory in non-debug builds. */ pr_cont(" !"); @@ -201,7 +203,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write), (void *)stack_entries[skipnr]); - pr_err("Out-of-bounds %s at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n", + pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%zd):\n", get_access_type(is_write), (void *)address, left_of_object ? meta->addr - address : address - meta->addr, left_of_object ? 
"left" : "right", object_index); @@ -210,24 +212,24 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r case KFENCE_ERROR_UAF: pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write), (void *)stack_entries[skipnr]); - pr_err("Use-after-free %s at 0x" PTR_FMT " (in kfence-#%zd):\n", + pr_err("Use-after-free %s at 0x%p (in kfence-#%zd):\n", get_access_type(is_write), (void *)address, object_index); break; case KFENCE_ERROR_CORRUPTION: pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]); - pr_err("Corrupted memory at 0x" PTR_FMT " ", (void *)address); + pr_err("Corrupted memory at 0x%p ", (void *)address); print_diff_canary(address, 16, meta); pr_cont(" (in kfence-#%zd):\n", object_index); break; case KFENCE_ERROR_INVALID: pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write), (void *)stack_entries[skipnr]); - pr_err("Invalid %s at 0x" PTR_FMT ":\n", get_access_type(is_write), + pr_err("Invalid %s at 0x%p:\n", get_access_type(is_write), (void *)address); break; case KFENCE_ERROR_INVALID_FREE: pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]); - pr_err("Invalid free of 0x" PTR_FMT " (in kfence-#%zd):\n", (void *)address, + pr_err("Invalid free of 0x%p (in kfence-#%zd):\n", (void *)address, object_index); break; } @@ -242,7 +244,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r /* Print report footer. */ pr_err("\n"); - if (IS_ENABLED(CONFIG_DEBUG_KERNEL) && regs) + if (no_hash_pointers && regs) show_regs(regs); else dump_stack_print_info(KERN_ERR); From 9c0dee54eb91d48cca048bd7bd2c1f4a166e0252 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:19:44 -0800 Subject: [PATCH 067/118] tracing: add error_report_end trace point Patch series "Add error_report_end tracepoint to KFENCE and KASAN", v3. This patchset adds a tracepoint, error_repor_end, that is to be used by KFENCE, KASAN, and potentially other bug detection tools, when they print an error report. One of the possible use cases is userspace collection of kernel error reports: interested parties can subscribe to the tracing event via tracefs, and get notified when an error report occurs. This patch (of 3): Introduce error_report_end tracepoint. It can be used in debugging tools like KASAN, KFENCE, etc. to provide extensions to the error reporting mechanisms (e.g. allow tests hook into error reporting, ease error report collection from production kernels). Another benefit would be making use of ftrace for debugging or benchmarking the tools themselves. Should we need it, the tracepoint name leaves us with the possibility to introduce a complementary error_report_start tracepoint in the future. 
Link: https://lkml.kernel.org/r/20210121131915.1331302-1-glider@google.com Link: https://lkml.kernel.org/r/20210121131915.1331302-2-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Marco Elver Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Petr Mladek Cc: Steven Rostedt Cc: Sergey Senozhatsky Cc: Greg Kroah-Hartman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/error_report.h | 74 +++++++++++++++++++++++++++++ kernel/trace/Makefile | 1 + kernel/trace/error_report-traces.c | 11 +++++ 3 files changed, 86 insertions(+) create mode 100644 include/trace/events/error_report.h create mode 100644 kernel/trace/error_report-traces.c diff --git a/include/trace/events/error_report.h b/include/trace/events/error_report.h new file mode 100644 index 000000000000..96f64bf218b2 --- /dev/null +++ b/include/trace/events/error_report.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Declarations for error reporting tracepoints. + * + * Copyright (C) 2021, Google LLC. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM error_report + +#if !defined(_TRACE_ERROR_REPORT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ERROR_REPORT_H + +#include + +#ifndef __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY +#define __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY + +enum error_detector { + ERROR_DETECTOR_KFENCE, + ERROR_DETECTOR_KASAN +}; + +#endif /* __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY */ + +#define error_detector_list \ + EM(ERROR_DETECTOR_KFENCE, "kfence") \ + EMe(ERROR_DETECTOR_KASAN, "kasan") +/* Always end the list with an EMe. */ + +#undef EM +#undef EMe + +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +error_detector_list + +#undef EM +#undef EMe + +#define EM(a, b) { a, b }, +#define EMe(a, b) { a, b } + +#define show_error_detector_list(val) \ + __print_symbolic(val, error_detector_list) + +DECLARE_EVENT_CLASS(error_report_template, + TP_PROTO(enum error_detector error_detector, unsigned long id), + TP_ARGS(error_detector, id), + TP_STRUCT__entry(__field(enum error_detector, error_detector) + __field(unsigned long, id)), + TP_fast_assign(__entry->error_detector = error_detector; + __entry->id = id;), + TP_printk("[%s] %lx", + show_error_detector_list(__entry->error_detector), + __entry->id)); + +/** + * error_report_end - called after printing the error report + * @error_detector: short string describing the error detection tool + * @id: pseudo-unique descriptor identifying the report + * (e.g. the memory access address) + * + * This event occurs right after a debugging tool finishes printing the error + * report. 
+ */ +DEFINE_EVENT(error_report_template, error_report_end, + TP_PROTO(enum error_detector error_detector, unsigned long id), + TP_ARGS(error_detector, id)); + +#endif /* _TRACE_ERROR_REPORT_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 7e44cea89fdc..b28d3e5013cd 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o +obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM),y) obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o diff --git a/kernel/trace/error_report-traces.c b/kernel/trace/error_report-traces.c new file mode 100644 index 000000000000..f89792c25b11 --- /dev/null +++ b/kernel/trace/error_report-traces.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Error reporting trace points. + * + * Copyright (C) 2021, Google LLC. + */ + +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(error_report_end); From f2b84d2e40eb1a17f72dc4a1da463ec8de649f19 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:19:47 -0800 Subject: [PATCH 068/118] kfence: use error_report_end tracepoint Make it possible to trace KFENCE error reporting. A good usecase is watching for trace events from the userspace to detect and process memory corruption reports from the kernel. Link: https://lkml.kernel.org/r/20210121131915.1331302-3-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Marco Elver Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Petr Mladek Cc: Steven Rostedt Cc: Sergey Senozhatsky Cc: Vlastimil Babka Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kfence/report.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 4a424de44e2d..ab83d5a59bb1 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -248,6 +249,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r show_regs(regs); else dump_stack_print_info(KERN_ERR); + trace_error_report_end(ERROR_DETECTOR_KFENCE, address); pr_err("==================================================================\n"); lockdep_on(); From d3a61f745e0d089a2484740283a434deb6dd4eb5 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:19:51 -0800 Subject: [PATCH 069/118] kasan: use error_report_end tracepoint Make it possible to trace KASAN error reporting. A good usecase is watching for trace events from the userspace to detect and process memory corruption reports from the kernel. 
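As a rough sketch of that use case (a hypothetical userspace consumer; the
paths assume the usual tracefs layout under /sys/kernel/debug/tracing, and
error handling is minimal):

#include <stdio.h>
#include <string.h>

int main(void)
{
        FILE *f;
        char line[512];

        /* Enable the error_report_end event (requires root). */
        f = fopen("/sys/kernel/debug/tracing/events/error_report/error_report_end/enable", "w");
        if (!f)
                return 1;
        fputs("1", f);
        fclose(f);

        /* Block on the trace pipe and forward matching events. */
        f = fopen("/sys/kernel/debug/tracing/trace_pipe", "r");
        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                if (strstr(line, "error_report_end"))
                        fputs(line, stdout);    /* e.g. hand off to a report collector */
        fclose(f);
        return 0;
}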
Link: https://lkml.kernel.org/r/20210121131915.1331302-4-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Marco Elver Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Petr Mladek Cc: Steven Rostedt Cc: Sergey Senozhatsky Cc: Greg Kroah-Hartman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 234f35a84f19..87b271206163 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -84,8 +85,9 @@ static void start_report(unsigned long *flags) pr_err("==================================================================\n"); } -static void end_report(unsigned long *flags) +static void end_report(unsigned long *flags, unsigned long addr) { + trace_error_report_end(ERROR_DETECTOR_KASAN, addr); pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); @@ -355,7 +357,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip) print_address_description(object, tag); pr_err("\n"); print_memory_metadata(object); - end_report(&flags); + end_report(&flags, (unsigned long)object); } static void __kasan_report(unsigned long addr, size_t size, bool is_write, @@ -401,7 +403,7 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, dump_stack(); } - end_report(&flags); + end_report(&flags, addr); } bool kasan_report(unsigned long addr, size_t size, bool is_write, From 928501344fc645f80390afc12708c81b3595745d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:19:55 -0800 Subject: [PATCH 070/118] kasan, mm: don't save alloc stacks twice Patch series "kasan: optimizations and fixes for HW_TAGS", v4. This patchset makes the HW_TAGS mode more efficient, mostly by reworking poisoning approaches and simplifying/inlining some internal helpers. With this change, the overhead of HW_TAGS annotations excluding setting and checking memory tags is ~3%. The performance impact caused by tags will be unknown until we have hardware that supports MTE. As a side-effect, this patchset speeds up generic KASAN by ~15%. This patch (of 13): Currently KASAN saves allocation stacks in both kasan_slab_alloc() and kasan_kmalloc() annotations. This patch changes KASAN to save allocation stacks for slab objects from kmalloc caches in kasan_kmalloc() only, and stacks for other slab objects in kasan_slab_alloc() only. This change requires ____kasan_kmalloc() knowing whether the object belongs to a kmalloc cache. This is implemented by adding a flag field to the kasan_info structure. That flag is only set for kmalloc caches via a new kasan_cache_create_kmalloc() annotation. 
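The resulting policy can be summarised as follows (a sketch of the gate added
to set_alloc_info() in the diff below):

/*
 * - kmalloc caches:     allocation stack saved only by kasan_kmalloc()
 * - other slab caches:  allocation stack saved only by kasan_slab_alloc()
 */
        if (cache->kasan_info.is_kmalloc && !is_kmalloc)
                return; /* kasan_slab_alloc() path for a kmalloc cache: skip */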
Link: https://lkml.kernel.org/r/cover.1612546384.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/7c673ebca8d00f40a7ad6f04ab9a2bddeeae2097.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Catalin Marinas Cc: Vincenzo Frascino Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Will Deacon Cc: Andrey Ryabinin Cc: Peter Collingbourne Cc: Evgenii Stepanov Cc: Branislav Rankov Cc: Kevin Brodsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 9 +++++++++ mm/kasan/common.c | 18 ++++++++++++++---- mm/slab_common.c | 1 + 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 7eaf2d9effb4..3fb31e8a353e 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -83,6 +83,7 @@ static inline void kasan_disable_current(void) {} struct kasan_cache { int alloc_meta_offset; int free_meta_offset; + bool is_kmalloc; }; #ifdef CONFIG_KASAN_HW_TAGS @@ -143,6 +144,13 @@ static __always_inline void kasan_cache_create(struct kmem_cache *cache, __kasan_cache_create(cache, size, flags); } +void __kasan_cache_create_kmalloc(struct kmem_cache *cache); +static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) +{ + if (kasan_enabled()) + __kasan_cache_create_kmalloc(cache); +} + size_t __kasan_metadata_size(struct kmem_cache *cache); static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache) { @@ -278,6 +286,7 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {} static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) {} +static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } static inline void kasan_poison_slab(struct page *page) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, diff --git a/mm/kasan/common.c b/mm/kasan/common.c index af1768c4fee5..d8d83ca56fe2 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -210,6 +210,11 @@ void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, *size = optimal_size; } +void __kasan_cache_create_kmalloc(struct kmem_cache *cache) +{ + cache->kasan_info.is_kmalloc = true; +} + size_t __kasan_metadata_size(struct kmem_cache *cache) { if (!kasan_stack_collection_enabled()) @@ -394,17 +399,22 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) } } -static void set_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) +static void set_alloc_info(struct kmem_cache *cache, void *object, + gfp_t flags, bool is_kmalloc) { struct kasan_alloc_meta *alloc_meta; + /* Don't save alloc info for kmalloc caches in kasan_slab_alloc(). 
*/ + if (cache->kasan_info.is_kmalloc && !is_kmalloc) + return; + alloc_meta = kasan_get_alloc_meta(cache, object); if (alloc_meta) kasan_set_track(&alloc_meta->alloc_track, flags); } static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, - size_t size, gfp_t flags, bool keep_tag) + size_t size, gfp_t flags, bool is_kmalloc) { unsigned long redzone_start; unsigned long redzone_end; @@ -423,7 +433,7 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, KASAN_GRANULE_SIZE); redzone_end = round_up((unsigned long)object + cache->object_size, KASAN_GRANULE_SIZE); - tag = assign_tag(cache, object, false, keep_tag); + tag = assign_tag(cache, object, false, is_kmalloc); /* Tag is ignored in set_tag without CONFIG_KASAN_SW/HW_TAGS */ kasan_unpoison(set_tag(object, tag), size); @@ -431,7 +441,7 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, KASAN_KMALLOC_REDZONE); if (kasan_stack_collection_enabled()) - set_alloc_info(cache, (void *)object, flags); + set_alloc_info(cache, (void *)object, flags, is_kmalloc); return set_tag(object, tag); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 284954ef1da5..897c3a446b04 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -643,6 +643,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, panic("Out of memory when creating slab %s\n", name); create_boot_cache(s, name, size, flags, useroffset, usersize); + kasan_cache_create_kmalloc(s); list_add(&s->list, &slab_caches); s->refcount = 1; return s; From e2db1a9aa3814960a56583df39ea71e36d802278 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:19:59 -0800 Subject: [PATCH 071/118] kasan, mm: optimize kmalloc poisoning For allocations from kmalloc caches, kasan_kmalloc() always follows kasan_slab_alloc(). Currenly, both of them unpoison the whole object, which is unnecessary. This patch provides separate implementations for both annotations: kasan_slab_alloc() unpoisons the whole object, and kasan_kmalloc() only poisons the redzone. For generic KASAN, the redzone start might not be aligned to KASAN_GRANULE_SIZE. Therefore, the poisoning is split in two parts: kasan_poison_last_granule() poisons the unaligned part, and then kasan_poison() poisons the rest. This patch also clarifies alignment guarantees of each of the poisoning functions and drops the unnecessary round_up() call for redzone_end. With this change, the early SLUB cache annotation needs to be changed to kasan_slab_alloc(), as kasan_kmalloc() doesn't unpoison objects now. The number of poisoned bytes for objects in this cache stays the same, as kmem_cache_node->object_size is equal to sizeof(struct kmem_cache_node). 
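A worked example of the split, with illustrative numbers assuming
KASAN_GRANULE_SIZE == 8 (generic KASAN), a 32-byte cache object and a
kmalloc(17) request:

  #include <stdio.h>

  #define KASAN_GRANULE_SIZE 8UL
  #define round_up(x, y) ((((x) + (y) - 1) / (y)) * (y))

  int main(void)
  {
          unsigned long object = 0x1000, size = 17, object_size = 32;
          unsigned long redzone_start = round_up(object + size, KASAN_GRANULE_SIZE);
          unsigned long redzone_end = object + object_size;

          /* The granule covering offsets 16..23 is only partially accessible
           * (offset 16 is the last valid byte), so kasan_poison_last_granule()
           * writes the number of accessible leading bytes into its shadow. */
          printf("partial granule shadow value: %lu\n", size % KASAN_GRANULE_SIZE);
          /* The remaining, granule-aligned part of the redzone is poisoned
           * wholesale by kasan_poison(). */
          printf("aligned redzone: 0x%lx..0x%lx\n", redzone_start, redzone_end);
          return 0;
  }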
Link: https://lkml.kernel.org/r/7e3961cb52be380bc412860332063f5f7ce10d13.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 93 +++++++++++++++++++++++++++++++---------------- mm/kasan/kasan.h | 43 +++++++++++++++++++++- mm/kasan/shadow.c | 28 +++++++------- mm/slub.c | 3 +- 4 files changed, 119 insertions(+), 48 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index d8d83ca56fe2..218b23a5a597 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -278,21 +278,11 @@ void __kasan_poison_object_data(struct kmem_cache *cache, void *object) * based on objects indexes, so that objects that are next to each other * get different tags. */ -static u8 assign_tag(struct kmem_cache *cache, const void *object, - bool init, bool keep_tag) +static u8 assign_tag(struct kmem_cache *cache, const void *object, bool init) { if (IS_ENABLED(CONFIG_KASAN_GENERIC)) return 0xff; - /* - * 1. When an object is kmalloc()'ed, two hooks are called: - * kasan_slab_alloc() and kasan_kmalloc(). We assign the - * tag only in the first one. - * 2. We reuse the same tag for krealloc'ed objects. - */ - if (keep_tag) - return get_tag(object); - /* * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU * set, assign a tag when the object is being allocated (init == false). @@ -325,7 +315,7 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, } /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */ - object = set_tag(object, assign_tag(cache, object, true, false)); + object = set_tag(object, assign_tag(cache, object, true)); return (void *)object; } @@ -413,12 +403,46 @@ static void set_alloc_info(struct kmem_cache *cache, void *object, kasan_set_track(&alloc_meta->alloc_track, flags); } +void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, + void *object, gfp_t flags) +{ + u8 tag; + void *tagged_object; + + if (gfpflags_allow_blocking(flags)) + kasan_quarantine_reduce(); + + if (unlikely(object == NULL)) + return NULL; + + if (is_kfence_address(object)) + return (void *)object; + + /* + * Generate and assign random tag for tag-based modes. + * Tag is ignored in set_tag() for the generic mode. + */ + tag = assign_tag(cache, object, false); + tagged_object = set_tag(object, tag); + + /* + * Unpoison the whole object. + * For kmalloc() allocations, kasan_kmalloc() will do precise poisoning. + */ + kasan_unpoison(tagged_object, cache->object_size); + + /* Save alloc info (if possible) for non-kmalloc() allocations. */ + if (kasan_stack_collection_enabled()) + set_alloc_info(cache, (void *)object, flags, false); + + return tagged_object; +} + static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, - size_t size, gfp_t flags, bool is_kmalloc) + size_t size, gfp_t flags) { unsigned long redzone_start; unsigned long redzone_end; - u8 tag; if (gfpflags_allow_blocking(flags)) kasan_quarantine_reduce(); @@ -429,33 +453,41 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, if (is_kfence_address(kasan_reset_tag(object))) return (void *)object; + /* + * The object has already been unpoisoned by kasan_slab_alloc() for + * kmalloc() or by ksize() for krealloc(). 
+ */ + + /* + * The redzone has byte-level precision for the generic mode. + * Partially poison the last object granule to cover the unaligned + * part of the redzone. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + kasan_poison_last_granule((void *)object, size); + + /* Poison the aligned part of the redzone. */ redzone_start = round_up((unsigned long)(object + size), KASAN_GRANULE_SIZE); - redzone_end = round_up((unsigned long)object + cache->object_size, - KASAN_GRANULE_SIZE); - tag = assign_tag(cache, object, false, is_kmalloc); - - /* Tag is ignored in set_tag without CONFIG_KASAN_SW/HW_TAGS */ - kasan_unpoison(set_tag(object, tag), size); + redzone_end = (unsigned long)object + cache->object_size; kasan_poison((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); + /* + * Save alloc info (if possible) for kmalloc() allocations. + * This also rewrites the alloc info when called from kasan_krealloc(). + */ if (kasan_stack_collection_enabled()) - set_alloc_info(cache, (void *)object, flags, is_kmalloc); + set_alloc_info(cache, (void *)object, flags, true); - return set_tag(object, tag); -} - -void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, - void *object, gfp_t flags) -{ - return ____kasan_kmalloc(cache, object, cache->object_size, flags, false); + /* Keep the tag that was set by kasan_slab_alloc(). */ + return (void *)object; } void * __must_check __kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, gfp_t flags) { - return ____kasan_kmalloc(cache, object, size, flags, true); + return ____kasan_kmalloc(cache, object, size, flags); } EXPORT_SYMBOL(__kasan_kmalloc); @@ -496,8 +528,7 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag if (unlikely(!PageSlab(page))) return __kasan_kmalloc_large(object, size, flags); else - return ____kasan_kmalloc(page->slab_cache, object, size, - flags, true); + return ____kasan_kmalloc(page->slab_cache, object, size, flags); } void __kasan_kfree_large(void *ptr, unsigned long ip) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index fb883740fd27..222858e2e6af 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -367,12 +367,51 @@ static inline bool kasan_byte_accessible(const void *addr) #else /* CONFIG_KASAN_HW_TAGS */ -void kasan_poison(const void *address, size_t size, u8 value); -void kasan_unpoison(const void *address, size_t size); +/** + * kasan_poison - mark the memory range as unaccessible + * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE + * @size - range size + * @value - value that's written to metadata for the range + * + * The size gets aligned to KASAN_GRANULE_SIZE before marking the range. + */ +void kasan_poison(const void *addr, size_t size, u8 value); + +/** + * kasan_unpoison - mark the memory range as accessible + * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE + * @size - range size + * + * For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before + * marking the range. + * For the generic mode, the last granule of the memory range gets partially + * unpoisoned based on the @size. 
+ */ +void kasan_unpoison(const void *addr, size_t size); + bool kasan_byte_accessible(const void *addr); #endif /* CONFIG_KASAN_HW_TAGS */ +#ifdef CONFIG_KASAN_GENERIC + +/** + * kasan_poison_last_granule - mark the last granule of the memory range as + * unaccessible + * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE + * @size - range size + * + * This function is only available for the generic mode, as it's the only mode + * that has partially poisoned memory granules. + */ +void kasan_poison_last_granule(const void *address, size_t size); + +#else /* CONFIG_KASAN_GENERIC */ + +static inline void kasan_poison_last_granule(const void *address, size_t size) { } + +#endif /* CONFIG_KASAN_GENERIC */ + /* * Exported functions for interfaces called from assembly or from generated * code. Declarations here to avoid warning about missing declarations. diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 1372a2fc0ca9..1ed7817e4ee6 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -69,10 +69,6 @@ void *memcpy(void *dest, const void *src, size_t len) return __memcpy(dest, src, len); } -/* - * Poisons the shadow memory for 'size' bytes starting from 'addr'. - * Memory addresses should be aligned to KASAN_GRANULE_SIZE. - */ void kasan_poison(const void *address, size_t size, u8 value) { void *shadow_start, *shadow_end; @@ -83,12 +79,12 @@ void kasan_poison(const void *address, size_t size, u8 value) * addresses to this function. */ address = kasan_reset_tag(address); - size = round_up(size, KASAN_GRANULE_SIZE); /* Skip KFENCE memory if called explicitly outside of sl*b. */ if (is_kfence_address(address)) return; + size = round_up(size, KASAN_GRANULE_SIZE); shadow_start = kasan_mem_to_shadow(address); shadow_end = kasan_mem_to_shadow(address + size); @@ -96,6 +92,16 @@ void kasan_poison(const void *address, size_t size, u8 value) } EXPORT_SYMBOL(kasan_poison); +#ifdef CONFIG_KASAN_GENERIC +void kasan_poison_last_granule(const void *address, size_t size) +{ + if (size & KASAN_GRANULE_MASK) { + u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); + *shadow = size & KASAN_GRANULE_MASK; + } +} +#endif + void kasan_unpoison(const void *address, size_t size) { u8 tag = get_tag(address); @@ -115,16 +121,12 @@ void kasan_unpoison(const void *address, size_t size) if (is_kfence_address(address)) return; + /* Unpoison round_up(size, KASAN_GRANULE_SIZE) bytes. */ kasan_poison(address, size, tag); - if (size & KASAN_GRANULE_MASK) { - u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); - - if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) - *shadow = tag; - else /* CONFIG_KASAN_GENERIC */ - *shadow = size & KASAN_GRANULE_MASK; - } + /* Partially poison the last granule for the generic mode. 
*/ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + kasan_poison_last_granule(address, size); } #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/mm/slub.c b/mm/slub.c index 383616af28c4..e26c274b4657 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3579,8 +3579,7 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), - GFP_KERNEL); + n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL); page->freelist = get_freepointer(kmem_cache_node, n); page->inuse = 1; page->frozen = 0; From 43a219cbe5a46ec3f6a1874bb2cb2fd4de8322cc Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:03 -0800 Subject: [PATCH 072/118] kasan: optimize large kmalloc poisoning Similarly to kasan_kmalloc(), kasan_kmalloc_large() doesn't need to unpoison the object as it as already unpoisoned by alloc_pages() (or by ksize() for krealloc()). This patch changes kasan_kmalloc_large() to only poison the redzone. Link: https://lkml.kernel.org/r/33dee5aac0e550ad7f8e26f590c9b02c6129b4a3.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 218b23a5a597..dcdc92948364 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -494,7 +494,6 @@ EXPORT_SYMBOL(__kasan_kmalloc); void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { - struct page *page; unsigned long redzone_start; unsigned long redzone_end; @@ -504,12 +503,23 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, if (unlikely(ptr == NULL)) return NULL; - page = virt_to_page(ptr); + /* + * The object has already been unpoisoned by kasan_alloc_pages() for + * alloc_pages() or by ksize() for krealloc(). + */ + + /* + * The redzone has byte-level precision for the generic mode. + * Partially poison the last object granule to cover the unaligned + * part of the redzone. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + kasan_poison_last_granule(ptr, size); + + /* Poison the aligned part of the redzone. */ redzone_start = round_up((unsigned long)(ptr + size), KASAN_GRANULE_SIZE); - redzone_end = (unsigned long)ptr + page_size(page); - - kasan_unpoison(ptr, size); + redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr)); kasan_poison((void *)redzone_start, redzone_end - redzone_start, KASAN_PAGE_REDZONE); From df54b383124cf3e09f66644ee8a2eb977e8c7f26 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:07 -0800 Subject: [PATCH 073/118] kasan: clean up setting free info in kasan_slab_free Put kasan_stack_collection_enabled() check and kasan_set_free_info() calls next to each other. The way this was previously implemented was a minor optimization that relied of the the fact that kasan_stack_collection_enabled() is always true for generic KASAN. The confusion that this brings outweights saving a few instructions. 
Link: https://lkml.kernel.org/r/f838e249be5ab5810bf54a36ef5072cfd80e2da7.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index dcdc92948364..48d51daeda95 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -350,13 +350,11 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, kasan_poison(object, cache->object_size, KASAN_KMALLOC_FREE); - if (!kasan_stack_collection_enabled()) - return false; - if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine)) return false; - kasan_set_free_info(cache, object, tag); + if (kasan_stack_collection_enabled()) + kasan_set_free_info(cache, object, tag); return kasan_quarantine_put(cache, object); } From 200072ce33b298cf14d3ed2a570f5eb27609677d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:11 -0800 Subject: [PATCH 074/118] kasan: unify large kfree checks Unify checks in kasan_kfree_large() and in kasan_slab_free_mempool() for large allocations as it's done for small kfree() allocations. With this change, kasan_slab_free_mempool() starts checking that the first byte of the memory that's being freed is accessible. Link: https://lkml.kernel.org/r/14ffc4cd867e0b1ed58f7527e3b748a1b4ad08aa.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 16 ++++++++-------- mm/kasan/common.c | 36 ++++++++++++++++++++++++++---------- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 3fb31e8a353e..b91732bd05d7 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -200,6 +200,13 @@ static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object) return false; } +void __kasan_kfree_large(void *ptr, unsigned long ip); +static __always_inline void kasan_kfree_large(void *ptr) +{ + if (kasan_enabled()) + __kasan_kfree_large(ptr, _RET_IP_); +} + void __kasan_slab_free_mempool(void *ptr, unsigned long ip); static __always_inline void kasan_slab_free_mempool(void *ptr) { @@ -247,13 +254,6 @@ static __always_inline void * __must_check kasan_krealloc(const void *object, return (void *)object; } -void __kasan_kfree_large(void *ptr, unsigned long ip); -static __always_inline void kasan_kfree_large(void *ptr) -{ - if (kasan_enabled()) - __kasan_kfree_large(ptr, _RET_IP_); -} - /* * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for * the hardware tag-based mode that doesn't rely on compiler instrumentation. 
@@ -302,6 +302,7 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object) { return false; } +static inline void kasan_kfree_large(void *ptr) {} static inline void kasan_slab_free_mempool(void *ptr) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) @@ -322,7 +323,6 @@ static inline void *kasan_krealloc(const void *object, size_t new_size, { return (void *)object; } -static inline void kasan_kfree_large(void *ptr) {} static inline bool kasan_check_byte(const void *address) { return true; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 48d51daeda95..8a3d66393dc5 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -364,6 +364,31 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) return ____kasan_slab_free(cache, object, ip, true); } +static bool ____kasan_kfree_large(void *ptr, unsigned long ip) +{ + if (ptr != page_address(virt_to_head_page(ptr))) { + kasan_report_invalid_free(ptr, ip); + return true; + } + + if (!kasan_byte_accessible(ptr)) { + kasan_report_invalid_free(ptr, ip); + return true; + } + + /* + * The object will be poisoned by kasan_free_pages() or + * kasan_slab_free_mempool(). + */ + + return false; +} + +void __kasan_kfree_large(void *ptr, unsigned long ip) +{ + ____kasan_kfree_large(ptr, ip); +} + void __kasan_slab_free_mempool(void *ptr, unsigned long ip) { struct page *page; @@ -377,10 +402,8 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc. */ if (unlikely(!PageSlab(page))) { - if (ptr != page_address(page)) { - kasan_report_invalid_free(ptr, ip); + if (____kasan_kfree_large(ptr, ip)) return; - } kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE); } else { ____kasan_slab_free(page->slab_cache, ptr, ip, false); @@ -539,13 +562,6 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag return ____kasan_kmalloc(page->slab_cache, object, size, flags); } -void __kasan_kfree_large(void *ptr, unsigned long ip) -{ - if (ptr != page_address(virt_to_head_page(ptr))) - kasan_report_invalid_free(ptr, ip); - /* The object will be poisoned by kasan_free_pages(). */ -} - bool __kasan_check_byte(const void *address, unsigned long ip) { if (!kasan_byte_accessible(address)) { From b87c28b9a7ef64590943435ea59f40092f2376d5 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:15 -0800 Subject: [PATCH 075/118] kasan: rework krealloc tests This patch reworks KASAN-KUnit tests for krealloc() to: 1. Check both slab and page_alloc based krealloc() implementations. 2. Allow at least one full granule to fit between old and new sizes for each KASAN mode, and check accesses to that granule accordingly. 
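The constants used by the reworked tests (201, 235 and the midpoint 218) are
picked so that point 2 holds for every mode; a quick standalone check of the
granule layout, assuming granule sizes of 8 (generic) and 16 (tag-based modes):

  #include <stdio.h>

  #define round_up(x, y)   ((((x) + (y) - 1) / (y)) * (y))
  #define round_down(x, y) (((x) / (y)) * (y))

  static void check(unsigned long granule)
  {
          unsigned long size1 = 235, size2 = 201;             /* shrinking case */
          unsigned long middle = size2 + (size1 - size2) / 2; /* 218 */

          /* size2, middle and size1 must land in separate granules so the
           * middle and size1 offsets are guaranteed to stay poisoned. */
          printf("granule %2lu: %lu <= %lu and %lu <= %lu\n", granule,
                 round_up(size2, granule), round_down(middle, granule),
                 round_up(middle, granule), round_down(size1, granule));
  }

  int main(void)
  {
          check(8);       /* generic KASAN */
          check(16);      /* SW_TAGS / HW_TAGS */
          return 0;
  }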
Link: https://lkml.kernel.org/r/c707f128a2bb9f2f05185d1eb52192cf179cf4fa.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 91 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 81 insertions(+), 10 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 25576303897b..e1bd1d1096de 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -252,11 +252,14 @@ static void kmalloc_large_oob_right(struct kunit *test) kfree(ptr); } -static void kmalloc_oob_krealloc_more(struct kunit *test) +static void krealloc_more_oob_helper(struct kunit *test, + size_t size1, size_t size2) { char *ptr1, *ptr2; - size_t size1 = 17; - size_t size2 = 19; + size_t middle; + + KUNIT_ASSERT_LT(test, size1, size2); + middle = size1 + (size2 - size1) / 2; ptr1 = kmalloc(size1, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); @@ -264,15 +267,31 @@ static void kmalloc_oob_krealloc_more(struct kunit *test) ptr2 = krealloc(ptr1, size2, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); - KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2 + OOB_TAG_OFF] = 'x'); + /* All offsets up to size2 must be accessible. */ + ptr2[size1 - 1] = 'x'; + ptr2[size1] = 'x'; + ptr2[middle] = 'x'; + ptr2[size2 - 1] = 'x'; + + /* Generic mode is precise, so unaligned size2 must be inaccessible. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2] = 'x'); + + /* For all modes first aligned offset after size2 must be inaccessible. */ + KUNIT_EXPECT_KASAN_FAIL(test, + ptr2[round_up(size2, KASAN_GRANULE_SIZE)] = 'x'); + kfree(ptr2); } -static void kmalloc_oob_krealloc_less(struct kunit *test) +static void krealloc_less_oob_helper(struct kunit *test, + size_t size1, size_t size2) { char *ptr1, *ptr2; - size_t size1 = 17; - size_t size2 = 15; + size_t middle; + + KUNIT_ASSERT_LT(test, size2, size1); + middle = size2 + (size1 - size2) / 2; ptr1 = kmalloc(size1, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); @@ -280,10 +299,60 @@ static void kmalloc_oob_krealloc_less(struct kunit *test) ptr2 = krealloc(ptr1, size2, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); - KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2 + OOB_TAG_OFF] = 'x'); + /* Must be accessible for all modes. */ + ptr2[size2 - 1] = 'x'; + + /* Generic mode is precise, so unaligned size2 must be inaccessible. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2] = 'x'); + + /* For all modes first aligned offset after size2 must be inaccessible. */ + KUNIT_EXPECT_KASAN_FAIL(test, + ptr2[round_up(size2, KASAN_GRANULE_SIZE)] = 'x'); + + /* + * For all modes all size2, middle, and size1 should land in separate + * granules and thus the latter two offsets should be inaccessible. 
+ */ + KUNIT_EXPECT_LE(test, round_up(size2, KASAN_GRANULE_SIZE), + round_down(middle, KASAN_GRANULE_SIZE)); + KUNIT_EXPECT_LE(test, round_up(middle, KASAN_GRANULE_SIZE), + round_down(size1, KASAN_GRANULE_SIZE)); + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[middle] = 'x'); + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size1 - 1] = 'x'); + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size1] = 'x'); + kfree(ptr2); } +static void krealloc_more_oob(struct kunit *test) +{ + krealloc_more_oob_helper(test, 201, 235); +} + +static void krealloc_less_oob(struct kunit *test) +{ + krealloc_less_oob_helper(test, 235, 201); +} + +static void krealloc_pagealloc_more_oob(struct kunit *test) +{ + /* page_alloc fallback in only implemented for SLUB. */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); + + krealloc_more_oob_helper(test, KMALLOC_MAX_CACHE_SIZE + 201, + KMALLOC_MAX_CACHE_SIZE + 235); +} + +static void krealloc_pagealloc_less_oob(struct kunit *test) +{ + /* page_alloc fallback in only implemented for SLUB. */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); + + krealloc_less_oob_helper(test, KMALLOC_MAX_CACHE_SIZE + 235, + KMALLOC_MAX_CACHE_SIZE + 201); +} + static void kmalloc_oob_16(struct kunit *test) { struct { @@ -977,8 +1046,10 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(pagealloc_oob_right), KUNIT_CASE(pagealloc_uaf), KUNIT_CASE(kmalloc_large_oob_right), - KUNIT_CASE(kmalloc_oob_krealloc_more), - KUNIT_CASE(kmalloc_oob_krealloc_less), + KUNIT_CASE(krealloc_more_oob), + KUNIT_CASE(krealloc_less_oob), + KUNIT_CASE(krealloc_pagealloc_more_oob), + KUNIT_CASE(krealloc_pagealloc_less_oob), KUNIT_CASE(kmalloc_oob_16), KUNIT_CASE(kmalloc_uaf_16), KUNIT_CASE(kmalloc_oob_in_memset), From 26a5ca7a73be31f76c291465680517cde37051ca Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:19 -0800 Subject: [PATCH 076/118] kasan, mm: fail krealloc on freed objects Currently, if krealloc() is called on a freed object with KASAN enabled, it allocates and returns a new object, but doesn't copy any memory from the old one as ksize() returns 0. This makes the caller believe that krealloc() succeeded (KASAN report is printed though). This patch adds an accessibility check into __do_krealloc(). If the check fails, krealloc() returns NULL. This check duplicates the one in ksize(); this is fixed in the following patch. This patch also adds a KASAN-KUnit test to check krealloc() behaviour when it's called on a freed object. Link: https://lkml.kernel.org/r/cbcf7b02be0a1ca11de4f833f2ff0b3f2c9b00c8.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 20 ++++++++++++++++++++ mm/slab_common.c | 3 +++ 2 files changed, 23 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index e1bd1d1096de..e5647d147b35 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -353,6 +353,25 @@ static void krealloc_pagealloc_less_oob(struct kunit *test) KMALLOC_MAX_CACHE_SIZE + 201); } +/* + * Check that krealloc() detects a use-after-free, returns NULL, + * and doesn't unpoison the freed object. 
+ */ +static void krealloc_uaf(struct kunit *test) +{ + char *ptr1, *ptr2; + int size1 = 201; + int size2 = 235; + + ptr1 = kmalloc(size1, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + kfree(ptr1); + + KUNIT_EXPECT_KASAN_FAIL(test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL)); + KUNIT_ASSERT_PTR_EQ(test, (void *)ptr2, NULL); + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)ptr1); +} + static void kmalloc_oob_16(struct kunit *test) { struct { @@ -1050,6 +1069,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(krealloc_less_oob), KUNIT_CASE(krealloc_pagealloc_more_oob), KUNIT_CASE(krealloc_pagealloc_less_oob), + KUNIT_CASE(krealloc_uaf), KUNIT_CASE(kmalloc_oob_16), KUNIT_CASE(kmalloc_uaf_16), KUNIT_CASE(kmalloc_oob_in_memset), diff --git a/mm/slab_common.c b/mm/slab_common.c index 897c3a446b04..4aedb8455352 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1136,6 +1136,9 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, void *ret; size_t ks; + if (likely(!ZERO_OR_NULL_PTR(p)) && !kasan_check_byte(p)) + return NULL; + ks = ksize(p); if (ks >= new_size) { From d12d9ad816299052385bac351fad338a073121b3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:23 -0800 Subject: [PATCH 077/118] kasan, mm: optimize krealloc poisoning Currently, krealloc() always calls ksize(), which unpoisons the whole object including the redzone. This is inefficient, as kasan_krealloc() repoisons the redzone for objects that fit into the same buffer. This patch changes krealloc() instrumentation to use uninstrumented __ksize() that doesn't unpoison the memory. Instead, kasan_kreallos() is changed to unpoison the memory excluding the redzone. For objects that don't fit into the old allocation, this patch disables KASAN accessibility checks when copying memory into a new object instead of unpoisoning it. Link: https://lkml.kernel.org/r/9bef90327c9cb109d736c40115684fd32f49e6b0.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 12 ++++++++++-- mm/slab_common.c | 20 ++++++++++++++------ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 8a3d66393dc5..1e510649833b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -476,7 +476,7 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, /* * The object has already been unpoisoned by kasan_slab_alloc() for - * kmalloc() or by ksize() for krealloc(). + * kmalloc() or by kasan_krealloc() for krealloc(). */ /* @@ -526,7 +526,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, /* * The object has already been unpoisoned by kasan_alloc_pages() for - * alloc_pages() or by ksize() for krealloc(). + * alloc_pages() or by kasan_krealloc() for krealloc(). */ /* @@ -554,8 +554,16 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag if (unlikely(object == ZERO_SIZE_PTR)) return (void *)object; + /* + * Unpoison the object's data. + * Part of it might already have been unpoisoned, but it's unknown + * how big that part is. 
+ */ + kasan_unpoison(object, size); + page = virt_to_head_page(object); + /* Piggy-back on kmalloc() instrumentation to poison the redzone. */ if (unlikely(!PageSlab(page))) return __kasan_kmalloc_large(object, size, flags); else diff --git a/mm/slab_common.c b/mm/slab_common.c index 4aedb8455352..88e833986332 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1136,19 +1136,27 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, void *ret; size_t ks; - if (likely(!ZERO_OR_NULL_PTR(p)) && !kasan_check_byte(p)) - return NULL; - - ks = ksize(p); + /* Don't use instrumented ksize to allow precise KASAN poisoning. */ + if (likely(!ZERO_OR_NULL_PTR(p))) { + if (!kasan_check_byte(p)) + return NULL; + ks = kfence_ksize(p) ?: __ksize(p); + } else + ks = 0; + /* If the object still fits, repoison it precisely. */ if (ks >= new_size) { p = kasan_krealloc((void *)p, new_size, flags); return (void *)p; } ret = kmalloc_track_caller(new_size, flags); - if (ret && p) - memcpy(ret, p, ks); + if (ret && p) { + /* Disable KASAN checks as the object's redzone is accessed. */ + kasan_disable_current(); + memcpy(ret, kasan_reset_tag(p), ks); + kasan_enable_current(); + } return ret; } From cde8a7eb778c7c71f70d636aa0bb1ec081b9167c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:27 -0800 Subject: [PATCH 078/118] kasan: ensure poisoning size alignment A previous changes d99f6a10c161 ("kasan: don't round_up too much") attempted to simplify the code by adding a round_up(size) call into kasan_poison(). While this allows to have less round_up() calls around the code, this results in round_up() being called multiple times. This patch removes round_up() of size from kasan_poison() and ensures that all callers round_up() the size explicitly. This patch also adds WARN_ON() alignment checks for address and size to kasan_poison() and kasan_unpoison(). Link: https://lkml.kernel.org/r/3ffe8d4a246ae67a8b5e91f65bf98cd7cba9d7b9.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 9 ++++++--- mm/kasan/kasan.h | 33 ++++++++++++++++++++------------- mm/kasan/shadow.c | 37 ++++++++++++++++++++++--------------- 3 files changed, 48 insertions(+), 31 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 1e510649833b..dec7375fb884 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -261,7 +261,8 @@ void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) void __kasan_poison_object_data(struct kmem_cache *cache, void *object) { - kasan_poison(object, cache->object_size, KASAN_KMALLOC_REDZONE); + kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), + KASAN_KMALLOC_REDZONE); } /* @@ -348,7 +349,8 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, return true; } - kasan_poison(object, cache->object_size, KASAN_KMALLOC_FREE); + kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), + KASAN_KMALLOC_FREE); if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine)) return false; @@ -490,7 +492,8 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, /* Poison the aligned part of the redzone. 
*/ redzone_start = round_up((unsigned long)(object + size), KASAN_GRANULE_SIZE); - redzone_end = (unsigned long)object + cache->object_size; + redzone_end = round_up((unsigned long)(object + cache->object_size), + KASAN_GRANULE_SIZE); kasan_poison((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 222858e2e6af..8c55634d6edd 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -330,30 +330,37 @@ static inline u8 kasan_random_tag(void) { return 0; } #ifdef CONFIG_KASAN_HW_TAGS -static inline void kasan_poison(const void *address, size_t size, u8 value) +static inline void kasan_poison(const void *addr, size_t size, u8 value) { - address = kasan_reset_tag(address); + addr = kasan_reset_tag(addr); /* Skip KFENCE memory if called explicitly outside of sl*b. */ - if (is_kfence_address(address)) + if (is_kfence_address(addr)) return; - hw_set_mem_tag_range((void *)address, - round_up(size, KASAN_GRANULE_SIZE), value); + if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) + return; + if (WARN_ON(size & KASAN_GRANULE_MASK)) + return; + + hw_set_mem_tag_range((void *)addr, size, value); } -static inline void kasan_unpoison(const void *address, size_t size) +static inline void kasan_unpoison(const void *addr, size_t size) { - u8 tag = get_tag(address); + u8 tag = get_tag(addr); - address = kasan_reset_tag(address); + addr = kasan_reset_tag(addr); /* Skip KFENCE memory if called explicitly outside of sl*b. */ - if (is_kfence_address(address)) + if (is_kfence_address(addr)) return; - hw_set_mem_tag_range((void *)address, - round_up(size, KASAN_GRANULE_SIZE), tag); + if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) + return; + size = round_up(size, KASAN_GRANULE_SIZE); + + hw_set_mem_tag_range((void *)addr, size, tag); } static inline bool kasan_byte_accessible(const void *addr) @@ -370,7 +377,7 @@ static inline bool kasan_byte_accessible(const void *addr) /** * kasan_poison - mark the memory range as unaccessible * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size + * @size - range size, must be aligned to KASAN_GRANULE_SIZE * @value - value that's written to metadata for the range * * The size gets aligned to KASAN_GRANULE_SIZE before marking the range. @@ -380,7 +387,7 @@ void kasan_poison(const void *addr, size_t size, u8 value); /** * kasan_unpoison - mark the memory range as accessible * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size + * @size - range size, can be unaligned * * For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before * marking the range. diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 1ed7817e4ee6..63f43443f5d7 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -69,7 +69,7 @@ void *memcpy(void *dest, const void *src, size_t len) return __memcpy(dest, src, len); } -void kasan_poison(const void *address, size_t size, u8 value) +void kasan_poison(const void *addr, size_t size, u8 value) { void *shadow_start, *shadow_end; @@ -78,55 +78,62 @@ void kasan_poison(const void *address, size_t size, u8 value) * some of the callers (e.g. kasan_poison_object_data) pass tagged * addresses to this function. */ - address = kasan_reset_tag(address); + addr = kasan_reset_tag(addr); /* Skip KFENCE memory if called explicitly outside of sl*b. 
*/ - if (is_kfence_address(address)) + if (is_kfence_address(addr)) return; - size = round_up(size, KASAN_GRANULE_SIZE); - shadow_start = kasan_mem_to_shadow(address); - shadow_end = kasan_mem_to_shadow(address + size); + if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) + return; + if (WARN_ON(size & KASAN_GRANULE_MASK)) + return; + + shadow_start = kasan_mem_to_shadow(addr); + shadow_end = kasan_mem_to_shadow(addr + size); __memset(shadow_start, value, shadow_end - shadow_start); } EXPORT_SYMBOL(kasan_poison); #ifdef CONFIG_KASAN_GENERIC -void kasan_poison_last_granule(const void *address, size_t size) +void kasan_poison_last_granule(const void *addr, size_t size) { if (size & KASAN_GRANULE_MASK) { - u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); + u8 *shadow = (u8 *)kasan_mem_to_shadow(addr + size); *shadow = size & KASAN_GRANULE_MASK; } } #endif -void kasan_unpoison(const void *address, size_t size) +void kasan_unpoison(const void *addr, size_t size) { - u8 tag = get_tag(address); + u8 tag = get_tag(addr); /* * Perform shadow offset calculation based on untagged address, as * some of the callers (e.g. kasan_unpoison_object_data) pass tagged * addresses to this function. */ - address = kasan_reset_tag(address); + addr = kasan_reset_tag(addr); /* * Skip KFENCE memory if called explicitly outside of sl*b. Also note * that calls to ksize(), where size is not a multiple of machine-word * size, would otherwise poison the invalid portion of the word. */ - if (is_kfence_address(address)) + if (is_kfence_address(addr)) return; - /* Unpoison round_up(size, KASAN_GRANULE_SIZE) bytes. */ - kasan_poison(address, size, tag); + if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) + return; + + /* Unpoison all granules that cover the object. */ + kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag); /* Partially poison the last granule for the generic mode. */ if (IS_ENABLED(CONFIG_KASAN_GENERIC)) - kasan_poison_last_granule(address, size); + kasan_poison_last_granule(addr, size); } #ifdef CONFIG_MEMORY_HOTPLUG From 2cb34276427a093e2d7cc6ea63ac447bad1ff4c1 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:31 -0800 Subject: [PATCH 079/118] arm64: kasan: simplify and inline MTE functions This change provides a simpler implementation of mte_get_mem_tag(), mte_get_random_tag(), and mte_set_mem_tag_range(). Simplifications include removing system_supports_mte() checks as these functions are onlye called from KASAN runtime that had already checked system_supports_mte(). Besides that, size and address alignment checks are removed from mte_set_mem_tag_range(), as KASAN now does those. This change also moves these functions into the asm/mte-kasan.h header and implements mte_set_mem_tag_range() via inline assembly to avoid unnecessary functions calls. 
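The inline mte_set_mem_tag_range() boils down to one stg store per 16-byte MTE
granule over the range; a rough userspace model of that iteration (no MTE
involved, the address and tag are made up):

  #include <stdio.h>

  #define MTE_GRANULE_SIZE 16UL

  static void set_mem_tag_range(unsigned long addr, unsigned long size,
                                unsigned char tag)
  {
          unsigned long curr = addr, end = addr + size;

          /* Mirrors the do/while in the patch: size is assumed non-zero and
           * granule-aligned, so the loop terminates exactly at 'end'. */
          do {
                  printf("stg: tag 0x%x -> granule at 0x%lx\n", tag, curr);
                  curr += MTE_GRANULE_SIZE;
          } while (curr != end);
  }

  int main(void)
  {
          set_mem_tag_range(0x1000, 48, 0xa);     /* three granules, three stores */
          return 0;
  }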
[vincenzo.frascino@arm.com: fix warning in mte_get_random_tag()] Link: https://lkml.kernel.org/r/20210211152208.23811-1-vincenzo.frascino@arm.com Link: https://lkml.kernel.org/r/a26121b294fdf76e369cb7a74351d1c03a908930.1612546384.git.andreyknvl@google.com Co-developed-by: Vincenzo Frascino Signed-off-by: Vincenzo Frascino Signed-off-by: Andrey Konovalov Reviewed-by: Catalin Marinas Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Marco Elver Cc: Peter Collingbourne Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/cache.h | 1 - arch/arm64/include/asm/kasan.h | 1 + arch/arm64/include/asm/mte-def.h | 2 + arch/arm64/include/asm/mte-kasan.h | 67 ++++++++++++++++++++++++++---- arch/arm64/include/asm/mte.h | 2 - arch/arm64/kernel/mte.c | 46 -------------------- arch/arm64/lib/mte.S | 16 ------- 7 files changed, 61 insertions(+), 74 deletions(-) diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index 77cbbe3625f2..a074459f8f2f 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -6,7 +6,6 @@ #define __ASM_CACHE_H #include -#include #define CTR_L1IP_SHIFT 14 #define CTR_L1IP_MASK 3 diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h index 0aaf9044cd6a..12d5f47f7dbe 100644 --- a/arch/arm64/include/asm/kasan.h +++ b/arch/arm64/include/asm/kasan.h @@ -6,6 +6,7 @@ #include #include +#include #include #define arch_kasan_set_tag(addr, tag) __tag_set(addr, tag) diff --git a/arch/arm64/include/asm/mte-def.h b/arch/arm64/include/asm/mte-def.h index 2d73a1612f09..cf241b0f0a42 100644 --- a/arch/arm64/include/asm/mte-def.h +++ b/arch/arm64/include/asm/mte-def.h @@ -11,4 +11,6 @@ #define MTE_TAG_SIZE 4 #define MTE_TAG_MASK GENMASK((MTE_TAG_SHIFT + (MTE_TAG_SIZE - 1)), MTE_TAG_SHIFT) +#define __MTE_PREAMBLE ARM64_ASM_PREAMBLE ".arch_extension memtag\n" + #endif /* __ASM_MTE_DEF_H */ diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h index 3748d5bb88c0..7ab500e2ad17 100644 --- a/arch/arm64/include/asm/mte-kasan.h +++ b/arch/arm64/include/asm/mte-kasan.h @@ -11,12 +11,15 @@ #include -/* - * The functions below are meant to be used only for the - * KASAN_HW_TAGS interface defined in asm/memory.h. - */ #ifdef CONFIG_ARM64_MTE +/* + * These functions are meant to be only used from KASAN runtime through + * the arch_*() interface defined in asm/memory.h. + * These functions don't include system_supports_mte() checks, + * as KASAN only calls them when MTE is supported and enabled. + */ + static inline u8 mte_get_ptr_tag(void *ptr) { /* Note: The format of KASAN tags is 0xF */ @@ -25,9 +28,54 @@ static inline u8 mte_get_ptr_tag(void *ptr) return tag; } -u8 mte_get_mem_tag(void *addr); -u8 mte_get_random_tag(void); -void *mte_set_mem_tag_range(void *addr, size_t size, u8 tag); +/* Get allocation tag for the address. */ +static inline u8 mte_get_mem_tag(void *addr) +{ + asm(__MTE_PREAMBLE "ldg %0, [%0]" + : "+r" (addr)); + + return mte_get_ptr_tag(addr); +} + +/* Generate a random tag. */ +static inline u8 mte_get_random_tag(void) +{ + void *addr; + + asm(__MTE_PREAMBLE "irg %0, %0" + : "=r" (addr)); + + return mte_get_ptr_tag(addr); +} + +/* + * Assign allocation tags for a region of memory based on the pointer tag. + * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and + * size must be non-zero and MTE_GRANULE_SIZE aligned. 
+ */ +static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag) +{ + u64 curr, end; + + if (!size) + return; + + curr = (u64)__tag_set(addr, tag); + end = curr + size; + + do { + /* + * 'asm volatile' is required to prevent the compiler to move + * the statement outside of the loop. + */ + asm volatile(__MTE_PREAMBLE "stg %0, [%0]" + : + : "r" (curr) + : "memory"); + + curr += MTE_GRANULE_SIZE; + } while (curr != end); +} void mte_enable_kernel(void); void mte_init_tags(u64 max_tag); @@ -46,13 +94,14 @@ static inline u8 mte_get_mem_tag(void *addr) { return 0xFF; } + static inline u8 mte_get_random_tag(void) { return 0xFF; } -static inline void *mte_set_mem_tag_range(void *addr, size_t size, u8 tag) + +static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag) { - return addr; } static inline void mte_enable_kernel(void) diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index d02aff9f493d..9b557a457f24 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -8,8 +8,6 @@ #include #include -#define __MTE_PREAMBLE ARM64_ASM_PREAMBLE ".arch_extension memtag\n" - #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 2cfc850809ce..b3c70a612c7a 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -88,51 +87,6 @@ int memcmp_pages(struct page *page1, struct page *page2) return ret; } -u8 mte_get_mem_tag(void *addr) -{ - if (!system_supports_mte()) - return 0xFF; - - asm(__MTE_PREAMBLE "ldg %0, [%0]" - : "+r" (addr)); - - return mte_get_ptr_tag(addr); -} - -u8 mte_get_random_tag(void) -{ - void *addr; - - if (!system_supports_mte()) - return 0xFF; - - asm(__MTE_PREAMBLE "irg %0, %0" - : "+r" (addr)); - - return mte_get_ptr_tag(addr); -} - -void *mte_set_mem_tag_range(void *addr, size_t size, u8 tag) -{ - void *ptr = addr; - - if ((!system_supports_mte()) || (size == 0)) - return addr; - - /* Make sure that size is MTE granule aligned. */ - WARN_ON(size & (MTE_GRANULE_SIZE - 1)); - - /* Make sure that the address is MTE granule aligned. */ - WARN_ON((u64)addr & (MTE_GRANULE_SIZE - 1)); - - tag = 0xF0 | tag; - ptr = (void *)__tag_set(ptr, tag); - - mte_assign_mem_tag_range(ptr, size); - - return ptr; -} - void mte_init_tags(u64 max_tag) { static bool gcr_kernel_excl_initialized; diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S index 9e1a12e10053..351537c12f36 100644 --- a/arch/arm64/lib/mte.S +++ b/arch/arm64/lib/mte.S @@ -149,19 +149,3 @@ SYM_FUNC_START(mte_restore_page_tags) ret SYM_FUNC_END(mte_restore_page_tags) - -/* - * Assign allocation tags for a region of memory based on the pointer tag - * x0 - source pointer - * x1 - size - * - * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and - * size must be non-zero and MTE_GRANULE_SIZE aligned. - */ -SYM_FUNC_START(mte_assign_mem_tag_range) -1: stg x0, [x0] - add x0, x0, #MTE_GRANULE_SIZE - subs x1, x1, #MTE_GRANULE_SIZE - b.gt 1b - ret -SYM_FUNC_END(mte_assign_mem_tag_range) From c80a03664e154b7263af1c4dd53f42221d0c8283 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:35 -0800 Subject: [PATCH 080/118] kasan: inline HW_TAGS helper functions Mark all static functions in common.c and kasan.h that are used for hardware tag-based KASAN as inline to avoid unnecessary function calls. 
Link: https://lkml.kernel.org/r/2c94a2af0657f2b95b9337232339ff5ffa643ab5.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index dec7375fb884..b5e08d4cefec 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -279,7 +279,8 @@ void __kasan_poison_object_data(struct kmem_cache *cache, void *object) * based on objects indexes, so that objects that are next to each other * get different tags. */ -static u8 assign_tag(struct kmem_cache *cache, const void *object, bool init) +static inline u8 assign_tag(struct kmem_cache *cache, + const void *object, bool init) { if (IS_ENABLED(CONFIG_KASAN_GENERIC)) return 0xff; @@ -321,8 +322,8 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, return (void *)object; } -static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, - unsigned long ip, bool quarantine) +static inline bool ____kasan_slab_free(struct kmem_cache *cache, + void *object, unsigned long ip, bool quarantine) { u8 tag; void *tagged_object; @@ -366,7 +367,7 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) return ____kasan_slab_free(cache, object, ip, true); } -static bool ____kasan_kfree_large(void *ptr, unsigned long ip) +static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) { if (ptr != page_address(virt_to_head_page(ptr))) { kasan_report_invalid_free(ptr, ip); @@ -461,8 +462,8 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, return tagged_object; } -static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, - size_t size, gfp_t flags) +static inline void *____kasan_kmalloc(struct kmem_cache *cache, + const void *object, size_t size, gfp_t flags) { unsigned long redzone_start; unsigned long redzone_end; From 7169487bc2a7c5732a6eeebc6dc3d1351d4a6350 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:38 -0800 Subject: [PATCH 081/118] kasan: clarify that only first bug is reported in HW_TAGS Hwardware tag-based KASAN only reports the first found bug. After that MTE tag checking gets disabled. Clarify this in comments and documentation. Link: https://lkml.kernel.org/r/00383ba88a47c3f8342d12263c24bdf95527b07d.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 8 ++++++-- mm/kasan/hw_tags.c | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index cde14aeefca7..ddf4239a5890 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -155,7 +155,7 @@ Boot parameters ~~~~~~~~~~~~~~~ Hardware tag-based KASAN mode (see the section about various modes below) is -intended for use in production as a security mitigation. 
Therefore it supports +intended for use in production as a security mitigation. Therefore, it supports boot parameters that allow to disable KASAN competely or otherwise control particular KASAN features. @@ -165,7 +165,8 @@ particular KASAN features. traces collection (default: ``on``). - ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN - report or also panic the kernel (default: ``report``). + report or also panic the kernel (default: ``report``). Note, that tag + checking gets disabled after the first reported bug. For developers ~~~~~~~~~~~~~~ @@ -295,6 +296,9 @@ Note, that enabling CONFIG_KASAN_HW_TAGS always results in in-kernel TBI being enabled. Even when kasan.mode=off is provided, or when the hardware doesn't support MTE (but supports TBI). +Hardware tag-based KASAN only reports the first found bug. After that MTE tag +checking gets disabled. + What memory accesses are sanitised by KASAN? -------------------------------------------- diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index b31aeef505dd..2aad21fda156 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -48,7 +48,7 @@ EXPORT_SYMBOL(kasan_flag_enabled); /* Whether to collect alloc/free stack traces. */ DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); -/* Whether panic or disable tag checking on fault. */ +/* Whether to panic or print a report and disable tag checking on fault. */ bool kasan_flag_panic __ro_after_init; /* kasan=off/on */ From 2956f4e4f0c504697f9dd6b84fd5c57ede35d333 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Feb 2021 17:20:42 -0800 Subject: [PATCH 082/118] alpha: remove CONFIG_EXPERIMENTAL from defconfigs Since CONFIG_EXPERIMENTAL was removed in 2013, go ahead and drop it from any defconfig files. Link: https://lkml.kernel.org/r/20210115005956.29408-1-rdunlap@infradead.org Fixes: 3d374d09f16f ("final removal of CONFIG_EXPERIMENTAL") Signed-off-by: Randy Dunlap Cc: Kees Cook Cc: Greg Kroah-Hartman Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/configs/defconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/alpha/configs/defconfig b/arch/alpha/configs/defconfig index 6293675db164..724c4075df40 100644 --- a/arch/alpha/configs/defconfig +++ b/arch/alpha/configs/defconfig @@ -1,4 +1,3 @@ -CONFIG_EXPERIMENTAL=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_LOG_BUF_SHIFT=14 From 152c432b128cb043fc107e8f211195fe94b2159c Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 25 Feb 2021 17:20:45 -0800 Subject: [PATCH 083/118] proc/wchan: use printk format instead of lookup_symbol_name() To resolve the symbol fuction name for wchan, use the printk format specifier %ps instead of manually looking up the symbol function name via lookup_symbol_name(). 
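What a consumer sees is still either a symbol name (now rendered via %ps) or
the single character '0'; a minimal userspace reader, with pid 1 and the
example symbol purely illustrative:

  #include <stdio.h>

  int main(void)
  {
          char buf[128] = "";
          FILE *f = fopen("/proc/1/wchan", "r");

          if (!f)
                  return 1;
          /* wchan has no trailing newline; expect "0" or e.g. "ep_poll". */
          fgets(buf, sizeof(buf), f);
          fclose(f);
          printf("pid 1 is waiting in: %s\n", buf[0] ? buf : "(unknown)");
          return 0;
  }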
Link: https://lkml.kernel.org/r/20201217165413.GA1959@ls3530.fritz.box Signed-off-by: Helge Deller Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 56bf14316122..3851bfcdba56 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -67,7 +67,6 @@ #include #include #include -#include #include #include #include @@ -386,19 +385,17 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long wchan; - char symname[KSYM_NAME_LEN]; - if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) - goto print0; + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + wchan = get_wchan(task); + else + wchan = 0; - wchan = get_wchan(task); - if (wchan && !lookup_symbol_name(wchan, symname)) { - seq_puts(m, symname); - return 0; - } + if (wchan) + seq_printf(m, "%ps", (void *) wchan); + else + seq_putc(m, '0'); -print0: - seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ From 4508943794efdd94171549c0bd52810e2f4ad9fe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 25 Feb 2021 17:20:49 -0800 Subject: [PATCH 084/118] proc: use kvzalloc for our kernel buffer Since "sysctl: pass kernel pointers to ->proc_handler" we have been pre-allocating a buffer to copy the data from the proc handlers into, and then copying that to userspace. The problem is this just blindly kzalloc()'s the buffer size passed in from the read, which in the case of our 'cat' binary was 64KiB. Order-4 allocations are not awesome, and since we can potentially allocate up to our maximum order, use kvzalloc for these buffers. [willy@infradead.org: changelog tweaks] Link: https://lkml.kernel.org/r/6345270a2c1160b89dd5e6715461f388176899d1.1612972413.git.josef@toxicpanda.com Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Signed-off-by: Josef Bacik Reviewed-by: Christoph Hellwig Acked-by: Vlastimil Babka Cc: Al Viro Cc: Alexey Dobriyan CC: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 656ba24c317d..984e42f8cb11 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -571,7 +571,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, error = -ENOMEM; if (count >= KMALLOC_MAX_SIZE) goto out; - kbuf = kzalloc(count + 1, GFP_KERNEL); + kbuf = kvzalloc(count + 1, GFP_KERNEL); if (!kbuf) goto out; @@ -600,7 +600,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, error = count; out_free_buf: - kfree(kbuf); + kvfree(kbuf); out: sysctl_head_finish(head); From 3b3376f222e3ab58367d9dd405cafd09d5e37b7c Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Thu, 25 Feb 2021 17:20:53 -0800 Subject: [PATCH 085/118] sysctl.c: fix underflow value setting risk in vm_table Apart from subsystem-specific .proc_handler handlers, all ctl_tables with extra1 and extra2 members set should use proc_dointvec_minmax instead of proc_dointvec, otherwise the limits set in extra* never take effect, and echoing underflow values (negative numbers) into them is likely to make the system unstable. This is especially true for vfs_cache_pressure and zone_reclaim_mode: -1 is apparently not a valid value, yet we can currently set it, and the kernel may then crash.
# echo -1 > /proc/sys/vm/vfs_cache_pressure Link: https://lkml.kernel.org/r/20201223105535.2875-1-linf@wangsu.com Signed-off-by: Lin Feng Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c9fbdd848138..62fbd09b5dc1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2962,7 +2962,7 @@ static struct ctl_table vm_table[] = { .data = &block_dump, .maxlen = sizeof(block_dump), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { @@ -2970,7 +2970,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_vfs_cache_pressure, .maxlen = sizeof(sysctl_vfs_cache_pressure), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ @@ -2980,7 +2980,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_legacy_va_layout, .maxlen = sizeof(sysctl_legacy_va_layout), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #endif @@ -2990,7 +2990,7 @@ static struct ctl_table vm_table[] = { .data = &node_reclaim_mode, .maxlen = sizeof(node_reclaim_mode), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { From df54714f579a77662054132161612ce3da876b0d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Feb 2021 17:20:56 -0800 Subject: [PATCH 086/118] include/linux: remove repeated words Drop the doubled word "for" in a comment. {firewire-cdev.h} Drop the doubled word "in" in a comment. {input.h} Drop the doubled word "a" in a comment. {mdev.h} Drop the doubled word "the" in a comment. {ptrace.h} Link: https://lkml.kernel.org/r/20210126232444.22861-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Stefan Richter Cc: Dmitry Torokhov Cc: Kirti Wankhede Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mdev.h | 2 +- include/linux/ptrace.h | 2 +- include/uapi/linux/firewire-cdev.h | 2 +- include/uapi/linux/input.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 9004375c462e..27eb383cb95d 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -42,7 +42,7 @@ struct device *mdev_get_iommu_device(struct device *dev); * @mdev: mdev_device structure on of mediated device * that is being created * Returns integer: success (0) or error (< 0) - * @remove: Called to free resources in parent device's driver for a + * @remove: Called to free resources in parent device's driver for * a mediated device. It is mandatory to provide 'remove' * ops. * @mdev: mdev_device device structure which is being diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 2a9df80ea887..b5ebf6c01292 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -171,7 +171,7 @@ static inline void ptrace_event(int event, unsigned long message) * * Check whether @event is enabled and, if so, report @event and @pid * to the ptrace parent. @pid is reported as the pid_t seen from the - * the ptrace parent's pid namespace. + * ptrace parent's pid namespace. * * Called without locks. 
*/ diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h index 7e5b5c10a49c..5effa9832802 100644 --- a/include/uapi/linux/firewire-cdev.h +++ b/include/uapi/linux/firewire-cdev.h @@ -844,7 +844,7 @@ struct fw_cdev_queue_iso { * struct fw_cdev_start_iso - Start an isochronous transmission or reception * @cycle: Cycle in which to start I/O. If @cycle is greater than or * equal to 0, the I/O will start on that cycle. - * @sync: Determines the value to wait for for receive packets that have + * @sync: Determines the value to wait for receive packets that have * the %FW_CDEV_ISO_SYNC bit set * @tags: Tag filter bit mask. Only valid for isochronous reception. * Determines the tag values for which packets will be accepted. diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index 9a61c28ed3ae..ee3127461ee0 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -84,7 +84,7 @@ struct input_id { * in units per radian. * When INPUT_PROP_ACCELEROMETER is set the resolution changes. * The main axes (ABS_X, ABS_Y, ABS_Z) are then reported in - * in units per g (units/g) and in units per degree per second + * units per g (units/g) and in units per degree per second * (units/deg/s) for rotational axes (ABS_RX, ABS_RY, ABS_RZ). */ struct input_absinfo { From c131bd0b5448bb577b7a9ed48c4e528807e8d5af Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Thu, 25 Feb 2021 17:21:00 -0800 Subject: [PATCH 087/118] treewide: Miguel has moved Update contact info. Link: https://lkml.kernel.org/r/20210206162524.GA11520@kernel.org Signed-off-by: Miguel Ojeda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .mailmap | 1 + CREDITS | 9 +++------ Documentation/admin-guide/auxdisplay/cfag12864b.rst | 2 +- Documentation/admin-guide/auxdisplay/ks0108.rst | 2 +- MAINTAINERS | 12 ++++++------ drivers/auxdisplay/cfag12864b.c | 4 ++-- drivers/auxdisplay/cfag12864bfb.c | 4 ++-- drivers/auxdisplay/ks0108.c | 4 ++-- include/linux/cfag12864b.h | 2 +- include/linux/ks0108.h | 2 +- samples/auxdisplay/cfag12864b-example.c | 2 +- 11 files changed, 21 insertions(+), 23 deletions(-) diff --git a/.mailmap b/.mailmap index 87a8bbdbf749..85b93cdefc87 100644 --- a/.mailmap +++ b/.mailmap @@ -237,6 +237,7 @@ Maxime Ripard Mayuresh Janorkar Michael Buesch Michel Dänzer +Miguel Ojeda Mike Rapoport Mike Rapoport Mike Rapoport diff --git a/CREDITS b/CREDITS index be097156bd71..cef83b958cbe 100644 --- a/CREDITS +++ b/CREDITS @@ -2841,14 +2841,11 @@ S: Subiaco, 6008 S: Perth, Western Australia S: Australia -N: Miguel Ojeda Sandonis -E: miguel.ojeda.sandonis@gmail.com -W: http://miguelojeda.es -W: http://jair.lab.fi.uva.es/~migojed/ +N: Miguel Ojeda +E: ojeda@kernel.org +W: https://ojeda.dev D: Author of the ks0108, cfag12864b and cfag12864bfb auxiliary display drivers. 
D: Maintainer of the auxiliary display drivers tree (drivers/auxdisplay/*) -S: C/ Mieses 20, 9-B -S: Valladolid 47009 S: Spain N: Peter Oruba diff --git a/Documentation/admin-guide/auxdisplay/cfag12864b.rst b/Documentation/admin-guide/auxdisplay/cfag12864b.rst index 18c2865bd322..da385d851acc 100644 --- a/Documentation/admin-guide/auxdisplay/cfag12864b.rst +++ b/Documentation/admin-guide/auxdisplay/cfag12864b.rst @@ -3,7 +3,7 @@ cfag12864b LCD Driver Documentation =================================== :License: GPLv2 -:Author & Maintainer: Miguel Ojeda Sandonis +:Author & Maintainer: Miguel Ojeda :Date: 2006-10-27 diff --git a/Documentation/admin-guide/auxdisplay/ks0108.rst b/Documentation/admin-guide/auxdisplay/ks0108.rst index c0b7faf73136..a7d3fe509373 100644 --- a/Documentation/admin-guide/auxdisplay/ks0108.rst +++ b/Documentation/admin-guide/auxdisplay/ks0108.rst @@ -3,7 +3,7 @@ ks0108 LCD Controller Driver Documentation ========================================== :License: GPLv2 -:Author & Maintainer: Miguel Ojeda Sandonis +:Author & Maintainer: Miguel Ojeda :Date: 2006-10-27 diff --git a/MAINTAINERS b/MAINTAINERS index 40040db747fc..e42082eccf36 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2982,7 +2982,7 @@ F: include/uapi/linux/audit.h F: kernel/audit* AUXILIARY DISPLAY DRIVERS -M: Miguel Ojeda Sandonis +M: Miguel Ojeda S: Maintained F: drivers/auxdisplay/ F: include/linux/cfag12864b.h @@ -4128,13 +4128,13 @@ F: scripts/extract-cert.c F: scripts/sign-file.c CFAG12864B LCD DRIVER -M: Miguel Ojeda Sandonis +M: Miguel Ojeda S: Maintained F: drivers/auxdisplay/cfag12864b.c F: include/linux/cfag12864b.h CFAG12864BFB LCD FRAMEBUFFER DRIVER -M: Miguel Ojeda Sandonis +M: Miguel Ojeda S: Maintained F: drivers/auxdisplay/cfag12864bfb.c F: include/linux/cfag12864b.h @@ -4304,7 +4304,7 @@ S: Supported F: drivers/infiniband/hw/usnic/ CLANG-FORMAT FILE -M: Miguel Ojeda +M: Miguel Ojeda S: Maintained F: .clang-format @@ -4444,7 +4444,7 @@ S: Maintained F: drivers/platform/x86/compal-laptop.c COMPILER ATTRIBUTES -M: Miguel Ojeda +M: Miguel Ojeda S: Maintained F: include/linux/compiler_attributes.h @@ -9939,7 +9939,7 @@ F: include/linux/kprobes.h F: kernel/kprobes.c KS0108 LCD CONTROLLER DRIVER -M: Miguel Ojeda Sandonis +M: Miguel Ojeda S: Maintained F: Documentation/admin-guide/auxdisplay/ks0108.rst F: drivers/auxdisplay/ks0108.c diff --git a/drivers/auxdisplay/cfag12864b.c b/drivers/auxdisplay/cfag12864b.c index 7eebae7e322c..fd430e6866a1 100644 --- a/drivers/auxdisplay/cfag12864b.c +++ b/drivers/auxdisplay/cfag12864b.c @@ -5,7 +5,7 @@ * Description: cfag12864b LCD driver * Depends: ks0108 * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda * Date: 2006-10-31 */ @@ -376,5 +376,5 @@ module_init(cfag12864b_init); module_exit(cfag12864b_exit); MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Miguel Ojeda Sandonis "); +MODULE_AUTHOR("Miguel Ojeda "); MODULE_DESCRIPTION("cfag12864b LCD driver"); diff --git a/drivers/auxdisplay/cfag12864bfb.c b/drivers/auxdisplay/cfag12864bfb.c index 2002291ab338..d66821adf453 100644 --- a/drivers/auxdisplay/cfag12864bfb.c +++ b/drivers/auxdisplay/cfag12864bfb.c @@ -5,7 +5,7 @@ * Description: cfag12864b LCD framebuffer driver * Depends: cfag12864b * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda * Date: 2006-10-31 */ @@ -171,5 +171,5 @@ module_init(cfag12864bfb_init); module_exit(cfag12864bfb_exit); MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Miguel Ojeda Sandonis "); +MODULE_AUTHOR("Miguel Ojeda "); 
MODULE_DESCRIPTION("cfag12864b LCD framebuffer driver"); diff --git a/drivers/auxdisplay/ks0108.c b/drivers/auxdisplay/ks0108.c index abfe3fa9e6f4..03c95ad4216c 100644 --- a/drivers/auxdisplay/ks0108.c +++ b/drivers/auxdisplay/ks0108.c @@ -5,7 +5,7 @@ * Description: ks0108 LCD Controller driver * Depends: parport * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda * Date: 2006-10-31 */ @@ -182,6 +182,6 @@ module_init(ks0108_init); module_exit(ks0108_exit); MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Miguel Ojeda Sandonis "); +MODULE_AUTHOR("Miguel Ojeda "); MODULE_DESCRIPTION("ks0108 LCD Controller driver"); diff --git a/include/linux/cfag12864b.h b/include/linux/cfag12864b.h index 4060004968c8..6617d9c68d86 100644 --- a/include/linux/cfag12864b.h +++ b/include/linux/cfag12864b.h @@ -4,7 +4,7 @@ * Version: 0.1.0 * Description: cfag12864b LCD driver header * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda * Date: 2006-10-12 */ diff --git a/include/linux/ks0108.h b/include/linux/ks0108.h index 0738389b42b6..1a37a664f915 100644 --- a/include/linux/ks0108.h +++ b/include/linux/ks0108.h @@ -4,7 +4,7 @@ * Version: 0.1.0 * Description: ks0108 LCD Controller driver header * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda * Date: 2006-10-31 */ diff --git a/samples/auxdisplay/cfag12864b-example.c b/samples/auxdisplay/cfag12864b-example.c index bfeab44f81d0..2e3bb7375c99 100644 --- a/samples/auxdisplay/cfag12864b-example.c +++ b/samples/auxdisplay/cfag12864b-example.c @@ -4,7 +4,7 @@ * Version: 0.1.0 * Description: cfag12864b LCD userspace example program * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda * Date: 2006-10-31 */ From c1f26493ed7f363c63e0e9d91e50d4db26df6603 Mon Sep 17 00:00:00 2001 From: Hubert Jasudowicz Date: Thu, 25 Feb 2021 17:21:03 -0800 Subject: [PATCH 088/118] groups: use flexible-array member in struct group_info Replace zero-size array with flexible array member, as recommended by the docs. Link: https://lkml.kernel.org/r/155995eed35c3c1bdcc56e69d8997c8e4c46740a.1611620846.git.hubert.jasudowicz@gmail.com Signed-off-by: Hubert Jasudowicz Cc: "Peter Zijlstra (Intel)" Cc: Micah Morton Cc: Gao Xiang Cc: Michael Kelley Cc: Thomas Cedeno Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cred.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/cred.h b/include/linux/cred.h index 18639c069263..4c6350503697 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -25,7 +25,7 @@ struct inode; struct group_info { atomic_t usage; int ngroups; - kgid_t gid[0]; + kgid_t gid[]; } __randomize_layout; /** From e1e014115dfd48ab3e3691ce46f9484ce12e67d4 Mon Sep 17 00:00:00 2001 From: Hubert Jasudowicz Date: Thu, 25 Feb 2021 17:21:07 -0800 Subject: [PATCH 089/118] groups: simplify struct group_info allocation Combine kmalloc and vmalloc into a single call. Use struct_size macro instead of direct size calculation. 
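As a minimal illustrative sketch (not part of the patch itself; gidsetsize stands for the caller-supplied group count), the flexible-array member combines with struct_size() and kvmalloc() like this:

    struct group_info {
        atomic_t usage;
        int ngroups;
        kgid_t gid[];    /* flexible array member */
    } __randomize_layout;

    /* struct_size(gi, gid, n) == sizeof(*gi) + n * sizeof(gi->gid[0]), with overflow checking */
    struct group_info *gi = kvmalloc(struct_size(gi, gid, gidsetsize), GFP_KERNEL_ACCOUNT);

    if (!gi)
        return NULL;
    gi->ngroups = gidsetsize;

kvmalloc() attempts a kmalloc() first and transparently falls back to vmalloc() when the request is too large or memory is too fragmented, which is why the separate kmalloc()/__vmalloc() branches can be collapsed into a single call.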
Link: https://lkml.kernel.org/r/ba9ba5beea9a44b7196c41a0d9528abd5f20dd2e.1611620846.git.hubert.jasudowicz@gmail.com Signed-off-by: Hubert Jasudowicz Cc: Gao Xiang Cc: Micah Morton Cc: Michael Kelley Cc: "Peter Zijlstra (Intel)" Cc: Thomas Cedeno Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/groups.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/groups.c b/kernel/groups.c index fe7e6385530e..787b381c7c00 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -15,12 +15,7 @@ struct group_info *groups_alloc(int gidsetsize) { struct group_info *gi; - unsigned int len; - - len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; - gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); - if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT); + gi = kvmalloc(struct_size(gi, gid, gidsetsize), GFP_KERNEL_ACCOUNT); if (!gi) return NULL; From c034f48e99907d5be147ac8f0f3e630a9307c2be Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Feb 2021 17:21:10 -0800 Subject: [PATCH 090/118] kernel: delete repeated words in comments Drop repeated words in kernel/events/. {if, the, that, with, time} Drop repeated words in kernel/locking/. {it, no, the} Drop repeated words in kernel/sched/. {in, not} Link: https://lkml.kernel.org/r/20210127023412.26292-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Acked-by: Will Deacon [kernel/locking/] Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Will Deacon Cc: Mathieu Desnoyers Cc: "Paul E. McKenney" Cc: Juri Lelli Cc: Vincent Guittot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/core.c | 8 ++++---- kernel/events/uprobes.c | 2 +- kernel/locking/rtmutex.c | 4 ++-- kernel/locking/rwsem.c | 2 +- kernel/locking/semaphore.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/membarrier.c | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 129dee540a8b..0aeca5f3c0ac 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -269,7 +269,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da if (!event->parent) { /* * If this is a !child event, we must hold ctx::mutex to - * stabilize the the event->ctx relation. See + * stabilize the event->ctx relation. See * perf_event_ctx_lock(). */ lockdep_assert_held(&ctx->mutex); @@ -1303,7 +1303,7 @@ static void put_ctx(struct perf_event_context *ctx) * life-time rules separate them. That is an exiting task cannot fork, and a * spawning task cannot (yet) exit. * - * But remember that that these are parent<->child context relations, and + * But remember that these are parent<->child context relations, and * migration does not affect children, therefore these two orderings should not * interact. * @@ -1442,7 +1442,7 @@ static u64 primary_event_id(struct perf_event *event) /* * Get the perf_event_context for a task and lock it. * - * This has to cope with with the fact that until it is locked, + * This has to cope with the fact that until it is locked, * the context could get moved to another task. */ static struct perf_event_context * @@ -2486,7 +2486,7 @@ static void perf_set_shadow_time(struct perf_event *event, * But this is a bit hairy. * * So instead, we have an explicit cgroup call to remain - * within the time time source all along. We believe it + * within the time source all along. We believe it * is cleaner and simpler to understand. 
*/ if (is_cgroup_event(event)) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3ea7f8f92f1d..6addc9780319 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1733,7 +1733,7 @@ void uprobe_free_utask(struct task_struct *t) } /* - * Allocate a uprobe_task object for the task if if necessary. + * Allocate a uprobe_task object for the task if necessary. * Called when the thread hits a breakpoint. * * Returns: diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 03b21135313c..48fff6437901 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1420,7 +1420,7 @@ rt_mutex_fasttrylock(struct rt_mutex *lock, } /* - * Performs the wakeup of the the top-waiter and re-enables preemption. + * Performs the wakeup of the top-waiter and re-enables preemption. */ void rt_mutex_postunlock(struct wake_q_head *wake_q) { @@ -1819,7 +1819,7 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) * been started. * @waiter: the pre-initialized rt_mutex_waiter * - * Wait for the the lock acquisition started on our behalf by + * Wait for the lock acquisition started on our behalf by * rt_mutex_start_proxy_lock(). Upon failure, the caller must call * rt_mutex_cleanup_proxy_lock(). * diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index ba67600c7b2c..abba5df50006 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1048,7 +1048,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) /* * If there were already threads queued before us and: - * 1) there are no no active locks, wake the front + * 1) there are no active locks, wake the front * queued process(es) as the handoff bit might be set. * 2) there are no active writers and some readers, the lock * must be read owned; so we try to wake any read lock diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index d9dd94defc0a..9aa855a96c4a 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -119,7 +119,7 @@ EXPORT_SYMBOL(down_killable); * @sem: the semaphore to be acquired * * Try to acquire the semaphore atomically. Returns 0 if the semaphore has - * been acquired successfully or 1 if it it cannot be acquired. + * been acquired successfully or 1 if it cannot be acquired. * * NOTE: This return value is inverted from both spin_trylock and * mutex_trylock! Be careful about this when converting code. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8a8bd7b13634..794c2cb945f8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5126,7 +5126,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* * When a group wakes up we want to make sure that its quota is not already * expired/exceeded, otherwise it may be allowed to steal additional ticks of - * runtime as update_curr() throttling can not not trigger until it's on-rq. + * runtime as update_curr() throttling can not trigger until it's on-rq. */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 08ae45ad9261..acdae625c636 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -454,7 +454,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm) /* * For each cpu runqueue, if the task's mm match @mm, ensure that all - * @mm's membarrier state set bits are also set in in the runqueue's + * @mm's membarrier state set bits are also set in the runqueue's * membarrier state. 
This ensures that a runqueue scheduling * between threads which are users of @mm has its membarrier state * updated. From 7b4693e644cbdafdb2a2393fee8f81d85edd1b7d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 25 Feb 2021 17:21:14 -0800 Subject: [PATCH 091/118] MAINTAINERS: add uapi directories to API/ABI section Let's add include/uapi/ and arch/*/include/uapi/ to API/ABI section, so that for patches modifying them, get_maintainers.pl suggests CCing linux-api@ so people don't forget. Link: https://lkml.kernel.org/r/20210217174745.13591-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reported-by: David Hildenbrand Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index e42082eccf36..498cc779e354 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -261,6 +261,8 @@ ABI/API L: linux-api@vger.kernel.org F: include/linux/syscalls.h F: kernel/sys_ni.c +F: include/uapi/ +F: arch/*/include/uapi/ ABIT UGURU 1,2 HARDWARE MONITOR DRIVER M: Hans de Goede From 0e24465d3313832e82f8bd9ee2439da1367dd2e5 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Thu, 25 Feb 2021 17:21:17 -0800 Subject: [PATCH 092/118] lib/genalloc.c: change return type to unsigned long for bitmap_set_ll Just as bitmap_clear_ll(), change return type to unsigned long for bitmap_set_ll to avoid the possible overflow in future. Link: https://lkml.kernel.org/r/20210105031644.2771-1-sjhuang@iluvatar.ai Signed-off-by: Huang Shijie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/genalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/genalloc.c b/lib/genalloc.c index dab97bb69df6..5dcf9cdcbc46 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -81,7 +81,8 @@ static int clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear) * users set the same bit, one user will return remain bits, otherwise * return 0. */ -static int bitmap_set_ll(unsigned long *map, unsigned long start, unsigned long nr) +static unsigned long +bitmap_set_ll(unsigned long *map, unsigned long start, unsigned long nr) { unsigned long *p = map + BIT_WORD(start); const unsigned long size = start + nr; From a28a6e860c6cf231cf3c5171c75c342adcd00406 Mon Sep 17 00:00:00 2001 From: Francis Laniel Date: Thu, 25 Feb 2021 17:21:20 -0800 Subject: [PATCH 093/118] string.h: move fortified functions definitions in a dedicated header. This patch adds fortify-string.h to contain fortified functions definitions. Thus, the code is more separated and compile time is approximately 1% faster for people who do not set CONFIG_FORTIFY_SOURCE. 
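For background, a rough sketch of what these fortified wrappers do (illustrative only; src and len stand for arbitrary caller-provided values): each wrapper queries the destination's size with __builtin_object_size() and then fails either at compile time or at runtime:

    char buf[8];

    memset(buf, 0, 16);       /* constant oversized length: __write_overflow() -> build error */
    memcpy(buf, src, len);    /* length known only at runtime: fortify_panic() if len > sizeof(buf) */

Moving these definitions from string.h into the new fortify-string.h leaves that behaviour unchanged; per the changelog the split mainly slims down string.h, making builds roughly 1% faster when CONFIG_FORTIFY_SOURCE is not set.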
Link: https://lkml.kernel.org/r/20210111092141.22946-1-laniel_francis@privacyrequired.com Link: https://lkml.kernel.org/r/20210111092141.22946-2-laniel_francis@privacyrequired.com Signed-off-by: Francis Laniel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fortify-string.h | 302 +++++++++++++++++++++++++++++++++ include/linux/string.h | 282 +----------------------------- 2 files changed, 303 insertions(+), 281 deletions(-) create mode 100644 include/linux/fortify-string.h diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h new file mode 100644 index 000000000000..c1be37437e77 --- /dev/null +++ b/include/linux/fortify-string.h @@ -0,0 +1,302 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FORTIFY_STRING_H_ +#define _LINUX_FORTIFY_STRING_H_ + + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) +extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); +extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); +extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); +extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); +extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); +extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat); +extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy); +extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); +extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); +extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); +#else +#define __underlying_memchr __builtin_memchr +#define __underlying_memcmp __builtin_memcmp +#define __underlying_memcpy __builtin_memcpy +#define __underlying_memmove __builtin_memmove +#define __underlying_memset __builtin_memset +#define __underlying_strcat __builtin_strcat +#define __underlying_strcpy __builtin_strcpy +#define __underlying_strlen __builtin_strlen +#define __underlying_strncat __builtin_strncat +#define __underlying_strncpy __builtin_strncpy +#endif + +__FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 1); + + if (__builtin_constant_p(size) && p_size < size) + __write_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __underlying_strncpy(p, q, size); +} + +__FORTIFY_INLINE char *strcat(char *p, const char *q) +{ + size_t p_size = __builtin_object_size(p, 1); + + if (p_size == (size_t)-1) + return __underlying_strcat(p, q); + if (strlcat(p, q, p_size) >= p_size) + fortify_panic(__func__); + return p; +} + +__FORTIFY_INLINE __kernel_size_t strlen(const char *p) +{ + __kernel_size_t ret; + size_t p_size = __builtin_object_size(p, 1); + + /* Work around gcc excess stack consumption issue */ + if (p_size == (size_t)-1 || + (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0')) + return __underlying_strlen(p); + ret = strnlen(p, p_size); + if (p_size <= ret) + fortify_panic(__func__); + return ret; +} + +extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); +__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) +{ + size_t p_size = __builtin_object_size(p, 1); + __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? 
maxlen : p_size); + + if (p_size <= ret && maxlen != ret) + fortify_panic(__func__); + return ret; +} + +/* defined after fortified strlen to reuse it */ +extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); +__FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) +{ + size_t ret; + size_t p_size = __builtin_object_size(p, 1); + size_t q_size = __builtin_object_size(q, 1); + + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __real_strlcpy(p, q, size); + ret = strlen(q); + if (size) { + size_t len = (ret >= size) ? size - 1 : ret; + + if (__builtin_constant_p(len) && len >= p_size) + __write_overflow(); + if (len >= p_size) + fortify_panic(__func__); + __underlying_memcpy(p, q, len); + p[len] = '\0'; + } + return ret; +} + +/* defined after fortified strnlen to reuse it */ +extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy); +__FORTIFY_INLINE ssize_t strscpy(char *p, const char *q, size_t size) +{ + size_t len; + /* Use string size rather than possible enclosing struct size. */ + size_t p_size = __builtin_object_size(p, 1); + size_t q_size = __builtin_object_size(q, 1); + + /* If we cannot get size of p and q default to call strscpy. */ + if (p_size == (size_t) -1 && q_size == (size_t) -1) + return __real_strscpy(p, q, size); + + /* + * If size can be known at compile time and is greater than + * p_size, generate a compile time write overflow error. + */ + if (__builtin_constant_p(size) && size > p_size) + __write_overflow(); + + /* + * This call protects from read overflow, because len will default to q + * length if it smaller than size. + */ + len = strnlen(q, size); + /* + * If len equals size, we will copy only size bytes which leads to + * -E2BIG being returned. + * Otherwise we will copy len + 1 because of the final '\O'. + */ + len = len == size ? size : len + 1; + + /* + * Generate a runtime write overflow error if len is greater than + * p_size. + */ + if (len > p_size) + fortify_panic(__func__); + + /* + * We can now safely call vanilla strscpy because we are protected from: + * 1. Read overflow thanks to call to strnlen(). + * 2. Write overflow thanks to above ifs. 
+ */ + return __real_strscpy(p, q, len); +} + +/* defined after fortified strlen and strnlen to reuse them */ +__FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) +{ + size_t p_len, copy_len; + size_t p_size = __builtin_object_size(p, 1); + size_t q_size = __builtin_object_size(q, 1); + + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __underlying_strncat(p, q, count); + p_len = strlen(p); + copy_len = strnlen(q, count); + if (p_size < p_len + copy_len + 1) + fortify_panic(__func__); + __underlying_memcpy(p + p_len, q, copy_len); + p[p_len + copy_len] = '\0'; + return p; +} + +__FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + + if (__builtin_constant_p(size) && p_size < size) + __write_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __underlying_memset(p, c, size); +} + +__FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + + if (__builtin_constant_p(size)) { + if (p_size < size) + __write_overflow(); + if (q_size < size) + __read_overflow2(); + } + if (p_size < size || q_size < size) + fortify_panic(__func__); + return __underlying_memcpy(p, q, size); +} + +__FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + + if (__builtin_constant_p(size)) { + if (p_size < size) + __write_overflow(); + if (q_size < size) + __read_overflow2(); + } + if (p_size < size || q_size < size) + fortify_panic(__func__); + return __underlying_memmove(p, q, size); +} + +extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); +__FORTIFY_INLINE void *memscan(void *p, int c, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + + if (__builtin_constant_p(size) && p_size < size) + __read_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __real_memscan(p, c, size); +} + +__FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + + if (__builtin_constant_p(size)) { + if (p_size < size) + __read_overflow(); + if (q_size < size) + __read_overflow2(); + } + if (p_size < size || q_size < size) + fortify_panic(__func__); + return __underlying_memcmp(p, q, size); +} + +__FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + + if (__builtin_constant_p(size) && p_size < size) + __read_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __underlying_memchr(p, c, size); +} + +void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); +__FORTIFY_INLINE void *memchr_inv(const void *p, int c, size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + + if (__builtin_constant_p(size) && p_size < size) + __read_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __real_memchr_inv(p, c, size); +} + +extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup); +__FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) +{ + size_t p_size = __builtin_object_size(p, 0); + + if (__builtin_constant_p(size) && p_size < size) + __read_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __real_kmemdup(p, size, gfp); +} + 
+/* defined after fortified strlen and memcpy to reuse them */ +__FORTIFY_INLINE char *strcpy(char *p, const char *q) +{ + size_t p_size = __builtin_object_size(p, 1); + size_t q_size = __builtin_object_size(q, 1); + size_t size; + + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __underlying_strcpy(p, q); + size = strlen(q) + 1; + /* test here to use the more stringent object size */ + if (p_size < size) + fortify_panic(__func__); + memcpy(p, q, size); + return p; +} + +/* Don't use these outside the FORITFY_SOURCE implementation */ +#undef __underlying_memchr +#undef __underlying_memcmp +#undef __underlying_memcpy +#undef __underlying_memmove +#undef __underlying_memset +#undef __underlying_strcat +#undef __underlying_strcpy +#undef __underlying_strlen +#undef __underlying_strncat +#undef __underlying_strncpy + +#endif /* _LINUX_FORTIFY_STRING_H_ */ diff --git a/include/linux/string.h b/include/linux/string.h index 4fcfb56abcf5..9521d8cab18e 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -266,287 +266,7 @@ void __read_overflow3(void) __compiletime_error("detected read beyond size of ob void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) - -#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) -extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); -extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); -extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); -extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); -extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); -extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat); -extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy); -extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); -extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); -extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); -#else -#define __underlying_memchr __builtin_memchr -#define __underlying_memcmp __builtin_memcmp -#define __underlying_memcpy __builtin_memcpy -#define __underlying_memmove __builtin_memmove -#define __underlying_memset __builtin_memset -#define __underlying_strcat __builtin_strcat -#define __underlying_strcpy __builtin_strcpy -#define __underlying_strlen __builtin_strlen -#define __underlying_strncat __builtin_strncat -#define __underlying_strncpy __builtin_strncpy -#endif - -__FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) -{ - size_t p_size = __builtin_object_size(p, 1); - if (__builtin_constant_p(size) && p_size < size) - __write_overflow(); - if (p_size < size) - fortify_panic(__func__); - return __underlying_strncpy(p, q, size); -} - -__FORTIFY_INLINE char *strcat(char *p, const char *q) -{ - size_t p_size = __builtin_object_size(p, 1); - if (p_size == (size_t)-1) - return __underlying_strcat(p, q); - if (strlcat(p, q, p_size) >= p_size) - fortify_panic(__func__); - return p; -} - -__FORTIFY_INLINE __kernel_size_t strlen(const char *p) -{ - __kernel_size_t ret; - size_t p_size = __builtin_object_size(p, 1); - - /* Work around gcc excess stack consumption issue */ - if (p_size 
== (size_t)-1 || - (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0')) - return __underlying_strlen(p); - ret = strnlen(p, p_size); - if (p_size <= ret) - fortify_panic(__func__); - return ret; -} - -extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); -__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) -{ - size_t p_size = __builtin_object_size(p, 1); - __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); - if (p_size <= ret && maxlen != ret) - fortify_panic(__func__); - return ret; -} - -/* defined after fortified strlen to reuse it */ -extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); -__FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) -{ - size_t ret; - size_t p_size = __builtin_object_size(p, 1); - size_t q_size = __builtin_object_size(q, 1); - if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __real_strlcpy(p, q, size); - ret = strlen(q); - if (size) { - size_t len = (ret >= size) ? size - 1 : ret; - if (__builtin_constant_p(len) && len >= p_size) - __write_overflow(); - if (len >= p_size) - fortify_panic(__func__); - __underlying_memcpy(p, q, len); - p[len] = '\0'; - } - return ret; -} - -/* defined after fortified strnlen to reuse it */ -extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy); -__FORTIFY_INLINE ssize_t strscpy(char *p, const char *q, size_t size) -{ - size_t len; - /* Use string size rather than possible enclosing struct size. */ - size_t p_size = __builtin_object_size(p, 1); - size_t q_size = __builtin_object_size(q, 1); - - /* If we cannot get size of p and q default to call strscpy. */ - if (p_size == (size_t) -1 && q_size == (size_t) -1) - return __real_strscpy(p, q, size); - - /* - * If size can be known at compile time and is greater than - * p_size, generate a compile time write overflow error. - */ - if (__builtin_constant_p(size) && size > p_size) - __write_overflow(); - - /* - * This call protects from read overflow, because len will default to q - * length if it smaller than size. - */ - len = strnlen(q, size); - /* - * If len equals size, we will copy only size bytes which leads to - * -E2BIG being returned. - * Otherwise we will copy len + 1 because of the final '\O'. - */ - len = len == size ? size : len + 1; - - /* - * Generate a runtime write overflow error if len is greater than - * p_size. - */ - if (len > p_size) - fortify_panic(__func__); - - /* - * We can now safely call vanilla strscpy because we are protected from: - * 1. Read overflow thanks to call to strnlen(). - * 2. Write overflow thanks to above ifs. 
- */ - return __real_strscpy(p, q, len); -} - -/* defined after fortified strlen and strnlen to reuse them */ -__FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) -{ - size_t p_len, copy_len; - size_t p_size = __builtin_object_size(p, 1); - size_t q_size = __builtin_object_size(q, 1); - if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __underlying_strncat(p, q, count); - p_len = strlen(p); - copy_len = strnlen(q, count); - if (p_size < p_len + copy_len + 1) - fortify_panic(__func__); - __underlying_memcpy(p + p_len, q, copy_len); - p[p_len + copy_len] = '\0'; - return p; -} - -__FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size) -{ - size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) - __write_overflow(); - if (p_size < size) - fortify_panic(__func__); - return __underlying_memset(p, c, size); -} - -__FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size) -{ - size_t p_size = __builtin_object_size(p, 0); - size_t q_size = __builtin_object_size(q, 0); - if (__builtin_constant_p(size)) { - if (p_size < size) - __write_overflow(); - if (q_size < size) - __read_overflow2(); - } - if (p_size < size || q_size < size) - fortify_panic(__func__); - return __underlying_memcpy(p, q, size); -} - -__FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size) -{ - size_t p_size = __builtin_object_size(p, 0); - size_t q_size = __builtin_object_size(q, 0); - if (__builtin_constant_p(size)) { - if (p_size < size) - __write_overflow(); - if (q_size < size) - __read_overflow2(); - } - if (p_size < size || q_size < size) - fortify_panic(__func__); - return __underlying_memmove(p, q, size); -} - -extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); -__FORTIFY_INLINE void *memscan(void *p, int c, __kernel_size_t size) -{ - size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) - __read_overflow(); - if (p_size < size) - fortify_panic(__func__); - return __real_memscan(p, c, size); -} - -__FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size) -{ - size_t p_size = __builtin_object_size(p, 0); - size_t q_size = __builtin_object_size(q, 0); - if (__builtin_constant_p(size)) { - if (p_size < size) - __read_overflow(); - if (q_size < size) - __read_overflow2(); - } - if (p_size < size || q_size < size) - fortify_panic(__func__); - return __underlying_memcmp(p, q, size); -} - -__FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size) -{ - size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) - __read_overflow(); - if (p_size < size) - fortify_panic(__func__); - return __underlying_memchr(p, c, size); -} - -void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); -__FORTIFY_INLINE void *memchr_inv(const void *p, int c, size_t size) -{ - size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) - __read_overflow(); - if (p_size < size) - fortify_panic(__func__); - return __real_memchr_inv(p, c, size); -} - -extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup); -__FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) -{ - size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) - __read_overflow(); - if (p_size < size) - fortify_panic(__func__); - return __real_kmemdup(p, size, gfp); -} - -/* defined after 
fortified strlen and memcpy to reuse them */ -__FORTIFY_INLINE char *strcpy(char *p, const char *q) -{ - size_t p_size = __builtin_object_size(p, 1); - size_t q_size = __builtin_object_size(q, 1); - size_t size; - if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __underlying_strcpy(p, q); - size = strlen(q) + 1; - /* test here to use the more stringent object size */ - if (p_size < size) - fortify_panic(__func__); - memcpy(p, q, size); - return p; -} - -/* Don't use these outside the FORITFY_SOURCE implementation */ -#undef __underlying_memchr -#undef __underlying_memcmp -#undef __underlying_memcpy -#undef __underlying_memmove -#undef __underlying_memset -#undef __underlying_strcat -#undef __underlying_strcpy -#undef __underlying_strlen -#undef __underlying_strncat -#undef __underlying_strncpy +#include #endif /** From d262093656a0eec6d6114a3178a9d887fddd0ded Mon Sep 17 00:00:00 2001 From: Yogesh Lal Date: Thu, 25 Feb 2021 17:21:24 -0800 Subject: [PATCH 094/118] lib: stackdepot: add support to configure STACK_HASH_SIZE Use CONFIG_STACK_HASH_ORDER to configure STACK_HASH_SIZE. The aim is to make the value of STACK_HASH_SIZE configurable, so that it can be tuned per use case. One example is page owner: CONFIG_PAGE_OWNER only takes effect if page_owner=on is passed on the kernel command line of a CONFIG_PAGE_OWNER-configured system. Thus, unless the admin enables it via that command line option, the stackdepot just wastes 8M of memory without any consumer. Making it configurable and using a lower value helps to enable features like CONFIG_PAGE_OWNER without any significant overhead. Link: https://lkml.kernel.org/r/1611749198-24316-1-git-send-email-vjitta@codeaurora.org Signed-off-by: Yogesh Lal Signed-off-by: Vinayak Menon Signed-off-by: Vijayanand Jitta Reviewed-by: Minchan Kim Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig | 9 +++++++++ lib/stackdepot.c | 3 +-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index 46806332a8cc..a38cc61256f1 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -651,6 +651,15 @@ config STACKDEPOT bool select STACKTRACE +config STACK_HASH_ORDER + int "stack depot hash size (12 => 4KB, 20 => 1024KB)" + range 12 20 + default 20 + depends on STACKDEPOT + help + Select the hash size as a power of 2 for the stackdepot hash table. + Choose a lower value to reduce the memory impact. + config SBITMAP bool diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 890dcc2e984e..4b9715470e87 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -141,8 +141,7 @@ static struct stack_record *depot_alloc_stack(unsigned long *entries, int size, return stack; } -#define STACK_HASH_ORDER 20 -#define STACK_HASH_SIZE (1L << STACK_HASH_ORDER) +#define STACK_HASH_SIZE (1L << CONFIG_STACK_HASH_ORDER) #define STACK_HASH_MASK (STACK_HASH_SIZE - 1) #define STACK_HASH_SEED 0x9747b28c From e1fdc403349c64fa58f4c163f4bf9b860b4db808 Mon Sep 17 00:00:00 2001 From: Vijayanand Jitta Date: Thu, 25 Feb 2021 17:21:27 -0800 Subject: [PATCH 095/118] lib: stackdepot: add support to disable stack depot Add a kernel parameter, stack_depot_disable, to disable stack depot, so that the stack hash table doesn't consume any memory when stack depot is disabled. The use case is CONFIG_PAGE_OWNER without page_owner=on. Without this patch, stackdepot will consume the memory for the hashtable; by default that is 8M, which is never trivial.
With this option, in CONFIG_PAGE_OWNER configured system, page_owner=off, stack_depot_disable in kernel command line, we could save the wasted memory for the hashtable. [akpm@linux-foundation.org: fix CONFIG_STACKDEPOT=n build] Link: https://lkml.kernel.org/r/1611749198-24316-2-git-send-email-vjitta@codeaurora.org Signed-off-by: Vinayak Menon Signed-off-by: Vijayanand Jitta Cc: Alexander Potapenko Cc: Minchan Kim Cc: Yogesh Lal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/kernel-parameters.txt | 6 ++++ include/linux/stackdepot.h | 9 ++++++ init/main.c | 2 ++ lib/stackdepot.c | 32 ++++++++++++++++--- 4 files changed, 45 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bab6a8b01202..04545725f187 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5182,6 +5182,12 @@ growing up) the main stack are reserved for no other mapping. Default value is 256 pages. + stack_depot_disable= [KNL] + Setting this to true through kernel command line will + disable the stack depot thereby saving the static memory + consumed by the stack hash table. By default this is set + to false. + stacktrace [FTRACE] Enabled the stack tracer on boot up. diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 24d49c732341..6bb4bc1a5f54 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -21,4 +21,13 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries); +#ifdef CONFIG_STACKDEPOT +int stack_depot_init(void); +#else +static inline int stack_depot_init(void) +{ + return 0; +} +#endif /* CONFIG_STACKDEPOT */ + #endif diff --git a/init/main.c b/init/main.c index 261051070e3c..3648c9f94882 100644 --- a/init/main.c +++ b/init/main.c @@ -97,6 +97,7 @@ #include #include #include +#include #include #include @@ -827,6 +828,7 @@ static void __init mm_init(void) init_mem_debugging_and_hardening(); kfence_alloc_pool(); report_meminit(); + stack_depot_init(); mem_init(); /* page_owner must be initialized after buddy is ready */ page_ext_init_flatmem_late(); diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 4b9715470e87..cc21116512a7 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -31,6 +31,7 @@ #include #include #include +#include #define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8) @@ -145,9 +146,32 @@ static struct stack_record *depot_alloc_stack(unsigned long *entries, int size, #define STACK_HASH_MASK (STACK_HASH_SIZE - 1) #define STACK_HASH_SEED 0x9747b28c -static struct stack_record *stack_table[STACK_HASH_SIZE] = { - [0 ... 
STACK_HASH_SIZE - 1] = NULL -}; +static bool stack_depot_disable; +static struct stack_record **stack_table; + +static int __init is_stack_depot_disabled(char *str) +{ + kstrtobool(str, &stack_depot_disable); + if (stack_depot_disable) { + pr_info("Stack Depot is disabled\n"); + stack_table = NULL; + } + return 0; +} +early_param("stack_depot_disable", is_stack_depot_disabled); + +int __init stack_depot_init(void) +{ + if (!stack_depot_disable) { + size_t size = (STACK_HASH_SIZE * sizeof(struct stack_record *)); + int i; + + stack_table = memblock_alloc(size, size); + for (i = 0; i < STACK_HASH_SIZE; i++) + stack_table[i] = NULL; + } + return 0; +} /* Calculate hash for a stack */ static inline u32 hash_stack(unsigned long *entries, unsigned int size) @@ -241,7 +265,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned long flags; u32 hash; - if (unlikely(nr_entries == 0)) + if (unlikely(nr_entries == 0) || stack_depot_disable) goto fast_exit; hash = hash_stack(entries, nr_entries); From 64427985c76fcb54c783de617edf353009499a03 Mon Sep 17 00:00:00 2001 From: Vijayanand Jitta Date: Thu, 25 Feb 2021 17:21:31 -0800 Subject: [PATCH 096/118] lib: stackdepot: fix ignoring return value warning Fix the below ignoring return value warning for kstrtobool in is_stack_depot_disabled function. lib/stackdepot.c: In function 'is_stack_depot_disabled': lib/stackdepot.c:154:2: warning: ignoring return value of 'kstrtobool' declared with attribute 'warn_unused_result' [-Wunused-result] Link: https://lkml.kernel.org/r/1612163048-28026-1-git-send-email-vjitta@codeaurora.org Fixes: b9779abb09a8 ("lib: stackdepot: add support to disable stack depot") Signed-off-by: Vijayanand Jitta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/stackdepot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index cc21116512a7..49f67a0c6e5d 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -151,8 +151,10 @@ static struct stack_record **stack_table; static int __init is_stack_depot_disabled(char *str) { - kstrtobool(str, &stack_depot_disable); - if (stack_depot_disable) { + int ret; + + ret = kstrtobool(str, &stack_depot_disable); + if (!ret && stack_depot_disable) { pr_info("Stack Depot is disabled\n"); stack_table = NULL; } From 96251a75e0097639a6df558e4e62f762100f03d3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 25 Feb 2021 17:21:34 -0800 Subject: [PATCH 097/118] lib/cmdline: remove an unneeded local variable in next_arg() The local variable 'next' is unneeded because you can simply advance the existing pointer 'args'. Link: https://lkml.kernel.org/r/20210201014707.3828753-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/cmdline.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/cmdline.c b/lib/cmdline.c index dfd4c4423f9a..5d474c626e24 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -228,7 +228,6 @@ char *next_arg(char *args, char **param, char **val) { unsigned int i, equals = 0; int in_quote = 0, quoted = 0; - char *next; if (*args == '"') { args++; @@ -266,10 +265,10 @@ char *next_arg(char *args, char **param, char **val) if (args[i]) { args[i] = '\0'; - next = args + i + 1; + args += i + 1; } else - next = args + i; + args += i; /* Chew up trailing spaces. 
*/ - return skip_spaces(next); + return skip_spaces(args); } From 4945cca232ce8bc699b8743f2436af664c471b96 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 25 Feb 2021 17:21:37 -0800 Subject: [PATCH 098/118] include/linux/bitops.h: spelling s/synomyn/synonym/ Fix a misspelling of "synonym". Link: https://lkml.kernel.org/r/20210108105305.2028120-1-geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index a61f192c096b..a5a48303b0f1 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -214,7 +214,7 @@ static inline int get_count_order_long(unsigned long l) * __ffs64 - find first set bit in a 64 bit word * @word: The 64 bit word * - * On 64 bit arches this is a synomyn for __ffs + * On 64 bit arches this is a synonym for __ffs * The result is not defined if no bits are set, so check that @word * is non-zero before calling this. */ From b5e8736a954aecd33adf276a2680dc24a36a2420 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 25 Feb 2021 17:21:40 -0800 Subject: [PATCH 099/118] checkpatch: improve blank line after declaration test Avoid multiple false positives by ignoring attributes. Various attributes like volatile and ____cacheline_aligned_in_smp cause checkpatch to emit invalid "Missing a blank line after declarations" messages. Use copies of $sline and $prevline, remove $Attribute and $Sparse, and use the existing tests to avoid these false positives. Miscellanea: o Add volatile to $Attribute This also reduces checkpatch runtime a bit by moving the indentation comparison test to the start of the block to avoid multiple unnecessary regex tests. 
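To illustrate the class of false positive being fixed (a hypothetical snippet, not taken from the patch), an attribute such as volatile or ____cacheline_aligned_in_smp could keep checkpatch from recognising the second line below as a declaration, so it warned about a missing blank line between two lines that are both declarations:

    static long foo(void)
    {
            int users = 0;
            volatile long state = 1;    /* previously drew "Missing a blank line after declarations" */

            return state + users;
    }

Stripping $Attribute and $Sparse from the temporary copies of $sline and $prevline before pattern matching lets such lines be classified as declarations, silencing the bogus warning.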
Link: https://lkml.kernel.org/r/9015fd00742bf4e5b824ad6d7fd7189530958548.camel@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 52 ++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index ca4201753d5e..bc8069503819 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -382,6 +382,7 @@ our $InitAttribute = qr{$InitAttributeData|$InitAttributeConst|$InitAttributeIni # We need \b after 'init' otherwise 'initconst' will cause a false positive in a check our $Attribute = qr{ const| + volatile| __percpu| __nocast| __safe| @@ -3776,43 +3777,48 @@ sub process { } # check for missing blank lines after declarations - if ($sline =~ /^\+\s+\S/ && #Not at char 1 - # actual declarations - ($prevline =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || +# (declarations must have the same indentation and not be at the start of line) + if (($prevline =~ /\+(\s+)\S/) && $sline =~ /^\+$1\S/) { + # use temporaries + my $sl = $sline; + my $pl = $prevline; + # remove $Attribute/$Sparse uses to simplify comparisons + $sl =~ s/\b(?:$Attribute|$Sparse)\b//g; + $pl =~ s/\b(?:$Attribute|$Sparse)\b//g; + if (($pl =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || # function pointer declarations - $prevline =~ /^\+\s+$Declare\s*\(\s*\*\s*$Ident\s*\)\s*[=,;:\[\(]/ || + $pl =~ /^\+\s+$Declare\s*\(\s*\*\s*$Ident\s*\)\s*[=,;:\[\(]/ || # foo bar; where foo is some local typedef or #define - $prevline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || + $pl =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || # known declaration macros - $prevline =~ /^\+\s+$declaration_macros/) && + $pl =~ /^\+\s+$declaration_macros/) && # for "else if" which can look like "$Ident $Ident" - !($prevline =~ /^\+\s+$c90_Keywords\b/ || + !($pl =~ /^\+\s+$c90_Keywords\b/ || # other possible extensions of declaration lines - $prevline =~ /(?:$Compare|$Assignment|$Operators)\s*$/ || + $pl =~ /(?:$Compare|$Assignment|$Operators)\s*$/ || # not starting a section or a macro "\" extended line - $prevline =~ /(?:\{\s*|\\)$/) && + $pl =~ /(?:\{\s*|\\)$/) && # looks like a declaration - !($sline =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || + !($sl =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || # function pointer declarations - $sline =~ /^\+\s+$Declare\s*\(\s*\*\s*$Ident\s*\)\s*[=,;:\[\(]/ || + $sl =~ /^\+\s+$Declare\s*\(\s*\*\s*$Ident\s*\)\s*[=,;:\[\(]/ || # foo bar; where foo is some local typedef or #define - $sline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || + $sl =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || # known declaration macros - $sline =~ /^\+\s+$declaration_macros/ || + $sl =~ /^\+\s+$declaration_macros/ || # start of struct or union or enum - $sline =~ /^\+\s+(?:static\s+)?(?:const\s+)?(?:union|struct|enum|typedef)\b/ || + $sl =~ /^\+\s+(?:static\s+)?(?:const\s+)?(?:union|struct|enum|typedef)\b/ || # start or end of block or continuation of declaration - $sline =~ /^\+\s+(?:$|[\{\}\.\#\"\?\:\(\[])/ || + $sl =~ /^\+\s+(?:$|[\{\}\.\#\"\?\:\(\[])/ || # bitfield continuation - $sline =~ /^\+\s+$Ident\s*:\s*\d+\s*[,;]/ || + $sl =~ /^\+\s+$Ident\s*:\s*\d+\s*[,;]/ || # other possible extensions of declaration lines - $sline =~ /^\+\s+\(?\s*(?:$Compare|$Assignment|$Operators)/) && - # indentation of previous and current line are the same - (($prevline =~ /\+(\s+)\S/) && $sline =~ /^\+$1\S/)) { - if (WARN("LINE_SPACING", - "Missing a blank line after declarations\n" 
. $hereprev) && - $fix) { - fix_insert_line($fixlinenr, "\+"); + $sl =~ /^\+\s+\(?\s*(?:$Compare|$Assignment|$Operators)/)) { + if (WARN("LINE_SPACING", + "Missing a blank line after declarations\n" . $hereprev) && + $fix) { + fix_insert_line($fixlinenr, "\+"); + } } } From 35cdcbfc5cfc30012b790d9b077bd949ad46f1dd Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Thu, 25 Feb 2021 17:21:44 -0800 Subject: [PATCH 100/118] checkpatch: ignore warning designated initializers using NR_CPUS Some max_length wants to hold as large room as possible to ensure enough size to tackle with the biggest NR_CPUS. An example below: kernel/cgroup/cpuset.c: static struct cftype legacy_files[] = { { .name = "cpus", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, }, ... } Link: https://lkml.kernel.org/r/5d4998aa8a8ac7efada2c7daffa9e73559f8b186.1609331255.git.rocking@linux.alibaba.com Signed-off-by: Peng Wang Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index bc8069503819..7ba8fbbf9f1b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -7025,12 +7025,14 @@ sub process { # use of NR_CPUS is usually wrong # ignore definitions of NR_CPUS and usage to define arrays as likely right +# ignore designated initializers using NR_CPUS if ($line =~ /\bNR_CPUS\b/ && $line !~ /^.\s*\s*#\s*if\b.*\bNR_CPUS\b/ && $line !~ /^.\s*\s*#\s*define\b.*\bNR_CPUS\b/ && $line !~ /^.\s*$Declare\s.*\[[^\]]*NR_CPUS[^\]]*\]/ && $line !~ /\[[^\]]*\.\.\.[^\]]*NR_CPUS[^\]]*\]/ && - $line !~ /\[[^\]]*NR_CPUS[^\]]*\.\.\.[^\]]*\]/) + $line !~ /\[[^\]]*NR_CPUS[^\]]*\.\.\.[^\]]*\]/ && + $line !~ /^.\s*\.\w+\s*=\s*.*\bNR_CPUS\b/) { WARN("NR_CPUS", "usage of NR_CPUS is often wrong - consider using cpu_possible(), num_possible_cpus(), for_each_possible_cpu(), etc\n" . $herecurr); From ea7dbab3e5054db7c013579096cfe7b0f10d1d65 Mon Sep 17 00:00:00 2001 From: Dwaipayan Ray Date: Thu, 25 Feb 2021 17:21:47 -0800 Subject: [PATCH 101/118] checkpatch: trivial style fixes Indentations should use tabs wherever possible. Replace spaces by tabs for indents. Link: https://lkml.kernel.org/r/20210105103044.40282-1-dwaipayanray1@gmail.com Signed-off-by: Dwaipayan Ray Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 7ba8fbbf9f1b..345879a305be 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2974,7 +2974,7 @@ sub process { } if (!defined $lines[$linenr]) { WARN("BAD_SIGN_OFF", - "Co-developed-by: must be immediately followed by Signed-off-by:\n" . "$here\n" . $rawline); + "Co-developed-by: must be immediately followed by Signed-off-by:\n" . "$here\n" . $rawline); } elsif ($rawlines[$linenr] !~ /^\s*signed-off-by:\s*(.*)/i) { WARN("BAD_SIGN_OFF", "Co-developed-by: must be immediately followed by Signed-off-by:\n" . "$here\n" . $rawline . "\n" .$rawlines[$linenr]); @@ -2997,8 +2997,8 @@ sub process { if (ERROR("GERRIT_CHANGE_ID", "Remove Gerrit Change-Id's before submitting upstream\n" . 
$herecurr) && $fix) { - fix_delete_line($fixlinenr, $rawline); - } + fix_delete_line($fixlinenr, $rawline); + } } # Check if the commit log is in a possible stack dump @@ -3240,10 +3240,10 @@ sub process { next if ($start_char =~ /^\S$/); next if (index(" \t.,;?!", $end_char) == -1); - # avoid repeating hex occurrences like 'ff ff fe 09 ...' - if ($first =~ /\b[0-9a-f]{2,}\b/i) { - next if (!exists($allow_repeated_words{lc($first)})); - } + # avoid repeating hex occurrences like 'ff ff fe 09 ...' + if ($first =~ /\b[0-9a-f]{2,}\b/i) { + next if (!exists($allow_repeated_words{lc($first)})); + } if (WARN("REPEATED_WORD", "Possible repeated word: '$first'\n" . $herecurr) && @@ -4423,7 +4423,7 @@ sub process { WARN("STATIC_CONST_CHAR_ARRAY", "char * array declaration might be better as static const\n" . $herecurr); - } + } # check for sizeof(foo)/sizeof(foo[0]) that could be ARRAY_SIZE(foo) if ($line =~ m@\bsizeof\s*\(\s*($Lval)\s*\)@) { @@ -5276,7 +5276,7 @@ sub process { $lines[$linenr - 3] !~ /^[ +]\s*$Ident\s*:/) { WARN("RETURN_VOID", "void function return statements are not generally useful\n" . $hereprev); - } + } # if statements using unnecessary parentheses - ie: if ((foo == bar)) if ($perl_version_ok && From adb2da82fcf99b6006fbaf3e3cd12649365fc967 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 25 Feb 2021 17:21:50 -0800 Subject: [PATCH 102/118] checkpatch: prefer ftrace over function entry/exit printks Prefer using ftrace over function entry/exit logging messages. Warn with various function entry/exit only logging that only use __func__ with or without descriptive decoration. Link: https://lkml.kernel.org/r/47c01081533a417c99c9a80a4cd537f8c308503f.camel@perches.com Signed-off-by: Joe Perches Cc: Dan Carpenter Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 345879a305be..736129c21c16 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -507,6 +507,30 @@ our $signature_tags = qr{(?xi: Cc: )}; +our $tracing_logging_tags = qr{(?xi: + [=-]*> | + <[=-]* | + \[ | + \] | + start | + called | + entered | + entry | + enter | + in | + inside | + here | + begin | + exit | + end | + done | + leave | + completed | + out | + return | + [\.\!:\s]* +)}; + sub edit_distance_min { my (@arr) = @_; my $len = scalar @arr; @@ -5972,6 +5996,17 @@ sub process { "Prefer using '\"%s...\", __func__' to using '$context_function', this function's name, in a string\n" . $herecurr); } +# check for unnecessary function tracing like uses +# This does not use $logFunctions because there are many instances like +# 'dprintk(FOO, "%s()\n", __func__);' which do not match $logFunctions + if ($rawline =~ /^\+.*\([^"]*"$tracing_logging_tags{0,3}%s(?:\s*\(\s*\)\s*)?$tracing_logging_tags{0,3}(?:\\n)?"\s*,\s*__func__\s*\)\s*;/) { + if (WARN("TRACING_LOGGING", + "Unnecessary ftrace-like logging - prefer using ftrace\n" . 
$herecurr) && + $fix) { + fix_delete_line($fixlinenr, $rawline); + } + } + # check for spaces before a quoted newline if ($rawline =~ /^.*\".*\s\\n/) { if (WARN("QUOTED_WHITESPACE_BEFORE_NEWLINE", From 0972b8bfe0de8c0f05796aceb8f2428b0efb20cd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 25 Feb 2021 17:21:54 -0800 Subject: [PATCH 103/118] checkpatch: improve TYPECAST_INT_CONSTANT test message Improve the TYPECAST_INT_CONSTANT test by showing the suggested conversion for various type of uses like (unsigned int)1 to 1U. Link: https://lkml.kernel.org/r/ecefe8dcb93fe7028311b69dd297ba52224233d4.camel@perches.com Signed-off-by: Joe Perches Cc: Douglas Gilbert Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 736129c21c16..a04df2657d49 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6518,18 +6518,18 @@ sub process { if ($line =~ /(\(\s*$C90_int_types\s*\)\s*)($Constant)\b/) { my $cast = $1; my $const = $2; + my $suffix = ""; + my $newconst = $const; + $newconst =~ s/${Int_type}$//; + $suffix .= 'U' if ($cast =~ /\bunsigned\b/); + if ($cast =~ /\blong\s+long\b/) { + $suffix .= 'LL'; + } elsif ($cast =~ /\blong\b/) { + $suffix .= 'L'; + } if (WARN("TYPECAST_INT_CONSTANT", - "Unnecessary typecast of c90 int constant\n" . $herecurr) && + "Unnecessary typecast of c90 int constant - '$cast$const' could be '$const$suffix'\n" . $herecurr) && $fix) { - my $suffix = ""; - my $newconst = $const; - $newconst =~ s/${Int_type}$//; - $suffix .= 'U' if ($cast =~ /\bunsigned\b/); - if ($cast =~ /\blong\s+long\b/) { - $suffix .= 'LL'; - } elsif ($cast =~ /\blong\b/) { - $suffix .= 'L'; - } $fixed[$fixlinenr] =~ s/\Q$cast\E$const\b/$newconst$suffix/; } } From de93245c00a44578ae73964b7e36607d04fed5b3 Mon Sep 17 00:00:00 2001 From: Aditya Srivastava Date: Thu, 25 Feb 2021 17:21:57 -0800 Subject: [PATCH 104/118] checkpatch: add warning for avoiding .L prefix symbols in assembly files objtool requires that all code must be contained in an ELF symbol. Symbol names that have a '.L' prefix do not emit symbol table entries, as they have special meaning for the assembler. '.L' prefixed symbols can be used within a code region, but should be avoided for denoting a range of code via 'SYM_*_START/END' annotations. Add a new check to emit a warning on finding the usage of '.L' symbols for '.S' files, if it denotes range of code via SYM_*_START/END annotation pair. 
Link: https://lkml.kernel.org/r/20210123190459.9701-1-yashsri421@gmail.com Link: https://lore.kernel.org/lkml/20210112210154.GI4646@sirena.org.uk Signed-off-by: Aditya Srivastava Suggested-by: Mark Brown Acked-by: Joe Perches Acked-by: Nick Desaulniers Cc: Aditya Srivastava Cc: Lukas Bulwahn Cc: Dwaipayan Ray Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index a04df2657d49..d8793bdbc492 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3599,6 +3599,13 @@ sub process { } } +# check for .L prefix local symbols in .S files + if ($realfile =~ /\.S$/ && + $line =~ /^\+\s*(?:[A-Z]+_)?SYM_[A-Z]+_(?:START|END)(?:_[A-Z_]+)?\s*\(\s*\.L/) { + WARN("AVOID_L_PREFIX", + "Avoid using '.L' prefixed local symbol names for denoting a range of code via 'SYM_*_START/END' annotations; see Documentation/asm-annotations.rst\n" . $herecurr); + } + # check we are in a valid source file C or perl if not then ignore this hunk next if ($realfile !~ /\.(h|c|pl|dtsi|dts)$/); From 58f02267f04a79a5ef13dfbcf30f5ae080389f87 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 25 Feb 2021 17:22:01 -0800 Subject: [PATCH 105/118] checkpatch: add kmalloc_array_node to unnecessary OOM message check commit 5799b255c491 ("include/linux/slab.h: add kmalloc_array_node() and kcalloc_node()") was added in 2017. Update the unnecessary OOM message test to include it. Link: https://lkml.kernel.org/r/b9dc4a808b1518e08ab8761480d9872e5d18e7cd.camel@perches.com Signed-off-by: Joe Perches Reported-by: Jakub Kicinski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index d8793bdbc492..3cdc0703e00f 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -487,7 +487,7 @@ our $logFunctions = qr{(?x: our $allocFunctions = qr{(?x: (?:(?:devm_)? - (?:kv|k|v)[czm]alloc(?:_node|_array)? | + (?:kv|k|v)[czm]alloc(?:_array)?(?:_node)? | kstrdup(?:_const)? | kmemdup(?:_nul)?) | (?:\w+)?alloc_skb(?:_ip_align)? | From 263afd39c06f5939ef943e0d535380d4b8e56484 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Thu, 25 Feb 2021 17:22:04 -0800 Subject: [PATCH 106/118] checkpatch: don't warn about colon termination in linker scripts This check erroneously flags cases like the one in my recent printk enumeration patch[0], where the spaces are syntactic, and `section:' vs. `section :' is syntactically important: ERROR: space prohibited before that ':' (ctx:WxW) #258: FILE: include/asm-generic/vmlinux.lds.h:314: + .printk_fmts : AT(ADDR(.printk_fmts) - LOAD_OFFSET) { 0: https://lore.kernel.org/patchwork/patch/1375749/ Link: https://lkml.kernel.org/r/YBwhqsc2TIVeid3t@chrisdown.name Link: https://lkml.kernel.org/r/YB6UsjCOy1qrrlSD@chrisdown.name Signed-off-by: Chris Down Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3cdc0703e00f..75c93316547b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5044,7 +5044,7 @@ sub process { # A colon needs no spaces before when it is # terminating a case value or a label. 
} elsif ($opv eq ':C' || $opv eq ':L') { - if ($ctx =~ /Wx./) { + if ($ctx =~ /Wx./ and $realfile !~ m@.*\.lds\.h$@) { if (ERROR("SPACING", "space prohibited before that '$op' $at\n" . $hereptr)) { $good = rtrim($fix_elements[$n]) . trim($fix_elements[$n + 1]); From 5b8f82e1a17695c9e5fec5842b234967782d7e5b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 17:22:08 -0800 Subject: [PATCH 107/118] checkpatch: do not apply "initialise globals to 0" check to BPF progs BPF programs explicitly initialise global variables to 0 to make sure clang (v10 or older) does not put the variables in the common section. Skip the "initialise globals to 0" check for BPF programs to eliminate error messages like: ERROR: do not initialise globals to 0 #19: FILE: samples/bpf/tracex1_kern.c:21: Link: https://lkml.kernel.org/r/20210209211954.490077-1-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 75c93316547b..df8b23dc1eb0 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2453,6 +2453,15 @@ sub get_raw_comment { return $comment; } +sub exclude_global_initialisers { + my ($realfile) = @_; + + # Do not check for BPF programs (tools/testing/selftests/bpf/progs/*.c, samples/bpf/*_kern.c, *.bpf.c). + return $realfile =~ m@^tools/testing/selftests/bpf/progs/.*\.c$@ || + $realfile =~ m@^samples/bpf/.*_kern\.c$@ || + $realfile =~ m@/bpf/.*\.bpf\.c$@; +} + sub process { my $filename = shift; @@ -4358,7 +4367,8 @@ sub process { } # check for global initialisers. - if ($line =~ /^\+$Type\s*$Ident(?:\s+$Modifier)*\s*=\s*($zero_initializer)\s*;/) { + if ($line =~ /^\+$Type\s*$Ident(?:\s+$Modifier)*\s*=\s*($zero_initializer)\s*;/ && + !exclude_global_initialisers($realfile)) { if (ERROR("GLOBAL_INITIALISERS", "do not initialise globals to $1\n" . $herecurr) && $fix) { From 073a9ecb3a73401662430bb955aedeac1de643d1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 25 Feb 2021 17:22:11 -0800 Subject: [PATCH 108/118] init/version.c: remove Version_ symbol This code hunk creates a Version_ symbol if CONFIG_KALLSYMS is disabled. For example, building the kernel v5.10 for allnoconfig creates the following symbol: $ nm vmlinux | grep Version_ c116b028 B Version_330240 There is no in-tree user of this symbol. Commit 197dcffc8ba0 ("init/version.c: define version_string only if CONFIG_KALLSYMS is not defined") mentions that Version_* is only used with ksymoops. However, a commit in the pre-git era [1] had added the statement, "ksymoops is useless on 2.6. Please use the Oops in its original format". That statement existed until commit 4eb9241127a0 ("Documentation: admin-guide: update bug-hunting.rst") finally removed the stale ksymoops information. This symbol is no longer needed.
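For reference, the macros removed below expand as follows when CONFIG_KALLSYMS is disabled (sketch; for v5.10, LINUX_VERSION_CODE is 330240, i.e. 0x050a00):

	#define version(a) Version_ ## a
	#define version_string(a) version(a)

	int version_string(LINUX_VERSION_CODE);	/* becomes: int Version_330240; */

which is exactly the otherwise-unused symbol seen in the nm output above.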
[1] https://git.kernel.org/pub/scm/linux/kernel/git/history/history.git/commit/?id=ad68b2f085f5c79e4759ca2d13947b3c885ee831 Link: https://lkml.kernel.org/r/20210120033452.2895170-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Cc: Mauro Carvalho Chehab Cc: Randy Dunlap Cc: Daniel Guilak Cc: Lee Revell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/version.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/init/version.c b/init/version.c index 80d2b7566b39..92afc782b043 100644 --- a/init/version.c +++ b/init/version.c @@ -16,14 +16,6 @@ #include #include -#ifndef CONFIG_KALLSYMS -#define version(a) Version_ ## a -#define version_string(a) version(a) - -extern int version_string(LINUX_VERSION_CODE); -int version_string(LINUX_VERSION_CODE); -#endif - struct uts_namespace init_uts_ns = { .ns.count = REFCOUNT_INIT(2), .name = { From a5a673f7312253a842f3da8c60c980461cc269ec Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 25 Feb 2021 17:22:15 -0800 Subject: [PATCH 109/118] init: clean up early_param_on_off() macro Use early_param() to define early_param_on_off(). Link: https://lkml.kernel.org/r/20210201041532.4025025-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Reviewed-by: Johan Hovold Reviewed-by: Miguel Ojeda Cc: Masahiro Yamada Cc: Joe Perches Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/init.h b/include/linux/init.h index a01f01c1a5c5..31f54de58429 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -338,14 +338,14 @@ struct obs_kernel_param { var = 1; \ return 0; \ } \ - __setup_param(str_on, parse_##var##_on, parse_##var##_on, 1); \ + early_param(str_on, parse_##var##_on); \ \ static int __init parse_##var##_off(char *arg) \ { \ var = 0; \ return 0; \ } \ - __setup_param(str_off, parse_##var##_off, parse_##var##_off, 1) + early_param(str_off, parse_##var##_off) /* Relies on boot_command_line being set */ void __init parse_early_param(void); From f9c8bc4604c95a7c55293f244f67753f6e96096f Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 25 Feb 2021 17:22:18 -0800 Subject: [PATCH 110/118] init/Kconfig: fix a typo in CC_VERSION_TEXT help text s/compier/compiler/ Link: https://lkml.kernel.org/r/20210224223325.29099-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Reviewed-by: Nathan Chancellor Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 719871f8727c..efdc35abccb6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -19,7 +19,7 @@ config CC_VERSION_TEXT CC_VERSION_TEXT so it is recorded in include/config/auto.conf.cmd. When the compiler is updated, Kconfig will be invoked. - - Ensure full rebuild when the compier is updated + - Ensure full rebuild when the compiler is updated include/linux/kconfig.h contains this option in the comment line so fixdep adds include/config/cc/version/text.h into the auto-generated dependency. When the compiler is updated, syncconfig will touch it From 3159ed57792be7453793bda27297a423e1c63d6c Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 25 Feb 2021 17:22:22 -0800 Subject: [PATCH 111/118] fs/coredump: use kmap_local_page() In dump_user_range() there is no reason for the mapping to be global. Use kmap_local_page() rather than kmap. 
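For background, kmap_local_page() returns a mapping that is only valid in the acquiring context and is paired with kunmap_local() on the returned address; a minimal sketch of the pattern (hypothetical helper for illustration, not part of this patch):

	#include <linux/highmem.h>
	#include <linux/mm.h>
	#include <linux/string.h>

	/* Illustration only: copy one page's contents through a short-lived local mapping. */
	static void copy_page_to_buf(struct page *page, void *buf)
	{
		void *kaddr = kmap_local_page(page);	/* local to this context, not a global mapping */

		memcpy(buf, kaddr, PAGE_SIZE);
		kunmap_local(kaddr);			/* pass the mapped address, not the page */
	}

In dump_user_range() the mapping is only dereferenced by dump_emit() between map and unmap, so a context-local mapping is sufficient and the global kmap bookkeeping can be avoided.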
Link: https://lkml.kernel.org/r/20210203223328.558945-1-ira.weiny@intel.com Signed-off-by: Ira Weiny Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coredump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index ae778937a1ff..1c0fdc1aa70b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -897,10 +897,10 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, */ page = get_dump_page(addr); if (page) { - void *kaddr = kmap(page); + void *kaddr = kmap_local_page(page); stop = !dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap(page); + kunmap_local(kaddr); put_page(page); } else { stop = !dump_skip(cprm, PAGE_SIZE); From b3656d8227f4c45812c6b40815d8f4e446ed372a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 25 Feb 2021 17:22:25 -0800 Subject: [PATCH 112/118] seq_file: document how per-entry resources are managed. Patch series "Fix some seq_file users that were recently broken". A recent change to seq_file broke some users which were using seq_file in a non-"standard" way ... though the "standard" isn't documented, so they can be excused. The result is a possible leak - of memory in one case, of references to a 'transport' in the other. These three patches: 1/ document and explain the problem 2/ fix the problem user in x86 3/ fix the problem user in net/sctp This patch (of 3): Users of seq_file will sometimes find it convenient to take a resource, such as a lock or memory allocation, in the ->start or ->next operations. These are per-entry resources, distinct from per-session resources which are taken in ->start and released in ->stop. The preferred management of these is to release the resource on the subsequent call to ->next or ->stop. However, prior to Commit 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface") it happened that ->show would always be called after ->start or ->next, and a few users chose to release the resource in ->show. This is no longer reliable. Since the mentioned commit, ->next will always come after a successful ->show (to ensure m->index is updated correctly), so the original ordering cannot be maintained. This patch updates the documentation to clearly state the required behaviour. Other patches will fix the few problematic users. [akpm@linux-foundation.org: fix typo, per Willy] Link: https://lkml.kernel.org/r/161248518659.21478.2484341937387294998.stgit@noble1 Link: https://lkml.kernel.org/r/161248539020.21478.3147971477400875336.stgit@noble1 Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface") Signed-off-by: NeilBrown Cc: Xin Long Cc: Alexander Viro Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Vlad Yasevich Cc: Neil Horman Cc: Marcelo Ricardo Leitner Cc: "David S. Miller" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/seq_file.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/filesystems/seq_file.rst b/Documentation/filesystems/seq_file.rst index 56856481dc8d..a6726082a7c2 100644 --- a/Documentation/filesystems/seq_file.rst +++ b/Documentation/filesystems/seq_file.rst @@ -217,6 +217,12 @@ between the calls to start() and stop(), so holding a lock during that time is a reasonable thing to do. The seq_file code will also avoid taking any other locks while the iterator is active.
+The iterater value returned by start() or next() is guaranteed to be +passed to a subsequent next() or stop() call. This allows resources +such as locks that were taken to be reliably released. There is *no* +guarantee that the iterator will be passed to show(), though in practice +it often will be. + Formatted output ================ From 3d2fc4c082448e9c05792f9b2a11c1d5db408b85 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 25 Feb 2021 17:22:29 -0800 Subject: [PATCH 113/118] x86: fix seq_file iteration for pat/memtype.c The memtype seq_file iterator allocates a buffer in the ->start and ->next functions and frees it in the ->show function. The preferred handling for such resources is to free them in the subsequent ->next or ->stop function call. Since Commit 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface") there is no guarantee that ->show will be called after ->next, so this function can now leak memory. So move the freeing of the buffer to ->next and ->stop. Link: https://lkml.kernel.org/r/161248539022.21478.13874455485854739066.stgit@noble1 Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface") Signed-off-by: NeilBrown Cc: Xin Long Cc: Alexander Viro Cc: Andy Lutomirski Cc: Dave Hansen Cc: "David S. Miller" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Marcelo Ricardo Leitner Cc: Neil Horman Cc: Peter Zijlstra Cc: Vlad Yasevich Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat/memtype.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 8f665c352bf0..ca311aaa67b8 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -1164,12 +1164,14 @@ static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + kfree(v); ++*pos; return memtype_get_idx(*pos); } static void memtype_seq_stop(struct seq_file *seq, void *v) { + kfree(v); } static int memtype_seq_show(struct seq_file *seq, void *v) @@ -1181,8 +1183,6 @@ static int memtype_seq_show(struct seq_file *seq, void *v) entry_print->end, cattr_name(entry_print->type)); - kfree(entry_print); - return 0; } From db7fbf492d94a0b59d8f85b3184231662586dea9 Mon Sep 17 00:00:00 2001 From: George Prekas Date: Thu, 25 Feb 2021 17:22:34 -0800 Subject: [PATCH 114/118] scripts/gdb: fix list_for_each If the list is uninitialized (next pointer is NULL), list_for_each gets stuck in an infinite loop. Print a message and treat list as empty. 
Link: https://lkml.kernel.org/r/4ae23bb1-c333-f669-da2d-fa35c4f49018@amazon.com Signed-off-by: George Prekas Reviewed-by: Jan Kiszka Cc: Kieran Bingham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/gdb/linux/lists.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/gdb/linux/lists.py b/scripts/gdb/linux/lists.py index c487ddf09d38..bae4d70b7eae 100644 --- a/scripts/gdb/linux/lists.py +++ b/scripts/gdb/linux/lists.py @@ -27,6 +27,11 @@ def list_for_each(head): raise TypeError("Must be struct list_head not {}" .format(head.type)) + if head['next'] == 0: + gdb.write("list_for_each: Uninitialized list '{}' treated as empty\n" + .format(head.address)) + return + node = head['next'].dereference() while node.address != head.address: yield node.address From d54ce6158e354f5358a547b96299ecd7f3725393 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Thu, 25 Feb 2021 17:22:38 -0800 Subject: [PATCH 115/118] kgdb: fix to kill breakpoints on initmem after boot Currently breakpoints in kernel .init.text section are not handled correctly while allowing to remove them even after corresponding pages have been freed. Fix it via killing .init.text section breakpoints just prior to initmem pages being freed. Doug: "HW breakpoints aren't handled by this patch but it's probably not such a big deal". Link: https://lkml.kernel.org/r/20210224081652.587785-1-sumit.garg@linaro.org Signed-off-by: Sumit Garg Suggested-by: Doug Anderson Acked-by: Doug Anderson Acked-by: Daniel Thompson Tested-by: Daniel Thompson Cc: Masami Hiramatsu Cc: Steven Rostedt (VMware) Cc: Jason Wessel Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kgdb.h | 2 ++ init/main.c | 1 + kernel/debug/debug_core.c | 11 +++++++++++ 3 files changed, 14 insertions(+) diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 0444b44bd156..392a3670944c 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -359,9 +359,11 @@ extern atomic_t kgdb_active; extern bool dbg_is_early; extern void __init dbg_late_init(void); extern void kgdb_panic(const char *msg); +extern void kgdb_free_init_mem(void); #else /* ! CONFIG_KGDB */ #define in_dbg_master() (0) #define dbg_late_init() static inline void kgdb_panic(const char *msg) {} +static inline void kgdb_free_init_mem(void) { } #endif /* ! CONFIG_KGDB */ #endif /* _KGDB_H_ */ diff --git a/init/main.c b/init/main.c index 3648c9f94882..53b278845b88 100644 --- a/init/main.c +++ b/init/main.c @@ -1426,6 +1426,7 @@ static int __ref kernel_init(void *unused) async_synchronize_full(); kprobe_free_init_mem(); ftrace_free_init_mem(); + kgdb_free_init_mem(); free_initmem(); mark_readonly(); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index b636d517c02c..4708aec492df 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -455,6 +455,17 @@ setundefined: return 0; } +void kgdb_free_init_mem(void) +{ + int i; + + /* Clear init memory breakpoints. */ + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (init_section_contains((void *)kgdb_break[i].bpt_addr, 0)) + kgdb_break[i].state = BP_UNDEFINED; + } +} + #ifdef CONFIG_KGDB_KDB void kdb_dump_stack_on_cpu(int cpu) { From 6aaa31aeb9cf260e1b7155cc11ec864f052db5ec Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 25 Feb 2021 17:22:42 -0800 Subject: [PATCH 116/118] ubsan: remove overflow checks Since GCC 8.0 -fsanitize=signed-integer-overflow doesn't work with -fwrapv. 
-fwrapv makes signed overflows defined and GCC essentially disables ubsan checks. On GCC < 8.0 -fwrapv doesn't have influence on the -fsanitize=signed-integer-overflow setting, so it kinda works but generates false positives and violates uaccess rules: lib/iov_iter.o: warning: objtool: iovec_from_user()+0x22d: call to __ubsan_handle_add_overflow() with UACCESS enabled Disable signed overflow checks to avoid these problems. Remove unsigned overflow checks as well. Unsigned overflow appeared as a side effect of commit cdf8a76fda4a ("ubsan: move cc-option tests into Kconfig"), but it never worked (kernel doesn't boot). And unsigned overflows are allowed by the C standard, so it is just pointless. Link: https://lkml.kernel.org/r/20210209232348.20510-1-ryabinin.a.a@gmail.com Signed-off-by: Andrey Ryabinin Acked-by: Peter Zijlstra (Intel) Cc: Josh Poimboeuf Cc: Randy Dunlap Cc: Stephen Rothwell Cc: Dmitry Vyukov Cc: Kees Cook Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.ubsan | 17 ----------- lib/test_ubsan.c | 49 ------------------------------ lib/ubsan.c | 68 ------------------------------------------ scripts/Makefile.ubsan | 2 -- 4 files changed, 136 deletions(-) diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 3a0b1c930733..e5372a13511d 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -112,23 +112,6 @@ config UBSAN_UNREACHABLE This option enables -fsanitize=unreachable which checks for control flow reaching an expected-to-be-unreachable position. -config UBSAN_SIGNED_OVERFLOW - bool "Perform checking for signed arithmetic overflow" - default UBSAN - depends on $(cc-option,-fsanitize=signed-integer-overflow) - help - This option enables -fsanitize=signed-integer-overflow which checks - for overflow of any arithmetic operations with signed integers. - -config UBSAN_UNSIGNED_OVERFLOW - bool "Perform checking for unsigned arithmetic overflow" - depends on $(cc-option,-fsanitize=unsigned-integer-overflow) - depends on !X86_32 # avoid excessive stack usage on x86-32/clang - help - This option enables -fsanitize=unsigned-integer-overflow which checks - for overflow of any arithmetic operations with unsigned integers. This - currently causes x86 to fail to boot. - config UBSAN_OBJECT_SIZE bool "Perform checking for accesses beyond the end of objects" default UBSAN diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c index 5e5d9355ef49..7e7bbd0f3fd2 100644 --- a/lib/test_ubsan.c +++ b/lib/test_ubsan.c @@ -11,51 +11,6 @@ typedef void(*test_ubsan_fp)(void); #config, IS_ENABLED(config) ?
"y" : "n"); \ } while (0) -static void test_ubsan_add_overflow(void) -{ - volatile int val = INT_MAX; - volatile unsigned int uval = UINT_MAX; - - UBSAN_TEST(CONFIG_UBSAN_SIGNED_OVERFLOW); - val += 2; - - UBSAN_TEST(CONFIG_UBSAN_UNSIGNED_OVERFLOW); - uval += 2; -} - -static void test_ubsan_sub_overflow(void) -{ - volatile int val = INT_MIN; - volatile unsigned int uval = 0; - volatile int val2 = 2; - - UBSAN_TEST(CONFIG_UBSAN_SIGNED_OVERFLOW); - val -= val2; - - UBSAN_TEST(CONFIG_UBSAN_UNSIGNED_OVERFLOW); - uval -= val2; -} - -static void test_ubsan_mul_overflow(void) -{ - volatile int val = INT_MAX / 2; - volatile unsigned int uval = UINT_MAX / 2; - - UBSAN_TEST(CONFIG_UBSAN_SIGNED_OVERFLOW); - val *= 3; - - UBSAN_TEST(CONFIG_UBSAN_UNSIGNED_OVERFLOW); - uval *= 3; -} - -static void test_ubsan_negate_overflow(void) -{ - volatile int val = INT_MIN; - - UBSAN_TEST(CONFIG_UBSAN_SIGNED_OVERFLOW); - val = -val; -} - static void test_ubsan_divrem_overflow(void) { volatile int val = 16; @@ -155,10 +110,6 @@ static void test_ubsan_object_size_mismatch(void) } static const test_ubsan_fp test_ubsan_array[] = { - test_ubsan_add_overflow, - test_ubsan_sub_overflow, - test_ubsan_mul_overflow, - test_ubsan_negate_overflow, test_ubsan_shift_out_of_bounds, test_ubsan_out_of_bounds, test_ubsan_load_invalid_value, diff --git a/lib/ubsan.c b/lib/ubsan.c index bec38c64d6a6..26229973049d 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -163,74 +163,6 @@ static void ubsan_epilogue(void) } } -static void handle_overflow(struct overflow_data *data, void *lhs, - void *rhs, char op) -{ - - struct type_descriptor *type = data->type; - char lhs_val_str[VALUE_LENGTH]; - char rhs_val_str[VALUE_LENGTH]; - - if (suppress_report(&data->location)) - return; - - ubsan_prologue(&data->location, type_is_signed(type) ? 
- "signed-integer-overflow" : - "unsigned-integer-overflow"); - - val_to_string(lhs_val_str, sizeof(lhs_val_str), type, lhs); - val_to_string(rhs_val_str, sizeof(rhs_val_str), type, rhs); - pr_err("%s %c %s cannot be represented in type %s\n", - lhs_val_str, - op, - rhs_val_str, - type->type_name); - - ubsan_epilogue(); -} - -void __ubsan_handle_add_overflow(void *data, - void *lhs, void *rhs) -{ - - handle_overflow(data, lhs, rhs, '+'); -} -EXPORT_SYMBOL(__ubsan_handle_add_overflow); - -void __ubsan_handle_sub_overflow(void *data, - void *lhs, void *rhs) -{ - handle_overflow(data, lhs, rhs, '-'); -} -EXPORT_SYMBOL(__ubsan_handle_sub_overflow); - -void __ubsan_handle_mul_overflow(void *data, - void *lhs, void *rhs) -{ - handle_overflow(data, lhs, rhs, '*'); -} -EXPORT_SYMBOL(__ubsan_handle_mul_overflow); - -void __ubsan_handle_negate_overflow(void *_data, void *old_val) -{ - struct overflow_data *data = _data; - char old_val_str[VALUE_LENGTH]; - - if (suppress_report(&data->location)) - return; - - ubsan_prologue(&data->location, "negation-overflow"); - - val_to_string(old_val_str, sizeof(old_val_str), data->type, old_val); - - pr_err("negation of %s cannot be represented in type %s:\n", - old_val_str, data->type->type_name); - - ubsan_epilogue(); -} -EXPORT_SYMBOL(__ubsan_handle_negate_overflow); - - void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs) { struct overflow_data *data = _data; diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan index 0e53a93e8f15..9e2092fd5206 100644 --- a/scripts/Makefile.ubsan +++ b/scripts/Makefile.ubsan @@ -8,8 +8,6 @@ ubsan-cflags-$(CONFIG_UBSAN_LOCAL_BOUNDS) += -fsanitize=local-bounds ubsan-cflags-$(CONFIG_UBSAN_SHIFT) += -fsanitize=shift ubsan-cflags-$(CONFIG_UBSAN_DIV_ZERO) += -fsanitize=integer-divide-by-zero ubsan-cflags-$(CONFIG_UBSAN_UNREACHABLE) += -fsanitize=unreachable -ubsan-cflags-$(CONFIG_UBSAN_SIGNED_OVERFLOW) += -fsanitize=signed-integer-overflow -ubsan-cflags-$(CONFIG_UBSAN_UNSIGNED_OVERFLOW) += -fsanitize=unsigned-integer-overflow ubsan-cflags-$(CONFIG_UBSAN_OBJECT_SIZE) += -fsanitize=object-size ubsan-cflags-$(CONFIG_UBSAN_BOOL) += -fsanitize=bool ubsan-cflags-$(CONFIG_UBSAN_ENUM) += -fsanitize=enum From dd23e8098f33a55b22b869bc7fc0a795ccbb9f87 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 25 Feb 2021 17:22:46 -0800 Subject: [PATCH 117/118] initramfs: panic with memory information On systems with large amounts of reserved memory we may fail to successfully complete unpack_to_rootfs() and be left with: Kernel panic - not syncing: write error this is not too helpful to understand what happened, so let's wrap the panic() calls with a surrounding show_mem() such that we have a chance of understanding the memory conditions leading to these allocation failures. [akpm@linux-foundation.org: replace macro with C function] Link: https://lkml.kernel.org/r/20210114231517.1854379-1-f.fainelli@gmail.com Signed-off-by: Florian Fainelli Cc: Barret Rhoden Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index 55b74d7e5260..5fa84711127a 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -45,6 +46,16 @@ static void __init error(char *x) message = x; } +static void panic_show_mem(const char *fmt, ...) 
+{ + va_list args; + + show_mem(0, NULL); + va_start(args, fmt); + panic(fmt, args); + va_end(args); +} + /* link hash */ #define N_ALIGN(len) ((((len) + 1) & ~3) + 2) @@ -80,7 +91,7 @@ static char __init *find_link(int major, int minor, int ino, } q = kmalloc(sizeof(struct hash), GFP_KERNEL); if (!q) - panic("can't allocate link hash entry"); + panic_show_mem("can't allocate link hash entry"); q->major = major; q->minor = minor; q->ino = ino; @@ -125,7 +136,7 @@ static void __init dir_add(const char *name, time64_t mtime) { struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL); if (!de) - panic("can't allocate dir_entry buffer"); + panic_show_mem("can't allocate dir_entry buffer"); INIT_LIST_HEAD(&de->list); de->name = kstrdup(name, GFP_KERNEL); de->mtime = mtime; @@ -460,7 +471,7 @@ static char * __init unpack_to_rootfs(char *buf, unsigned long len) name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL); if (!header_buf || !symlink_buf || !name_buf) - panic("can't allocate buffers"); + panic_show_mem("can't allocate buffers"); state = Start; this_header = 0; @@ -607,7 +618,7 @@ static int __init populate_rootfs(void) /* Load the built in initramfs */ char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) - panic("%s", err); /* Failed to decompress INTERNAL initramfs */ + panic_show_mem("%s", err); /* Failed to decompress INTERNAL initramfs */ if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE)) goto done; From f685a533a7fab35c5d069dcd663f59c8e4171a75 Mon Sep 17 00:00:00 2001 From: Huang Pei Date: Thu, 25 Feb 2021 17:22:49 -0800 Subject: [PATCH 118/118] MIPS: make userspace mapping young by default MIPS page fault path (except huge page) takes 3 exceptions (1 TLB Miss + 2 TLB Invalid), but the second TLB Invalid exception is just triggered by __update_tlb from do_page_fault writing tlb without _PAGE_VALID set.
With this patch, user space mapping prot is made young by default (with both _PAGE_VALID and _PAGE_YOUNG set), and it only take 1 TLB Miss + 1 TLB Invalid exception Remove pte_sw_mkyoung without polluting MM code and make page fault delay of MIPS on par with other architecture Link: https://lkml.kernel.org/r/20210204013942.8398-1-huangpei@loongson.cn Signed-off-by: Huang Pei Reviewed-by: Nicholas Piggin Acked-by: Acked-by: Thomas Bogendoerfer Cc: Christophe Leroy Cc: Cc: Bibo Mao Cc: Jiaxun Yang Cc: Paul Burton Cc: Li Xuefeng Cc: Yang Tiezhu Cc: Gao Juxin Cc: Fuxin Zhang Cc: Huacai Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/cache.c | 30 ++++++++++++++++-------------- include/linux/pgtable.h | 8 -------- mm/memory.c | 4 ---- 3 files changed, 16 insertions(+), 26 deletions(-) diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 1754498b0717..7719d632df8d 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -157,29 +157,31 @@ unsigned long _page_cachable_default; EXPORT_SYMBOL(_page_cachable_default); #define PM(p) __pgprot(_page_cachable_default | (p)) +#define PVA(p) PM(_PAGE_VALID | _PAGE_ACCESSED | (p)) static inline void setup_protection_map(void) { protection_map[0] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); - protection_map[1] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); - protection_map[2] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); - protection_map[3] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); - protection_map[4] = PM(_PAGE_PRESENT); - protection_map[5] = PM(_PAGE_PRESENT); - protection_map[6] = PM(_PAGE_PRESENT); - protection_map[7] = PM(_PAGE_PRESENT); + protection_map[1] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC); + protection_map[2] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); + protection_map[3] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC); + protection_map[4] = PVA(_PAGE_PRESENT); + protection_map[5] = PVA(_PAGE_PRESENT); + protection_map[6] = PVA(_PAGE_PRESENT); + protection_map[7] = PVA(_PAGE_PRESENT); protection_map[8] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); - protection_map[9] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); - protection_map[10] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE | + protection_map[9] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC); + protection_map[10] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE | _PAGE_NO_READ); - protection_map[11] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE); - protection_map[12] = PM(_PAGE_PRESENT); - protection_map[13] = PM(_PAGE_PRESENT); - protection_map[14] = PM(_PAGE_PRESENT | _PAGE_WRITE); - protection_map[15] = PM(_PAGE_PRESENT | _PAGE_WRITE); + protection_map[11] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE); + protection_map[12] = PVA(_PAGE_PRESENT); + protection_map[13] = PVA(_PAGE_PRESENT); + protection_map[14] = PVA(_PAGE_PRESENT); + protection_map[15] = PVA(_PAGE_PRESENT); } +#undef _PVA #undef PM void cpu_cache_init(void) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 36eb748f3c97..cdfc4e9f253e 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -432,14 +432,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres * To be differentiate with macro pte_mkyoung, this macro is used on platforms * where software maintains page access bit. 
*/ -#ifndef pte_sw_mkyoung -static inline pte_t pte_sw_mkyoung(pte_t pte) -{ - return pte; -} -#define pte_sw_mkyoung pte_sw_mkyoung -#endif - #ifndef pte_savedwrite #define pte_savedwrite pte_write #endif diff --git a/mm/memory.c b/mm/memory.c index 784249f3307b..c8e357627318 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2902,7 +2902,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); - entry = pte_sw_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -3560,7 +3559,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) __SetPageUptodate(page); entry = mk_pte(page, vma->vm_page_prot); - entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); @@ -3745,8 +3743,6 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) if (prefault && arch_wants_old_prefaulted_pte()) entry = pte_mkold(entry); - else - entry = pte_sw_mkyoung(entry); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma);