From 2c012a4ad1a2cd3fb5a0f9307b9d219f84eda1fa Mon Sep 17 00:00:00 2001
From: Kuo-Hsin Yang
Date: Thu, 11 Jul 2019 20:52:04 -0700
Subject: [PATCH 001/147] mm: vmscan: scan anonymous pages on file refaults

When file refaults are detected and there are many inactive file pages,
the system never reclaims anonymous pages; the file pages are dropped
aggressively even when there are still a lot of cold anonymous pages, and
the system thrashes.  This issue impacts the performance of applications
with large executables, e.g. chrome.

With this patch, when a file refault is detected, inactive_list_is_low()
always returns true for file pages in get_scan_count(), enabling the
scanning of anonymous pages.

The problem can be reproduced by the following test program.

---8<---
void fallocate_file(const char *filename, off_t size)
{
        struct stat st;
        int fd;

        if (!stat(filename, &st) && st.st_size >= size)
                return;

        fd = open(filename, O_WRONLY | O_CREAT, 0600);
        if (fd < 0) {
                perror("create file");
                exit(1);
        }
        if (posix_fallocate(fd, 0, size)) {
                perror("fallocate");
                exit(1);
        }
        close(fd);
}

long *alloc_anon(long size)
{
        long *start = malloc(size);
        memset(start, 1, size);
        return start;
}

long access_file(const char *filename, long size, long rounds)
{
        int fd, i;
        volatile char *start1, *end1, *start2;
        const int page_size = getpagesize();
        long sum = 0;

        fd = open(filename, O_RDONLY);
        if (fd == -1) {
                perror("open");
                exit(1);
        }

        /*
         * Some applications, e.g. chrome, use a lot of executable file
         * pages, map some of the pages with PROT_EXEC flag to simulate
         * the behavior.
         */
        start1 = mmap(NULL, size / 2, PROT_READ | PROT_EXEC, MAP_SHARED, fd, 0);
        if (start1 == MAP_FAILED) {
                perror("mmap");
                exit(1);
        }
        end1 = start1 + size / 2;

        start2 = mmap(NULL, size / 2, PROT_READ, MAP_SHARED, fd, size / 2);
        if (start2 == MAP_FAILED) {
                perror("mmap");
                exit(1);
        }

        for (i = 0; i < rounds; ++i) {
                struct timeval before, after;
                volatile char *ptr1 = start1, *ptr2 = start2;

                gettimeofday(&before, NULL);
                for (; ptr1 < end1; ptr1 += page_size, ptr2 += page_size)
                        sum += *ptr1 + *ptr2;
                gettimeofday(&after, NULL);
                printf("File access time, round %d: %f (sec)\n", i,
                       (after.tv_sec - before.tv_sec) +
                       (after.tv_usec - before.tv_usec) / 1000000.0);
        }

        return sum;
}

int main(int argc, char *argv[])
{
        const long MB = 1024 * 1024;
        long anon_mb, file_mb, file_rounds;
        const char filename[] = "large";
        long *ret1;
        long ret2;

        if (argc != 4) {
                printf("usage: thrash ANON_MB FILE_MB FILE_ROUNDS\n");
                exit(0);
        }
        anon_mb = atoi(argv[1]);
        file_mb = atoi(argv[2]);
        file_rounds = atoi(argv[3]);

        fallocate_file(filename, file_mb * MB);
        printf("Allocate %ld MB anonymous pages\n", anon_mb);
        ret1 = alloc_anon(anon_mb * MB);
        printf("Access %ld MB file pages\n", file_mb);
        ret2 = access_file(filename, file_mb * MB, file_rounds);
        printf("Print result to prevent optimization: %ld\n",
               *ret1 + ret2);
        return 0;
}
---8<---

Running the test program on a 2GB RAM VM with kernel 5.2.0-rc5, the
program fills RAM with 2048 MB of anonymous memory and accesses a 200 MB
file 10 times.  Without this patch, the file cache is dropped aggressively
and every access to the file is served from disk.
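(A hedged note on reproducing: the program above is an ordinary userspace program; assuming the usual libc headers are added at the top, i.e. <stdio.h>, <stdlib.h>, <string.h>, <fcntl.h>, <unistd.h>, <sys/mman.h>, <sys/stat.h> and <sys/time.h>, it can be built and run roughly as below. The compiler invocation and source file name are assumptions; only the ./thrash arguments come from the log that follows.)

$ cc -o thrash thrash.c        # assumed build step, not part of the original log
$ ./thrash 2048 200 10         # 2048 MB anon, 200 MB file, 10 rounds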
$ ./thrash 2048 200 10 Allocate 2048 MB anonymous pages Access 200 MB file pages File access time, round 0: 2.489316 (sec) File access time, round 1: 2.581277 (sec) File access time, round 2: 2.487624 (sec) File access time, round 3: 2.449100 (sec) File access time, round 4: 2.420423 (sec) File access time, round 5: 2.343411 (sec) File access time, round 6: 2.454833 (sec) File access time, round 7: 2.483398 (sec) File access time, round 8: 2.572701 (sec) File access time, round 9: 2.493014 (sec) With this patch, these file pages can be cached. $ ./thrash 2048 200 10 Allocate 2048 MB anonymous pages Access 200 MB file pages File access time, round 0: 2.475189 (sec) File access time, round 1: 2.440777 (sec) File access time, round 2: 2.411671 (sec) File access time, round 3: 1.955267 (sec) File access time, round 4: 0.029924 (sec) File access time, round 5: 0.000808 (sec) File access time, round 6: 0.000771 (sec) File access time, round 7: 0.000746 (sec) File access time, round 8: 0.000738 (sec) File access time, round 9: 0.000747 (sec) Checked the swap out stats during the test [1], 19006 pages swapped out with this patch, 3418 pages swapped out without this patch. There are more swap out, but I think it's within reasonable range when file backed data set doesn't fit into the memory. $ ./thrash 2000 100 2100 5 1 # ANON_MB FILE_EXEC FILE_NOEXEC ROUNDS PROCESSES Allocate 2000 MB anonymous pages active_anon: 1613644, inactive_anon: 348656, active_file: 892, inactive_file: 1384 (kB) pswpout: 7972443, pgpgin: 478615246 Access 100 MB executable file pages Access 2100 MB regular file pages File access time, round 0: 12.165, (sec) active_anon: 1433788, inactive_anon: 478116, active_file: 17896, inactive_file: 24328 (kB) File access time, round 1: 11.493, (sec) active_anon: 1430576, inactive_anon: 477144, active_file: 25440, inactive_file: 26172 (kB) File access time, round 2: 11.455, (sec) active_anon: 1427436, inactive_anon: 476060, active_file: 21112, inactive_file: 28808 (kB) File access time, round 3: 11.454, (sec) active_anon: 1420444, inactive_anon: 473632, active_file: 23216, inactive_file: 35036 (kB) File access time, round 4: 11.479, (sec) active_anon: 1413964, inactive_anon: 471460, active_file: 31728, inactive_file: 32224 (kB) pswpout: 7991449 (+ 19006), pgpgin: 489924366 (+ 11309120) With 4 processes accessing non-overlapping parts of a large file, 30316 pages swapped out with this patch, 5152 pages swapped out without this patch. The swapout number is small comparing to pgpgin. 
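(For reference, the pswpout and pgpgin counters quoted above are read from /proc/vmstat; a minimal way to sample them before and after a run is shown below. Command only; the figures in this changelog come from the original test runs.)

$ grep -E '^(pswpout|pgpgin) ' /proc/vmstat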
[1]: https://github.com/vovo/testing/blob/master/mem_thrash.c Link: http://lkml.kernel.org/r/20190701081038.GA83398@google.com Fixes: e9868505987a ("mm,vmscan: only evict file pages when we have plenty") Fixes: 7c5bd705d8f9 ("mm: memcg: only evict file pages when we have plenty") Signed-off-by: Kuo-Hsin Yang Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Sonny Rao Cc: Mel Gorman Cc: Rik van Riel Cc: Vladimir Davydov Cc: Minchan Kim Cc: [4.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 910e02c793ff..96aafbf8ce4e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2125,7 +2125,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * 10TB 320 32GB */ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct scan_control *sc, bool actual_reclaim) + struct scan_control *sc, bool trace) { enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -2151,7 +2151,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, * rid of the stale workingset quickly. */ refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); - if (file && actual_reclaim && lruvec->refaults != refaults) { + if (file && lruvec->refaults != refaults) { inactive_ratio = 0; } else { gb = (inactive + active) >> (30 - PAGE_SHIFT); @@ -2161,7 +2161,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, inactive_ratio = 1; } - if (actual_reclaim) + if (trace) trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, From 9bd3bb6703d8c0a5fb8aec8e3287bd55b7341dcd Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 11 Jul 2019 20:52:08 -0700 Subject: [PATCH 002/147] mm/nvdimm: add is_ioremap_addr and use that to check ioremap address Architectures like powerpc use different address range to map ioremap and vmalloc range. The memunmap() check used by the nvdimm layer was wrongly using is_vmalloc_addr() to check for ioremap range which fails for ppc64. This result in ppc64 not freeing the ioremap mapping. 
The side effect of this is an unbind failure during module unload with papr_scm nvdimm driver Link: http://lkml.kernel.org/r/20190701134038.14165-1-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Fixes: b5beae5e224f ("powerpc/pseries: Add driver for PAPR SCM regions") Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/pgtable.h | 14 ++++++++++++++ include/linux/mm.h | 5 +++++ kernel/iomem.c | 2 +- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 3f53be60fb01..64145751b2fd 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -140,6 +140,20 @@ static inline void pte_frag_set(mm_context_t *ctx, void *p) } #endif +#ifdef CONFIG_PPC64 +#define is_ioremap_addr is_ioremap_addr +static inline bool is_ioremap_addr(const void *x) +{ +#ifdef CONFIG_MMU + unsigned long addr = (unsigned long)x; + + return addr >= IOREMAP_BASE && addr < IOREMAP_END; +#else + return false; +#endif +} +#endif /* CONFIG_PPC64 */ + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index dd0b5f4e1e45..0a6dae2f2b84 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -633,6 +633,11 @@ static inline bool is_vmalloc_addr(const void *x) return false; #endif } + +#ifndef is_ioremap_addr +#define is_ioremap_addr(x) is_vmalloc_addr(x) +#endif + #ifdef CONFIG_MMU extern int is_vmalloc_or_module_addr(const void *x); #else diff --git a/kernel/iomem.c b/kernel/iomem.c index 93c264444510..62c92e43aa0d 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -121,7 +121,7 @@ EXPORT_SYMBOL(memremap); void memunmap(void *addr) { - if (is_vmalloc_addr(addr)) + if (is_ioremap_addr(addr)) iounmap((void __iomem *) addr); } EXPORT_SYMBOL(memunmap); From dd9239900e12db84c198855b262ae7796db1123b Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Thu, 11 Jul 2019 20:52:11 -0700 Subject: [PATCH 003/147] mm/memcontrol: fix wrong statistics in memory.stat When we calculate total statistics for memcg1_stats and memcg1_events, we use the the index 'i' in the for loop as the events index. Actually we should use memcg1_stats[i] and memcg1_events[i] as the events index. Link: http://lkml.kernel.org/r/1562116978-19539-1-git-send-email-laoar.shao@gmail.com Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty"). 
Signed-off-by: Yafang Shao Cc: Michal Hocko Cc: Johannes Weiner Cc: Yafang Shao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ba9138a4a1de..591eafafbd8c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3530,12 +3530,13 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], - (u64)memcg_page_state(memcg, i) * PAGE_SIZE); + (u64)memcg_page_state(memcg, memcg1_stats[i]) * + PAGE_SIZE); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], - (u64)memcg_events(memcg, i)); + (u64)memcg_events(memcg, memcg1_events[i])); for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], From 810481a246089117d98e3373a3cb735c3efc1f90 Mon Sep 17 00:00:00 2001 From: Henry Burns Date: Thu, 11 Jul 2019 20:52:14 -0700 Subject: [PATCH 004/147] mm/z3fold.c: lock z3fold page before __SetPageMovable() Following zsmalloc.c's example we call trylock_page() and unlock_page(). Also make z3fold_page_migrate() assert that newpage is passed in locked, as per the documentation. [akpm@linux-foundation.org: fix trylock_page return value test, per Shakeel] Link: http://lkml.kernel.org/r/20190702005122.41036-1-henryburns@google.com Link: http://lkml.kernel.org/r/20190702233538.52793-1-henryburns@google.com Signed-off-by: Henry Burns Suggested-by: Vitaly Wool Acked-by: Vitaly Wool Acked-by: David Rientjes Reviewed-by: Shakeel Butt Cc: Vitaly Vul Cc: Mike Rapoport Cc: Xidong Wang Cc: Jonathan Adams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 985732c8b025..dfcd69d08c1e 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -924,7 +924,16 @@ retry: set_bit(PAGE_HEADLESS, &page->private); goto headless; } - __SetPageMovable(page, pool->inode->i_mapping); + if (can_sleep) { + lock_page(page); + __SetPageMovable(page, pool->inode->i_mapping); + unlock_page(page); + } else { + if (trylock_page(page)) { + __SetPageMovable(page, pool->inode->i_mapping); + unlock_page(page); + } + } z3fold_page_lock(zhdr); found: @@ -1331,6 +1340,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); zhdr = page_address(page); pool = zhdr_to_pool(zhdr); From c32cc30c0544f13982ee0185d55f4910319b1a79 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 11 Jul 2019 20:52:18 -0700 Subject: [PATCH 005/147] nilfs2: do not use unexported cpu_to_le32()/le32_to_cpu() in uapi header cpu_to_le32/le32_to_cpu is defined in include/linux/byteorder/generic.h, which is not exported to user-space. UAPI headers must use the ones prefixed with double-underscore. 
Detected by compile-testing exported headers: include/linux/nilfs2_ondisk.h: In function `nilfs_checkpoint_set_snapshot': include/linux/nilfs2_ondisk.h:536:17: error: implicit declaration of function `cpu_to_le32' [-Werror=implicit-function-declaration] cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) | \ ^ include/linux/nilfs2_ondisk.h:552:1: note: in expansion of macro `NILFS_CHECKPOINT_FNS' NILFS_CHECKPOINT_FNS(SNAPSHOT, snapshot) ^~~~~~~~~~~~~~~~~~~~ include/linux/nilfs2_ondisk.h:536:29: error: implicit declaration of function `le32_to_cpu' [-Werror=implicit-function-declaration] cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) | \ ^ include/linux/nilfs2_ondisk.h:552:1: note: in expansion of macro `NILFS_CHECKPOINT_FNS' NILFS_CHECKPOINT_FNS(SNAPSHOT, snapshot) ^~~~~~~~~~~~~~~~~~~~ include/linux/nilfs2_ondisk.h: In function `nilfs_segment_usage_set_clean': include/linux/nilfs2_ondisk.h:622:19: error: implicit declaration of function `cpu_to_le64' [-Werror=implicit-function-declaration] su->su_lastmod = cpu_to_le64(0); ^~~~~~~~~~~ Link: http://lkml.kernel.org/r/20190605053006.14332-1-yamada.masahiro@socionext.com Fixes: e63e88bc53ba ("nilfs2: move ioctl interface and disk layout to uapi separately") Signed-off-by: Masahiro Yamada Acked-by: Ryusuke Konishi Cc: Arnd Bergmann Cc: Greg KH Cc: Joe Perches Cc: [4.9+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/nilfs2_ondisk.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/include/uapi/linux/nilfs2_ondisk.h b/include/uapi/linux/nilfs2_ondisk.h index a7e66ab11d1d..c23f91ae5fe8 100644 --- a/include/uapi/linux/nilfs2_ondisk.h +++ b/include/uapi/linux/nilfs2_ondisk.h @@ -29,7 +29,7 @@ #include #include - +#include #define NILFS_INODE_BMAP_SIZE 7 @@ -533,19 +533,19 @@ enum { static inline void \ nilfs_checkpoint_set_##name(struct nilfs_checkpoint *cp) \ { \ - cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) | \ - (1UL << NILFS_CHECKPOINT_##flag)); \ + cp->cp_flags = __cpu_to_le32(__le32_to_cpu(cp->cp_flags) | \ + (1UL << NILFS_CHECKPOINT_##flag)); \ } \ static inline void \ nilfs_checkpoint_clear_##name(struct nilfs_checkpoint *cp) \ { \ - cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) & \ + cp->cp_flags = __cpu_to_le32(__le32_to_cpu(cp->cp_flags) & \ ~(1UL << NILFS_CHECKPOINT_##flag)); \ } \ static inline int \ nilfs_checkpoint_##name(const struct nilfs_checkpoint *cp) \ { \ - return !!(le32_to_cpu(cp->cp_flags) & \ + return !!(__le32_to_cpu(cp->cp_flags) & \ (1UL << NILFS_CHECKPOINT_##flag)); \ } @@ -595,20 +595,20 @@ enum { static inline void \ nilfs_segment_usage_set_##name(struct nilfs_segment_usage *su) \ { \ - su->su_flags = cpu_to_le32(le32_to_cpu(su->su_flags) | \ + su->su_flags = __cpu_to_le32(__le32_to_cpu(su->su_flags) | \ (1UL << NILFS_SEGMENT_USAGE_##flag));\ } \ static inline void \ nilfs_segment_usage_clear_##name(struct nilfs_segment_usage *su) \ { \ su->su_flags = \ - cpu_to_le32(le32_to_cpu(su->su_flags) & \ + __cpu_to_le32(__le32_to_cpu(su->su_flags) & \ ~(1UL << NILFS_SEGMENT_USAGE_##flag)); \ } \ static inline int \ nilfs_segment_usage_##name(const struct nilfs_segment_usage *su) \ { \ - return !!(le32_to_cpu(su->su_flags) & \ + return !!(__le32_to_cpu(su->su_flags) & \ (1UL << NILFS_SEGMENT_USAGE_##flag)); \ } @@ -619,15 +619,15 @@ NILFS_SEGMENT_USAGE_FNS(ERROR, error) static inline void nilfs_segment_usage_set_clean(struct nilfs_segment_usage *su) { - su->su_lastmod = cpu_to_le64(0); - su->su_nblocks = cpu_to_le32(0); - 
su->su_flags = cpu_to_le32(0); + su->su_lastmod = __cpu_to_le64(0); + su->su_nblocks = __cpu_to_le32(0); + su->su_flags = __cpu_to_le32(0); } static inline int nilfs_segment_usage_clean(const struct nilfs_segment_usage *su) { - return !le32_to_cpu(su->su_flags); + return !__le32_to_cpu(su->su_flags); } /** From ae2c8880482977a8b6772657b7c400462695be46 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 11 Jul 2019 20:52:21 -0700 Subject: [PATCH 006/147] MAINTAINERS: nilfs2: update email address Change my email since lab.ntt.co.jp email domain has been deprecated due to company policy. Link: http://lkml.kernel.org/r/1562495153-8166-1-git-send-email-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 618e4979960b..7aff268f02aa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11215,7 +11215,7 @@ F: include/uapi/linux/nfs* F: include/uapi/linux/sunrpc/ NILFS2 FILESYSTEM -M: Ryusuke Konishi +M: Ryusuke Konishi L: linux-nilfs@vger.kernel.org W: https://nilfs.sourceforge.io/ W: https://nilfs.osdn.jp/ From a760f8a67cb38d19fd52f2a28c65c967e469367e Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 11 Jul 2019 20:52:24 -0700 Subject: [PATCH 007/147] include/linux/dmar.h: replace single-char identifiers in macros There are a few macros in IOMMU have single-char identifiers make the code hard to read and debug. Replace them with meaningful names. Link: http://lkml.kernel.org/r/1559566783-13627-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Suggested-by: Andrew Morton Cc: Joerg Roedel Cc: Robin Murphy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dmar.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 28813c6f44b6..a7cf3599d9a1 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -92,12 +92,14 @@ static inline bool dmar_rcu_check(void) #define dmar_rcu_dereference(p) rcu_dereference_check((p), dmar_rcu_check()) -#define for_each_dev_scope(a, c, p, d) \ - for ((p) = 0; ((d) = (p) < (c) ? dmar_rcu_dereference((a)[(p)].dev) : \ - NULL, (p) < (c)); (p)++) +#define for_each_dev_scope(devs, cnt, i, tmp) \ + for ((i) = 0; ((tmp) = (i) < (cnt) ? \ + dmar_rcu_dereference((devs)[(i)].dev) : NULL, (i) < (cnt)); \ + (i)++) -#define for_each_active_dev_scope(a, c, p, d) \ - for_each_dev_scope((a), (c), (p), (d)) if (!(d)) { continue; } else +#define for_each_active_dev_scope(devs, cnt, i, tmp) \ + for_each_dev_scope((devs), (cnt), (i), (tmp)) \ + if (!(tmp)) { continue; } else extern int dmar_table_init(void); extern int dmar_dev_scope_init(void); From 31013836a71e07751a6827f9d2ad41ef502ddaff Mon Sep 17 00:00:00 2001 From: Nicolas Boichat Date: Thu, 11 Jul 2019 20:52:27 -0700 Subject: [PATCH 008/147] scripts/decode_stacktrace: match basepath using shell prefix operator, not regex The basepath may contain special characters, which would confuse the regex matcher. ${var#prefix} does the right thing. 
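(A quick illustration of the difference, using hypothetical paths that are not from the patch: characters such as parentheses are special to the regex-based substitution used before, but the shell prefix operator matches them literally.)

$ basepath='/build/linux (test)'
$ code='/build/linux (test)/mm/vmscan.c:123'
$ echo "${code#$basepath/}"
mm/vmscan.c:123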
Link: http://lkml.kernel.org/r/20190518055946.181563-1-drinkcat@chromium.org Fixes: 67a28de47faa8358 ("scripts/decode_stacktrace: only strip base path when a prefix of the path") Signed-off-by: Nicolas Boichat Reviewed-by: Stephen Boyd Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/decode_stacktrace.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index a7a36209a193..6c2c05a75b54 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -85,7 +85,7 @@ parse_symbol() { fi # Strip out the base of the path - code=${code//^$basepath/""} + code=${code#$basepath/} # In the case of inlines, move everything to same line code=${code//$'\n'/' '} From fe7d14f174f18745d8dc141377e5f85ae7757d66 Mon Sep 17 00:00:00 2001 From: Nicolas Boichat Date: Thu, 11 Jul 2019 20:52:30 -0700 Subject: [PATCH 009/147] scripts/decode_stacktrace: look for modules with .ko.debug extension In Chromium OS kernel builds, we split the debug information as .ko.debug files, and that's what decode_stacktrace.sh needs to use. Relax objfile matching rule to allow any .ko* file to be matched. [drinkcat@chromium.org: add quotes around name pattern] Link: http://lkml.kernel.org/r/20190528103346.42720-1-drinkcat@chromium.org Link: http://lkml.kernel.org/r/20190521234148.64060-1-drinkcat@chromium.org Signed-off-by: Nicolas Boichat Cc: Marc Zyngier Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/decode_stacktrace.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index 6c2c05a75b54..fa704f17275e 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -28,7 +28,7 @@ parse_symbol() { local objfile=${modcache[$module]} else [[ $modpath == "" ]] && return - local objfile=$(find "$modpath" -name $module.ko -print -quit) + local objfile=$(find "$modpath" -name "$module.ko*" -print -quit) [[ $objfile == "" ]] && return modcache[$module]=$objfile fi From cc0e5f1ce0a8017c68983eb6b41a1dbd0d24aa98 Mon Sep 17 00:00:00 2001 From: Paul Walmsley Date: Thu, 11 Jul 2019 20:52:33 -0700 Subject: [PATCH 010/147] scripts/spelling.txt: drop "sepc" from the misspelling list The RISC-V architecture has a register named the "Supervisor Exception Program Counter", or "sepc". This abbreviation triggers checkpatch.pl's misspelling detector, resulting in noise in the checkpatch output. The risk that this noise could cause more useful warnings to be missed seems to outweigh the harm of an occasional misspelling of "spec". Thus drop the "sepc" entry from the misspelling list. 
[akpm@linux-foundation.org: fix existing "sepc" instances, per Joe] Link: http://lkml.kernel.org/r/20190518210037.13674-1-paul.walmsley@sifive.com Signed-off-by: Paul Walmsley Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_xics.c | 2 +- arch/unicore32/include/mach/regs-gpio.h | 2 +- drivers/net/wireless/realtek/rtlwifi/wifi.h | 2 +- drivers/scsi/lpfc/lpfc_init.c | 2 +- drivers/staging/rtl8723bs/hal/rtl8723b_phycfg.c | 2 +- scripts/spelling.txt | 1 - 6 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index e8276161872e..381bf8dea193 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -827,7 +827,7 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) * * Note: If EOI is incorrectly used by SW to lower the CPPR * value (ie more favored), we do not check for rejection of - * a pending interrupt, this is a SW error and PAPR sepcifies + * a pending interrupt, this is a SW error and PAPR specifies * that we don't have to deal with it. * * The sending of an EOI to the ICS is handled after the diff --git a/arch/unicore32/include/mach/regs-gpio.h b/arch/unicore32/include/mach/regs-gpio.h index 806350e1ccb6..5fc701ee33e3 100644 --- a/arch/unicore32/include/mach/regs-gpio.h +++ b/arch/unicore32/include/mach/regs-gpio.h @@ -32,7 +32,7 @@ */ #define GPIO_GEDR (PKUNITY_GPIO_BASE + 0x0018) /* - * Sepcial Voltage Detect Reg GPIO_GPIR. + * Special Voltage Detect Reg GPIO_GPIR. */ #define GPIO_GPIR (PKUNITY_GPIO_BASE + 0x0020) diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h b/drivers/net/wireless/realtek/rtlwifi/wifi.h index 81caa3782ec0..3bdda1c98339 100644 --- a/drivers/net/wireless/realtek/rtlwifi/wifi.h +++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h @@ -598,7 +598,7 @@ enum ht_channel_width { HT_CHANNEL_WIDTH_MAX, }; -/* Ref: 802.11i sepc D10.0 7.3.2.25.1 +/* Ref: 802.11i spec D10.0 7.3.2.25.1 * Cipher Suites Encryption Algorithms */ enum rt_enc_alg { diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index eaaef682de25..adfc2ec0f4fc 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -2963,7 +2963,7 @@ lpfc_stop_hba_timers(struct lpfc_hba *phba) del_timer_sync(&phba->fcp_poll_timer); break; case LPFC_PCI_DEV_OC: - /* Stop any OneConnect device sepcific driver timers */ + /* Stop any OneConnect device specific driver timers */ lpfc_sli4_stop_fcf_redisc_wait_timer(phba); break; default: diff --git a/drivers/staging/rtl8723bs/hal/rtl8723b_phycfg.c b/drivers/staging/rtl8723bs/hal/rtl8723b_phycfg.c index 4f2ad54af398..6da7f8e7bdae 100644 --- a/drivers/staging/rtl8723bs/hal/rtl8723b_phycfg.c +++ b/drivers/staging/rtl8723bs/hal/rtl8723b_phycfg.c @@ -45,7 +45,7 @@ static u32 phy_CalculateBitShift(u32 BitMask) /** * Function: PHY_QueryBBReg * -* OverView: Read "sepcific bits" from BB register +* OverView: Read "specific bits" from BB register * * Input: * struct adapter * Adapter, diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 86b87332b9e5..5ae83ce31902 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -1145,7 +1145,6 @@ senarios||scenarios sentivite||sensitive separatly||separately sepcify||specify -sepc||spec seperated||separated seperately||separately seperate||separate From 85f946ffd3b61ffccefd41a52f267437d878dd3e Mon Sep 17 00:00:00 2001 From: Chris Paterson Date: Thu, 11 Jul 2019 20:52:36 -0700 Subject: [PATCH 011/147] 
scripts/spelling.txt: add spelling fix for prohibited Misspelling 'prohibited' is quite common in the real world, although surprisingly not so much in the Linux Kernel. In addition to fixing the typo we may as well add it to the spelling checker. Also adding the present participle (prohibiting). Link: http://lkml.kernel.org/r/20190514153341.22540-1-chris.paterson2@renesas.com Fixes: 5bf2fbbef50c ("clk: renesas: cpg-mssr: Add r8a77470 support") Signed-off-by: Chris Paterson Acked-by: Stephen Boyd Acked-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/clk/renesas/r8a77470-cpg-mssr.c | 2 +- scripts/spelling.txt | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/clk/renesas/r8a77470-cpg-mssr.c b/drivers/clk/renesas/r8a77470-cpg-mssr.c index ab0fb10b6bf0..d81ae65f0d18 100644 --- a/drivers/clk/renesas/r8a77470-cpg-mssr.c +++ b/drivers/clk/renesas/r8a77470-cpg-mssr.c @@ -175,7 +175,7 @@ static const unsigned int r8a77470_crit_mod_clks[] __initconst = { *--------------------------------------------------- * 0 0 20 x80 x78 x50 * 0 1 26 x60 x60 x56 - * 1 0 Prohibitted setting + * 1 0 Prohibited setting * 1 1 30 x52 x52 x50 * * *1 : Table 7.4 indicates VCO output (PLL0 = VCO) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 5ae83ce31902..99f82a2a5b54 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -1009,6 +1009,8 @@ programers||programmers programm||program programms||programs progresss||progress +prohibitted||prohibited +prohibitting||prohibiting promiscous||promiscuous promps||prompts pronnounced||pronounced From ca90bbd410a1dc4b607210a5483786726043e9ec Mon Sep 17 00:00:00 2001 From: Evan Green Date: Thu, 11 Jul 2019 20:52:39 -0700 Subject: [PATCH 012/147] scripts/decode_stacktrace: Accept dash/underscore in modules The manpage for modprobe mentions that dashes and underscores are treated interchangeably in module names. The stack trace dumps seem to print module names with underscores. Use bash to replace _ with the pattern [-_] so that file names with dashes or underscores can be found. For example, this line: [ 27.919759] hda_widget_sysfs_init+0x2b8/0x3a5 [snd_hda_core] should find a module named snd-hda-core.ko. Link: http://lkml.kernel.org/r/20190531205926.42474-1-evgreen@chromium.org Signed-off-by: Evan Green Reviewed-by: Douglas Anderson Acked-by: Konstantin Khlebnikov Cc: Stephen Rothwell Cc: Douglas Anderson Cc: Evan Green Cc: Nicolas Boichat Cc: Marc Zyngier Cc: Manuel Traut Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/decode_stacktrace.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index fa704f17275e..13e5fbafdf2f 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -28,7 +28,7 @@ parse_symbol() { local objfile=${modcache[$module]} else [[ $modpath == "" ]] && return - local objfile=$(find "$modpath" -name "$module.ko*" -print -quit) + local objfile=$(find "$modpath" -name "${module//_/[-_]}.ko*" -print -quit) [[ $objfile == "" ]] && return modcache[$module]=$objfile fi From 6e22fd003e28d4414d2a84eb51f1491f49a70ab9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 11 Jul 2019 20:52:42 -0700 Subject: [PATCH 013/147] scripts/spelling.txt: add more spellings to spelling.txt Here are some of the more common spelling mistakes and typos that I've found while fixing up spelling mistakes in the kernel over the past few months. 
Developers keep on coming up with more inventive ways to spell words. Link: http://lkml.kernel.org/r/20190618134807.9729-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 99f82a2a5b54..de75b9feaaed 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -41,6 +41,7 @@ accquired||acquired accross||across acessable||accessible acess||access +acessing||accessing achitecture||architecture acient||ancient acitions||actions @@ -54,6 +55,7 @@ activete||activate actived||activated actualy||actually acumulating||accumulating +acumulative||accumulative acumulator||accumulator adapater||adapter addional||additional @@ -103,6 +105,7 @@ alogrithm||algorithm alot||a lot alow||allow alows||allows +alredy||already altough||although alue||value ambigious||ambiguous @@ -223,6 +226,7 @@ boardcast||broadcast borad||board boundry||boundary brievely||briefly +brigde||bridge broadcase||broadcast broadcat||broadcast bufufer||buffer @@ -239,6 +243,7 @@ calulate||calculate cancelation||cancellation cancle||cancel capabilites||capabilities +capabilties||capabilities capabilty||capability capabitilies||capabilities capablity||capability @@ -325,6 +330,7 @@ conector||connector connecetd||connected configuartion||configuration configuation||configuration +configued||configured configuratoin||configuration configuraton||configuration configuretion||configuration @@ -407,6 +413,7 @@ depreacte||deprecate desactivate||deactivate desciptor||descriptor desciptors||descriptors +descripto||descriptor descripton||description descrition||description descritptor||descriptor @@ -432,6 +439,7 @@ deveolpment||development devided||divided deviece||device diable||disable +dicline||decline dictionnary||dictionary didnt||didn't diferent||different @@ -461,6 +469,7 @@ disharge||discharge disnabled||disabled dispertion||dispersion dissapears||disappears +dissconect||disconnect distiction||distinction divisable||divisible divsiors||divisors @@ -469,11 +478,14 @@ documantation||documentation documentaion||documentation documment||document doesnt||doesn't +donwload||download +donwloading||downloading dorp||drop dosen||doesn downlad||download downlads||downloads droped||dropped +droput||dropout druing||during dynmaic||dynamic eanable||enable @@ -482,6 +494,7 @@ ecspecially||especially edditable||editable editting||editing efective||effective +effectivness||effectiveness efficently||efficiently ehther||ether eigth||eight @@ -543,6 +556,7 @@ extensability||extensibility extention||extension extenstion||extension extracter||extractor +faied||failed faield||failed falied||failed faild||failed @@ -567,6 +581,7 @@ fetaures||features fileystem||filesystem fimware||firmware firmare||firmware +firmaware||firmware firware||firmware finanize||finalize findn||find @@ -601,6 +616,8 @@ funtions||functions furthur||further futhermore||furthermore futrue||future +gatable||gateable +gateing||gating gauage||gauge gaurenteed||guaranteed generiously||generously @@ -641,9 +658,11 @@ iomaped||iomapped imblance||imbalance immeadiately||immediately immedaite||immediate +immedate||immediate immediatelly||immediately immediatly||immediately immidiate||immediate +immutible||immutable impelentation||implementation impementated||implemented implemantation||implementation @@ -661,10 +680,12 @@ incative||inactive incomming||incoming 
incompatabilities||incompatibilities incompatable||incompatible +incompatble||incompatible inconsistant||inconsistent increas||increase incremeted||incremented incrment||increment +inculde||include indendation||indentation indended||intended independant||independent @@ -778,6 +799,7 @@ libary||library librairies||libraries libraris||libraries licenceing||licencing +logaritmic||logarithmic loggging||logging loggin||login logile||logfile @@ -832,6 +854,7 @@ mispelled||misspelled mispelt||misspelt mising||missing mismactch||mismatch +missign||missing missmanaged||mismanaged missmatch||mismatch miximum||maximum @@ -848,6 +871,7 @@ mopdule||module mroe||more mulitplied||multiplied multidimensionnal||multidimensional +multipe||multiple multple||multiple mumber||number muticast||multicast @@ -870,7 +894,9 @@ nescessary||necessary nessessary||necessary noticable||noticeable notications||notifications +notifcations||notifications notifed||notified +notity||notify numebr||number numner||number obtaion||obtain @@ -887,6 +913,7 @@ occuring||occurring offser||offset offet||offset offloded||offloaded +offseting||offsetting omited||omitted omiting||omitting omitt||omit @@ -1025,6 +1052,7 @@ prosess||process protable||portable protcol||protocol protecion||protection +protedcted||protected protocoll||protocol promixity||proximity psudo||pseudo @@ -1039,6 +1067,7 @@ reasearcher||researcher reasearchers||researchers reasearch||research recepient||recipient +recevied||received receving||receiving recieved||received recieve||receive @@ -1112,6 +1141,7 @@ retreived||retrieved retreive||retrieve retreiving||retrieving retrive||retrieve +retrived||retrieved retuned||returned reudce||reduce reuest||request @@ -1178,6 +1208,7 @@ singaled||signaled singal||signal singed||signed sleeped||slept +sliped||slipped softwares||software speach||speech specfic||specific @@ -1284,6 +1315,7 @@ threds||threads threshhold||threshold thresold||threshold throught||through +trackling||tracking troughput||throughput thses||these tiggers||triggers @@ -1410,5 +1442,6 @@ wnat||want workarould||workaround writeing||writing writting||writing +wtih||with zombe||zombie zomebie||zombie From 38ce85f028fe4ea7350f448260845bc3cb030ad1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jul 2019 20:52:45 -0700 Subject: [PATCH 014/147] arch/sh/configs/sdk7786_defconfig: remove CONFIG_LOGFS After commit 1d0fd57a50aa ("logfs: remove from tree"), logfs was removed, drop CONFIG_LOGFS from all defconfigs. Link: http://lkml.kernel.org/r/20190530021032.190639-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/configs/sdk7786_defconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig index 5209889765ad..49a29338789b 100644 --- a/arch/sh/configs/sdk7786_defconfig +++ b/arch/sh/configs/sdk7786_defconfig @@ -191,7 +191,6 @@ CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=m CONFIG_JFFS2_FS_XATTR=y CONFIG_UBIFS_FS=m -CONFIG_LOGFS=m CONFIG_CRAMFS=m CONFIG_SQUASHFS=m CONFIG_ROMFS_FS=m From 410615478667e5ed374755cafea60917ec322b64 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 11 Jul 2019 20:52:48 -0700 Subject: [PATCH 015/147] sh: config: remove left-over BACKLIGHT_LCD_SUPPORT CONFIG_BACKLIGHT_LCD_SUPPORT was removed in 8c5dc8d9f19c ("video: backlight: Remove useless BACKLIGHT_LCD_SUPPORT kernel symbol"). Options protected by CONFIG_BACKLIGHT_LCD_SUPPORT are now available directly. 
Link: http://lkml.kernel.org/r/20190603191925.20659-1-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski Cc: Yoshinori Sato Cc: Rich Felker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/configs/hp6xx_defconfig | 1 - arch/sh/configs/sh2007_defconfig | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/sh/configs/hp6xx_defconfig b/arch/sh/configs/hp6xx_defconfig index 4dcf7f552582..91d43e2bffea 100644 --- a/arch/sh/configs/hp6xx_defconfig +++ b/arch/sh/configs/hp6xx_defconfig @@ -40,7 +40,6 @@ CONFIG_FB=y CONFIG_FIRMWARE_EDID=y CONFIG_FB_HIT=y CONFIG_FB_SH_MOBILE_LCDC=y -CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FONTS=y CONFIG_FONT_PEARL_8x8=y diff --git a/arch/sh/configs/sh2007_defconfig b/arch/sh/configs/sh2007_defconfig index a1cf6447dbb1..cbd6742eb423 100644 --- a/arch/sh/configs/sh2007_defconfig +++ b/arch/sh/configs/sh2007_defconfig @@ -85,7 +85,6 @@ CONFIG_WATCHDOG=y CONFIG_SH_WDT=y CONFIG_SSB=y CONFIG_FB=y -CONFIG_BACKLIGHT_LCD_SUPPORT=y # CONFIG_LCD_CLASS_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y From 733f0025f0fb43e382b84db0930ae502099b7e62 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Thu, 11 Jul 2019 20:52:52 -0700 Subject: [PATCH 016/147] sh: prevent warnings when using iounmap When building drm/exynos for sh, as part of an allmodconfig build, the following warning triggered: exynos7_drm_decon.c: In function `decon_remove': exynos7_drm_decon.c:769:24: warning: unused variable `ctx' struct decon_context *ctx = dev_get_drvdata(&pdev->dev); The ctx variable is only used as argument to iounmap(). In sh - allmodconfig CONFIG_MMU is not defined so it ended up in: \#define __iounmap(addr) do { } while (0) \#define iounmap __iounmap Fix the warning by introducing a static inline function for iounmap. This is similar to several other architectures. Link: http://lkml.kernel.org/r/20190622114208.24427-1-sam@ravnborg.org Signed-off-by: Sam Ravnborg Reviewed-by: Geert Uytterhoeven Cc: Yoshinori Sato Cc: Rich Felker Cc: Will Deacon Cc: Mark Brown Cc: Inki Dae Cc: Krzysztof Kozlowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/asm/io.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index c28e37a344ad..ac0561960c52 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -369,7 +369,11 @@ static inline int iounmap_fixed(void __iomem *addr) { return -EINVAL; } #define ioremap_nocache ioremap #define ioremap_uc ioremap -#define iounmap __iounmap + +static inline void iounmap(void __iomem *addr) +{ + __iounmap(addr); +} /* * Convert a physical pointer to a virtual kernel pointer for /dev/mem From e926d8a1e8675422e53104855a7bedec82fb570f Mon Sep 17 00:00:00 2001 From: ChenGang Date: Thu, 11 Jul 2019 20:52:55 -0700 Subject: [PATCH 017/147] fs: ocfs: fix spelling mistake "hearbeating" -> "heartbeat" There are some spelling mistakes in ocfs, fix it. 
Link: http://lkml.kernel.org/r/1558964623-106628-1-git-send-email-cg.chen@huawei.com Signed-off-by: ChenGang Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/ocfs2/cluster/quorum.c | 2 +- fs/ocfs2/cluster/tcp.c | 2 +- fs/ocfs2/dlm/dlmmaster.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 7a3a096856a8..7f74fcc6d7d9 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1184,7 +1184,7 @@ bail: if (atomic_read(®->hr_steady_iterations) != 0) { if (atomic_dec_and_test(®->hr_unsteady_iterations)) { printk(KERN_NOTICE "o2hb: Unable to stabilize " - "heartbeart on region %s (%s)\n", + "heartbeat on region %s (%s)\n", config_item_name(®->hr_item), reg->hr_dev_name); atomic_set(®->hr_steady_iterations, 0); diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 3d5d4b2b1356..5c424a099280 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -76,7 +76,7 @@ static void o2quo_fence_self(void) }; } -/* Indicate that a timeout occurred on a hearbeat region write. The +/* Indicate that a timeout occurred on a heartbeat region write. The * other nodes in the cluster may consider us dead at that time so we * want to "fence" ourselves so that we don't scribble on the disk * after they think they've recovered us. This can't solve all diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index c599463d0694..c07c9aac537a 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1762,7 +1762,7 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, (msecs_to_jiffies(o2net_reconnect_delay()) + 1); if (node_num != o2nm_this_node()) { - /* believe it or not, accept and node hearbeating testing + /* believe it or not, accept and node heartbeating testing * can succeed for this node before we got here.. so * only use set_nn_state to clear the persistent error * if that hasn't already happened */ diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 810f841494ef..74b768ca1cd8 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2161,7 +2161,7 @@ put: * think that $RECOVERY is currently mastered by a dead node. If so, * we wait a short time to allow that node to get notified by its own * heartbeat stack, then check again. All $RECOVERY lock resources - * mastered by dead nodes are purged when the hearbeat callback is + * mastered by dead nodes are purged when the heartbeat callback is * fired, so we can know for sure that it is safe to continue once * the node returns a live node or no node. */ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, From 0e71666b8b9e21e4cb5d805219eb5ed7c5617ca3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 11 Jul 2019 20:52:58 -0700 Subject: [PATCH 018/147] ocfs2/dlm: use struct_size() helper One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct dlm_migratable_lockres { ... struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112 }; Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes. 
So, replace the following form:

    sizeof(struct dlm_migratable_lockres) + (mres->num_locks * sizeof(struct dlm_migratable_lock))

with:

    struct_size(mres, ml, mres->num_locks)

Notice that, in this case, variable sz is not necessary, hence it is removed.

This code was detected with the help of Coccinelle.

Link: http://lkml.kernel.org/r/20190605204926.GA24467@embeddedor Signed-off-by: Gustavo A. R. Silva Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index e22d6a115220..064ce5bbc3f6 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1109,7 +1109,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, { u64 mig_cookie = be64_to_cpu(mres->mig_cookie); int mres_total_locks = be32_to_cpu(mres->total_locks); - int sz, ret = 0, status = 0; + int ret = 0, status = 0; u8 orig_flags = mres->flags, orig_master = mres->master; @@ -1117,9 +1117,6 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, if (!mres->num_locks) return 0; - sz = sizeof(struct dlm_migratable_lockres) + - (mres->num_locks * sizeof(struct dlm_migratable_lock)); - /* add an all-done flag if we reached the last lock */ orig_flags = mres->flags; BUG_ON(total_locks > mres_total_locks); @@ -1133,7 +1130,8 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, /* send it */ ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres, - sz, send_to, &status); + struct_size(mres, ml, mres->num_locks), + send_to, &status); if (ret < 0) { /* XXX: negative status is not handled. * this will end up killing this node. */

From 8a7f5f4c26dd4e969b5f3b30d06c54dc6a520eda Mon Sep 17 00:00:00 2001 From: Gang He Date: Thu, 11 Jul 2019 20:53:02 -0700 Subject: [PATCH 019/147] ocfs2: add last unlock times in locking_state

The ocfs2 file system uses the locking_state file under debugfs to dump
each ocfs2 file system's dlm lock resources, but the number of dlm lock
resources in memory keeps growing as files are touched by the user.  It
becomes difficult for the upper scripts to analyze the dlm lock resource
records in the locking_state file, even though some of the files are no
longer active and were last accessed a long time ago.  So I'd like to add
the last pr/ex unlock times to the locking_state file for each dlm lock
resource record; the upper scripts can then use the last unlock time to
filter out inactive dlm lock resource records.
Link: http://lkml.kernel.org/r/20190611015414.27754-1-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 18 +++++++++++++++--- fs/ocfs2/ocfs2.h | 1 + 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index b5fc5d3c7525..5f696be267e7 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -460,6 +460,8 @@ static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level, if (ret) stats->ls_fail++; + + stats->ls_last = ktime_to_us(ktime_get_real()); } static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) @@ -3079,8 +3081,10 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) * - Lock stats printed * New in version 3 * - Max time in lock stats is in usecs (instead of nsecs) + * New in version 4 + * - Add last pr/ex unlock times in usecs */ -#define OCFS2_DLM_DEBUG_STR_VERSION 3 +#define OCFS2_DLM_DEBUG_STR_VERSION 4 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) { int i; @@ -3131,6 +3135,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) # define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max) # define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max) # define lock_refresh(_l) ((_l)->l_lock_refresh) +# define lock_last_prmode(_l) ((_l)->l_lock_prmode.ls_last) +# define lock_last_exmode(_l) ((_l)->l_lock_exmode.ls_last) #else # define lock_num_prmode(_l) (0) # define lock_num_exmode(_l) (0) @@ -3141,6 +3147,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) # define lock_max_prmode(_l) (0) # define lock_max_exmode(_l) (0) # define lock_refresh(_l) (0) +# define lock_last_prmode(_l) (0ULL) +# define lock_last_exmode(_l) (0ULL) #endif /* The following seq_print was added in version 2 of this output */ seq_printf(m, "%u\t" @@ -3151,7 +3159,9 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) "%llu\t" "%u\t" "%u\t" - "%u\t", + "%u\t" + "%llu\t" + "%llu\t", lock_num_prmode(lockres), lock_num_exmode(lockres), lock_num_prmode_failed(lockres), @@ -3160,7 +3170,9 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) lock_total_exmode(lockres), lock_max_prmode(lockres), lock_max_exmode(lockres), - lock_refresh(lockres)); + lock_refresh(lockres), + lock_last_prmode(lockres), + lock_last_exmode(lockres)); /* End the line */ seq_printf(m, "\n"); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index a4647a646f07..5c111eabaa1d 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -150,6 +150,7 @@ struct ocfs2_lock_stats { /* Storing max wait in usecs saves 24 bytes per inode */ u32 ls_max; /* Max wait in USEC */ + u64 ls_last; /* Last unlock time in USEC */ }; #endif From 8056773ac4b42f36bae6406030218a5f12749c64 Mon Sep 17 00:00:00 2001 From: Gang He Date: Thu, 11 Jul 2019 20:53:05 -0700 Subject: [PATCH 020/147] ocfs2: add locking filter debugfs file Add locking filter debugfs file, which is used to filter lock resources dump from locking_state debugfs file. We use d_filter_secs field to filter lock resources dump, the default d_filter_secs(0) value filters nothing, otherwise, only dump the last N seconds active lock resources. This enhancement can avoid dumping lots of old records. The d_filter_secs value can be changed via locking_filter file. 
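(A hedged usage sketch: the per-mount debugfs directory shown below as /sys/kernel/debug/ocfs2/<UUID>/ is an assumption; only the locking_filter and locking_state file names and the seconds semantics come from the patch.)

$ echo 60 > /sys/kernel/debug/ocfs2/<UUID>/locking_filter    # dump only lock resources active in the last 60 seconds
$ cat /sys/kernel/debug/ocfs2/<UUID>/locking_state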
[akpm@linux-foundation.org: fix undefined reference to `__udivdi3'] Link: http://lkml.kernel.org/r/20190611015414.27754-2-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Joseph Qi Acked-by: Randy Dunlap [build-tested] Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/ocfs2/ocfs2.h | 2 ++ 2 files changed, 40 insertions(+) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 5f696be267e7..4089daba4c6f 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2991,6 +2991,8 @@ struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) kref_init(&dlm_debug->d_refcnt); INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); dlm_debug->d_locking_state = NULL; + dlm_debug->d_locking_filter = NULL; + dlm_debug->d_filter_secs = 0; out: return dlm_debug; } @@ -3090,10 +3092,34 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) int i; char *lvb; struct ocfs2_lock_res *lockres = v; +#ifdef CONFIG_OCFS2_FS_STATS + u64 now, last; + struct ocfs2_dlm_debug *dlm_debug = + ((struct ocfs2_dlm_seq_priv *)m->private)->p_dlm_debug; +#endif if (!lockres) return -EINVAL; +#ifdef CONFIG_OCFS2_FS_STATS + if (dlm_debug->d_filter_secs) { + now = ktime_to_us(ktime_get_real()); + if (lockres->l_lock_prmode.ls_last > + lockres->l_lock_exmode.ls_last) + last = lockres->l_lock_prmode.ls_last; + else + last = lockres->l_lock_exmode.ls_last; + /* + * Use d_filter_secs field to filter lock resources dump, + * the default d_filter_secs(0) value filters nothing, + * otherwise, only dump the last N seconds active lock + * resources. + */ + if (div_u64(now - last, 1000000) > dlm_debug->d_filter_secs) + return 0; + } +#endif + seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION); if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY) @@ -3243,6 +3269,17 @@ static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) goto out; } + dlm_debug->d_locking_filter = debugfs_create_u32("locking_filter", + 0600, + osb->osb_debug_root, + &dlm_debug->d_filter_secs); + if (!dlm_debug->d_locking_filter) { + ret = -EINVAL; + mlog(ML_ERROR, + "Unable to create locking filter debugfs file.\n"); + goto out; + } + ocfs2_get_dlm_debug(dlm_debug); out: return ret; @@ -3254,6 +3291,7 @@ static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) if (dlm_debug) { debugfs_remove(dlm_debug->d_locking_state); + debugfs_remove(dlm_debug->d_locking_filter); ocfs2_put_dlm_debug(dlm_debug); } } diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 5c111eabaa1d..c7539601555b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -223,6 +223,8 @@ struct ocfs2_orphan_scan { struct ocfs2_dlm_debug { struct kref d_refcnt; struct dentry *d_locking_state; + struct dentry *d_locking_filter; + u32 d_filter_secs; struct list_head d_lockres_tracking; }; From 5da844a2c7df642de2618fc3efe9a92eec40899d Mon Sep 17 00:00:00 2001 From: Gang He Date: Thu, 11 Jul 2019 20:53:09 -0700 Subject: [PATCH 021/147] ocfs2: add first lock wait time in locking_state ocfs2 file system uses locking_state file under debugfs to dump each ocfs2 file system's dlm lock resources, but the users ever encountered some hang(deadlock) problems in ocfs2 file system. I'd like to add first lock wait time in locking_state file, which can help the upper scripts detect these deadlock problems via comparing the first lock wait time with the current time. 
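(A hedged sketch of the comparison such a script could make: per the patch, the first lock wait time is reported in microseconds of real time, so a monitor can compare it against the current time; the threshold and any field parsing are assumptions, not part of the patch.)

$ now_us=$(( $(date +%s%N) / 1000 ))    # current real time in microseconds

A lock resource whose reported first-wait value is nonzero and, say, more than 30 seconds older than $now_us is a candidate deadlock to investigate.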
Link: http://lkml.kernel.org/r/20190611015414.27754-3-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 32 +++++++++++++++++++++++++++++--- fs/ocfs2/ocfs2.h | 1 + 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 4089daba4c6f..cf90688ff2ed 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -426,6 +426,7 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) { res->l_lock_refresh = 0; + res->l_lock_wait = 0; memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats)); memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats)); } @@ -469,6 +470,21 @@ static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) lockres->l_lock_refresh++; } +static inline void ocfs2_track_lock_wait(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_mask_waiter *mw; + + if (list_empty(&lockres->l_mask_waiters)) { + lockres->l_lock_wait = 0; + return; + } + + mw = list_first_entry(&lockres->l_mask_waiters, + struct ocfs2_mask_waiter, mw_item); + lockres->l_lock_wait = + ktime_to_us(ktime_mono_to_real(mw->mw_lock_start)); +} + static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) { mw->mw_lock_start = ktime_get(); @@ -484,6 +500,9 @@ static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) { } +static inline void ocfs2_track_lock_wait(struct ocfs2_lock_res *lockres) +{ +} static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) { } @@ -877,6 +896,7 @@ static void lockres_set_flags(struct ocfs2_lock_res *lockres, list_del_init(&mw->mw_item); mw->mw_status = 0; complete(&mw->mw_complete); + ocfs2_track_lock_wait(lockres); } } static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or) @@ -1388,6 +1408,7 @@ static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, list_add_tail(&mw->mw_item, &lockres->l_mask_waiters); mw->mw_mask = mask; mw->mw_goal = goal; + ocfs2_track_lock_wait(lockres); } /* returns 0 if the mw that was removed was already satisfied, -EBUSY @@ -1404,6 +1425,7 @@ static int __lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, list_del_init(&mw->mw_item); init_completion(&mw->mw_complete); + ocfs2_track_lock_wait(lockres); } return ret; @@ -3084,7 +3106,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) * New in version 3 * - Max time in lock stats is in usecs (instead of nsecs) * New in version 4 - * - Add last pr/ex unlock times in usecs + * - Add last pr/ex unlock times and first lock wait time in usecs */ #define OCFS2_DLM_DEBUG_STR_VERSION 4 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) @@ -3102,7 +3124,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) return -EINVAL; #ifdef CONFIG_OCFS2_FS_STATS - if (dlm_debug->d_filter_secs) { + if (!lockres->l_lock_wait && dlm_debug->d_filter_secs) { now = ktime_to_us(ktime_get_real()); if (lockres->l_lock_prmode.ls_last > lockres->l_lock_exmode.ls_last) @@ -3163,6 +3185,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) # define lock_refresh(_l) ((_l)->l_lock_refresh) # define lock_last_prmode(_l) ((_l)->l_lock_prmode.ls_last) # define lock_last_exmode(_l) ((_l)->l_lock_exmode.ls_last) 
+# define lock_wait(_l) ((_l)->l_lock_wait) #else # define lock_num_prmode(_l) (0) # define lock_num_exmode(_l) (0) @@ -3175,6 +3198,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) # define lock_refresh(_l) (0) # define lock_last_prmode(_l) (0ULL) # define lock_last_exmode(_l) (0ULL) +# define lock_wait(_l) (0ULL) #endif /* The following seq_print was added in version 2 of this output */ seq_printf(m, "%u\t" @@ -3187,6 +3211,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) "%u\t" "%u\t" "%llu\t" + "%llu\t" "%llu\t", lock_num_prmode(lockres), lock_num_exmode(lockres), @@ -3198,7 +3223,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) lock_max_exmode(lockres), lock_refresh(lockres), lock_last_prmode(lockres), - lock_last_exmode(lockres)); + lock_last_exmode(lockres), + lock_wait(lockres)); /* End the line */ seq_printf(m, "\n"); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index c7539601555b..fddbbd60f434 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -192,6 +192,7 @@ struct ocfs2_lock_res { #ifdef CONFIG_OCFS2_FS_STATS struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */ u32 l_lock_refresh; /* Disk refreshes */ + u64 l_lock_wait; /* First lock wait time */ struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */ #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC From e581595ea29c737587bcc349420bfdacb9a6b02b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 11 Jul 2019 20:53:12 -0700 Subject: [PATCH 022/147] ocfs: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Also, because there is no need to save the file dentry, remove all of the variables that were being saved, and just recursively delete the whole directory when shutting down, saving a lot of logic and local variables. 
[gregkh@linuxfoundation.org: v2] Link: http://lkml.kernel.org/r/20190613055455.GE19717@kroah.com Link: http://lkml.kernel.org/r/20190612152912.GA19151@kroah.com Signed-off-by: Greg Kroah-Hartman Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Cc: Jia Guo Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/blockcheck.c | 56 +++++------------- fs/ocfs2/blockcheck.h | 7 +-- fs/ocfs2/cluster/heartbeat.c | 100 +++++++++------------------------ fs/ocfs2/cluster/heartbeat.h | 2 +- fs/ocfs2/cluster/netdebug.c | 37 ++++-------- fs/ocfs2/cluster/nodemanager.c | 4 +- fs/ocfs2/cluster/tcp.c | 3 +- fs/ocfs2/cluster/tcp.h | 5 +- fs/ocfs2/dlm/dlmdebug.c | 44 ++------------- fs/ocfs2/dlm/dlmdebug.h | 10 ++-- fs/ocfs2/dlm/dlmdomain.c | 10 +--- fs/ocfs2/dlmglue.c | 25 +-------- fs/ocfs2/super.c | 29 +--------- 13 files changed, 72 insertions(+), 260 deletions(-) diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index 005b813a56b6..429e6a8359a5 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -242,57 +242,29 @@ static struct dentry *blockcheck_debugfs_create(const char *name, static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) { if (stats) { - debugfs_remove(stats->b_debug_check); - stats->b_debug_check = NULL; - debugfs_remove(stats->b_debug_failure); - stats->b_debug_failure = NULL; - debugfs_remove(stats->b_debug_recover); - stats->b_debug_recover = NULL; - debugfs_remove(stats->b_debug_dir); + debugfs_remove_recursive(stats->b_debug_dir); stats->b_debug_dir = NULL; } } -static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, - struct dentry *parent) +static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) { - int rc = -EINVAL; - - if (!stats) - goto out; - stats->b_debug_dir = debugfs_create_dir("blockcheck", parent); - if (!stats->b_debug_dir) - goto out; - stats->b_debug_check = - blockcheck_debugfs_create("blocks_checked", - stats->b_debug_dir, - &stats->b_check_count); + blockcheck_debugfs_create("blocks_checked", stats->b_debug_dir, + &stats->b_check_count); - stats->b_debug_failure = - blockcheck_debugfs_create("checksums_failed", - stats->b_debug_dir, - &stats->b_failure_count); + blockcheck_debugfs_create("checksums_failed", stats->b_debug_dir, + &stats->b_failure_count); - stats->b_debug_recover = - blockcheck_debugfs_create("ecc_recoveries", - stats->b_debug_dir, - &stats->b_recover_count); - if (stats->b_debug_check && stats->b_debug_failure && - stats->b_debug_recover) - rc = 0; - -out: - if (rc) - ocfs2_blockcheck_debug_remove(stats); - return rc; + blockcheck_debugfs_create("ecc_recoveries", stats->b_debug_dir, + &stats->b_recover_count); } #else -static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, - struct dentry *parent) +static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) { - return 0; } static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) @@ -301,10 +273,10 @@ static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats * #endif /* CONFIG_DEBUG_FS */ /* Always-called wrappers for starting and stopping the debugfs files */ -int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, - struct dentry *parent) +void ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, + struct 
dentry *parent) { - return ocfs2_blockcheck_debug_install(stats, parent); + ocfs2_blockcheck_debug_install(stats, parent); } void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats) diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h index f2d2689407fa..8f17d2c85f40 100644 --- a/fs/ocfs2/blockcheck.h +++ b/fs/ocfs2/blockcheck.h @@ -25,9 +25,6 @@ struct ocfs2_blockcheck_stats { * ocfs2_blockcheck_stats_debugfs_install() */ struct dentry *b_debug_dir; /* Parent of the debugfs files */ - struct dentry *b_debug_check; /* Exposes b_check_count */ - struct dentry *b_debug_failure; /* Exposes b_failure_count */ - struct dentry *b_debug_recover; /* Exposes b_recover_count */ }; @@ -56,8 +53,8 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, struct ocfs2_blockcheck_stats *stats); /* Debug Initialization */ -int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, - struct dentry *parent); +void ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent); void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats); /* diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 7f74fcc6d7d9..f1b613327ac8 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -92,10 +92,6 @@ static struct o2hb_debug_buf *o2hb_db_failedregions; #define O2HB_DEBUG_REGION_PINNED "pinned" static struct dentry *o2hb_debug_dir; -static struct dentry *o2hb_debug_livenodes; -static struct dentry *o2hb_debug_liveregions; -static struct dentry *o2hb_debug_quorumregions; -static struct dentry *o2hb_debug_failedregions; static LIST_HEAD(o2hb_all_regions); @@ -1391,11 +1387,7 @@ static const struct file_operations o2hb_debug_fops = { void o2hb_exit(void) { - debugfs_remove(o2hb_debug_failedregions); - debugfs_remove(o2hb_debug_quorumregions); - debugfs_remove(o2hb_debug_liveregions); - debugfs_remove(o2hb_debug_livenodes); - debugfs_remove(o2hb_debug_dir); + debugfs_remove_recursive(o2hb_debug_dir); kfree(o2hb_db_livenodes); kfree(o2hb_db_liveregions); kfree(o2hb_db_quorumregions); @@ -1419,79 +1411,37 @@ static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, &o2hb_debug_fops); } -static int o2hb_debug_init(void) +static void o2hb_debug_init(void) { - int ret = -ENOMEM; - o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); - if (!o2hb_debug_dir) { - mlog_errno(ret); - goto bail; - } - o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES, - o2hb_debug_dir, - &o2hb_db_livenodes, - sizeof(*o2hb_db_livenodes), - O2HB_DB_TYPE_LIVENODES, - sizeof(o2hb_live_node_bitmap), - O2NM_MAX_NODES, - o2hb_live_node_bitmap); - if (!o2hb_debug_livenodes) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_LIVENODES, o2hb_debug_dir, + &o2hb_db_livenodes, sizeof(*o2hb_db_livenodes), + O2HB_DB_TYPE_LIVENODES, sizeof(o2hb_live_node_bitmap), + O2NM_MAX_NODES, o2hb_live_node_bitmap); - o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, - o2hb_debug_dir, - &o2hb_db_liveregions, - sizeof(*o2hb_db_liveregions), - O2HB_DB_TYPE_LIVEREGIONS, - sizeof(o2hb_live_region_bitmap), - O2NM_MAX_REGIONS, - o2hb_live_region_bitmap); - if (!o2hb_debug_liveregions) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, o2hb_debug_dir, + &o2hb_db_liveregions, sizeof(*o2hb_db_liveregions), + O2HB_DB_TYPE_LIVEREGIONS, + sizeof(o2hb_live_region_bitmap), O2NM_MAX_REGIONS, + o2hb_live_region_bitmap); 
- o2hb_debug_quorumregions = - o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, - o2hb_debug_dir, - &o2hb_db_quorumregions, - sizeof(*o2hb_db_quorumregions), - O2HB_DB_TYPE_QUORUMREGIONS, - sizeof(o2hb_quorum_region_bitmap), - O2NM_MAX_REGIONS, - o2hb_quorum_region_bitmap); - if (!o2hb_debug_quorumregions) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, o2hb_debug_dir, + &o2hb_db_quorumregions, + sizeof(*o2hb_db_quorumregions), + O2HB_DB_TYPE_QUORUMREGIONS, + sizeof(o2hb_quorum_region_bitmap), O2NM_MAX_REGIONS, + o2hb_quorum_region_bitmap); - o2hb_debug_failedregions = - o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, - o2hb_debug_dir, - &o2hb_db_failedregions, - sizeof(*o2hb_db_failedregions), - O2HB_DB_TYPE_FAILEDREGIONS, - sizeof(o2hb_failed_region_bitmap), - O2NM_MAX_REGIONS, - o2hb_failed_region_bitmap); - if (!o2hb_debug_failedregions) { - mlog_errno(ret); - goto bail; - } - - ret = 0; -bail: - if (ret) - o2hb_exit(); - - return ret; + o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, o2hb_debug_dir, + &o2hb_db_failedregions, + sizeof(*o2hb_db_failedregions), + O2HB_DB_TYPE_FAILEDREGIONS, + sizeof(o2hb_failed_region_bitmap), O2NM_MAX_REGIONS, + o2hb_failed_region_bitmap); } -int o2hb_init(void) +void o2hb_init(void) { int i; @@ -1511,7 +1461,7 @@ int o2hb_init(void) o2hb_dependent_users = 0; - return o2hb_debug_init(); + o2hb_debug_init(); } /* if we're already in a callback then we're already serialized by the sem */ diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 7f37540ac4ab..beed31ea86cf 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -63,7 +63,7 @@ void o2hb_unregister_callback(const char *region_uuid, void o2hb_fill_node_map(unsigned long *map, unsigned bytes); void o2hb_exit(void); -int o2hb_init(void); +void o2hb_init(void); int o2hb_check_node_heartbeating_no_sem(u8 node_num); int o2hb_check_node_heartbeating_from_callback(u8 node_num); void o2hb_stop_all_regions(void); diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 0784575f4c2a..02bf4a1774cc 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -38,10 +38,6 @@ #define SHOW_SOCK_STATS 1 static struct dentry *o2net_dentry; -static struct dentry *sc_dentry; -static struct dentry *nst_dentry; -static struct dentry *stats_dentry; -static struct dentry *nodes_dentry; static DEFINE_SPINLOCK(o2net_debug_lock); @@ -490,36 +486,23 @@ static const struct file_operations nodes_fops = { void o2net_debugfs_exit(void) { - debugfs_remove(nodes_dentry); - debugfs_remove(stats_dentry); - debugfs_remove(sc_dentry); - debugfs_remove(nst_dentry); - debugfs_remove(o2net_dentry); + debugfs_remove_recursive(o2net_dentry); } -int o2net_debugfs_init(void) +void o2net_debugfs_init(void) { umode_t mode = S_IFREG|S_IRUSR; o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); - if (o2net_dentry) - nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode, - o2net_dentry, NULL, &nst_seq_fops); - if (nst_dentry) - sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode, - o2net_dentry, NULL, &sc_seq_fops); - if (sc_dentry) - stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode, - o2net_dentry, NULL, &stats_seq_fops); - if (stats_dentry) - nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode, - o2net_dentry, NULL, &nodes_fops); - if (nodes_dentry) - return 0; - o2net_debugfs_exit(); - mlog_errno(-ENOMEM); - return -ENOMEM; + debugfs_create_file(NST_DEBUG_NAME, mode, o2net_dentry, NULL, + &nst_seq_fops); + 
debugfs_create_file(SC_DEBUG_NAME, mode, o2net_dentry, NULL, + &sc_seq_fops); + debugfs_create_file(STATS_DEBUG_NAME, mode, o2net_dentry, NULL, + &stats_seq_fops); + debugfs_create_file(NODES_DEBUG_NAME, mode, o2net_dentry, NULL, + &nodes_fops); } #endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 2234f7fd1f7c..7a7640c59f3c 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -828,9 +828,7 @@ static int __init init_o2nm(void) { int ret = -1; - ret = o2hb_init(); - if (ret) - goto out; + o2hb_init(); ret = o2net_init(); if (ret) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index c07c9aac537a..48a3398f0bf5 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -2129,8 +2129,7 @@ int o2net_init(void) o2quo_init(); - if (o2net_debugfs_init()) - goto out; + o2net_debugfs_init(); o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index dd4242be3f1f..de87cbffd175 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -109,16 +109,15 @@ struct o2net_send_tracking; struct o2net_sock_container; #ifdef CONFIG_DEBUG_FS -int o2net_debugfs_init(void); +void o2net_debugfs_init(void); void o2net_debugfs_exit(void); void o2net_debug_add_nst(struct o2net_send_tracking *nst); void o2net_debug_del_nst(struct o2net_send_tracking *nst); void o2net_debug_add_sc(struct o2net_sock_container *sc); void o2net_debug_del_sc(struct o2net_sock_container *sc); #else -static inline int o2net_debugfs_init(void) +static inline void o2net_debugfs_init(void) { - return 0; } static inline void o2net_debugfs_exit(void) { diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index c8af5bc9e980..a4b58ba99927 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -851,7 +851,7 @@ static const struct file_operations debug_state_fops = { /* end - debug state funcs */ /* files in subroot */ -int dlm_debug_init(struct dlm_ctxt *dlm) +void dlm_debug_init(struct dlm_ctxt *dlm) { struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; @@ -860,10 +860,6 @@ int dlm_debug_init(struct dlm_ctxt *dlm) S_IFREG|S_IRUSR, dlm->dlm_debugfs_subroot, dlm, &debug_state_fops); - if (!dc->debug_state_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } /* for dumping lockres */ dc->debug_lockres_dentry = @@ -871,20 +867,12 @@ int dlm_debug_init(struct dlm_ctxt *dlm) S_IFREG|S_IRUSR, dlm->dlm_debugfs_subroot, dlm, &debug_lockres_fops); - if (!dc->debug_lockres_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } /* for dumping mles */ dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, S_IFREG|S_IRUSR, dlm->dlm_debugfs_subroot, dlm, &debug_mle_fops); - if (!dc->debug_mle_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } /* for dumping lockres on the purge list */ dc->debug_purgelist_dentry = @@ -892,15 +880,6 @@ int dlm_debug_init(struct dlm_ctxt *dlm) S_IFREG|S_IRUSR, dlm->dlm_debugfs_subroot, dlm, &debug_purgelist_fops); - if (!dc->debug_purgelist_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } - - return 0; - -bail: - return -ENOMEM; } void dlm_debug_shutdown(struct dlm_ctxt *dlm) @@ -920,24 +899,16 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm) /* subroot - domain dir */ int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) { - dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, - dlm_debugfs_root); - if (!dlm->dlm_debugfs_subroot) { - 
mlog_errno(-ENOMEM); - goto bail; - } - dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt), GFP_KERNEL); if (!dlm->dlm_debug_ctxt) { mlog_errno(-ENOMEM); - goto bail; + return -ENOMEM; } + dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, + dlm_debugfs_root); return 0; -bail: - dlm_destroy_debugfs_subroot(dlm); - return -ENOMEM; } void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) @@ -946,14 +917,9 @@ void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) } /* debugfs root */ -int dlm_create_debugfs_root(void) +void dlm_create_debugfs_root(void) { dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL); - if (!dlm_debugfs_root) { - mlog_errno(-ENOMEM); - return -ENOMEM; - } - return 0; } void dlm_destroy_debugfs_root(void) diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index 74d019694c7e..7d0c7c9013ce 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -28,20 +28,19 @@ struct debug_lockres { struct dlm_lock_resource *dl_res; }; -int dlm_debug_init(struct dlm_ctxt *dlm); +void dlm_debug_init(struct dlm_ctxt *dlm); void dlm_debug_shutdown(struct dlm_ctxt *dlm); int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); -int dlm_create_debugfs_root(void); +void dlm_create_debugfs_root(void); void dlm_destroy_debugfs_root(void); #else -static inline int dlm_debug_init(struct dlm_ctxt *dlm) +static inline void dlm_debug_init(struct dlm_ctxt *dlm) { - return 0; } static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm) { @@ -53,9 +52,8 @@ static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { } -static inline int dlm_create_debugfs_root(void) +static inline void dlm_create_debugfs_root(void) { - return 0; } static inline void dlm_destroy_debugfs_root(void) { diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 9021e72e1f98..7338b5d4647c 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1881,11 +1881,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - status = dlm_debug_init(dlm); - if (status < 0) { - mlog_errno(status); - goto bail; - } + dlm_debug_init(dlm); snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name); dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0); @@ -2346,9 +2342,7 @@ static int __init dlm_init(void) goto error; } - status = dlm_create_debugfs_root(); - if (status) - goto error; + dlm_create_debugfs_root(); return 0; error: diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index cf90688ff2ed..dc987f56c2ea 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3278,9 +3278,8 @@ static const struct file_operations ocfs2_dlm_debug_fops = { .llseek = seq_lseek, }; -static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) +static void ocfs2_dlm_init_debug(struct ocfs2_super *osb) { - int ret = 0; struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; dlm_debug->d_locking_state = debugfs_create_file("locking_state", @@ -3288,27 +3287,11 @@ static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops); - if (!dlm_debug->d_locking_state) { - ret = -EINVAL; - mlog(ML_ERROR, - "Unable to create locking state debugfs file.\n"); - goto out; - } dlm_debug->d_locking_filter = debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root, &dlm_debug->d_filter_secs); - if (!dlm_debug->d_locking_filter) { - ret = -EINVAL; - mlog(ML_ERROR, - "Unable to create locking filter 
debugfs file.\n"); - goto out; - } - - ocfs2_get_dlm_debug(dlm_debug); -out: - return ret; } static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) @@ -3332,11 +3315,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) goto local; } - status = ocfs2_dlm_init_debug(osb); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_dlm_init_debug(osb); /* launch downconvert thread */ osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s", diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a201f9780b35..8b2f39506648 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1079,33 +1079,15 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); - if (!osb->osb_debug_root) { - status = -EINVAL; - mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); - goto read_super_error; - } osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root, osb, &ocfs2_osb_debug_fops); - if (!osb->osb_ctxt) { - status = -EINVAL; - mlog_errno(status); - goto read_super_error; - } - if (ocfs2_meta_ecc(osb)) { - status = ocfs2_blockcheck_stats_debugfs_install( - &osb->osb_ecc_stats, - osb->osb_debug_root); - if (status) { - mlog(ML_ERROR, - "Unable to create blockcheck statistics " - "files\n"); - goto read_super_error; - } - } + if (ocfs2_meta_ecc(osb)) + ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats, + osb->osb_debug_root); status = ocfs2_mount_volume(sb); if (status < 0) @@ -1592,11 +1574,6 @@ static int __init ocfs2_init(void) goto out2; ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); - if (!ocfs2_debugfs_root) { - status = -ENOMEM; - mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); - goto out3; - } ocfs2_set_locking_protocol(); From 4658d87cb38cb3ab3a234d1b8d63f65df4cce62b Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Thu, 11 Jul 2019 20:53:16 -0700 Subject: [PATCH 023/147] fs/ocfs2/dlmglue.c: unneeded variable: "status" fix below issue reported by coccicheck fs/ocfs2/dlmglue.c:4410:5-11: Unneeded variable: "status". Return "0" on line 4428 We can not change return type of ocfs2_downconvert_thread as its registered as callback of kthread_create. Link: http://lkml.kernel.org/r/20190702183237.GA13975@hari-Inspiron-1545 Signed-off-by: Hariprasad Kelam Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index dc987f56c2ea..14207234fa3d 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -4407,7 +4407,6 @@ static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb) static int ocfs2_downconvert_thread(void *arg) { - int status = 0; struct ocfs2_super *osb = arg; /* only quit once we've been asked to stop and there is no more @@ -4425,7 +4424,7 @@ static int ocfs2_downconvert_thread(void *arg) } osb->dc_task = NULL; - return status; + return 0; } void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb) From d8b2fa657deaa73ff70d40aea9a54997fc0c7fc9 Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Thu, 11 Jul 2019 20:53:19 -0700 Subject: [PATCH 024/147] ocfs2: use kmemdup rather than duplicating its implementation kmemdup is introduced to duplicate a region of memory in a neat way. 
Rather than kmalloc/kzalloc + memcpy, which the programmer needs to write the size twice (sometimes lead to mistakes), kmemdup improves readability, leads to smaller code and also reduce the chances of mistakes. Suggestion to use kmemdup rather than using kmalloc/kzalloc + memcpy. [akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/20190703163147.881-1-huangfq.daxian@gmail.com Signed-off-by: Fuqian Huang Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 10 +++++----- fs/ocfs2/localalloc.c | 6 ++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index d1348fc4ca6d..0c335b51043d 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6191,17 +6191,17 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, if (le16_to_cpu(tl->tl_used)) { trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used)); - *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); + /* + * Assuming the write-out below goes well, this copy will be + * passed back to recovery for processing. + */ + *tl_copy = kmemdup(tl_bh->b_data, tl_bh->b_size, GFP_KERNEL); if (!(*tl_copy)) { status = -ENOMEM; mlog_errno(status); goto bail; } - /* Assuming the write-out below goes well, this copy - * will be passed back to recovery for processing. */ - memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); - /* All we need to do to clear the truncate log is set * tl_used. */ tl->tl_used = 0; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index f03674afbd30..158e5af767fd 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -424,12 +424,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) bh = osb->local_alloc_bh; alloc = (struct ocfs2_dinode *) bh->b_data; - alloc_copy = kmalloc(bh->b_size, GFP_NOFS); + alloc_copy = kmemdup(alloc, bh->b_size, GFP_NOFS); if (!alloc_copy) { status = -ENOMEM; goto out_commit; } - memcpy(alloc_copy, alloc, bh->b_size); status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -1272,13 +1271,12 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, * local alloc shutdown won't try to double free main bitmap * bits. Make a copy so the sync function knows which bits to * free. */ - alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_NOFS); + alloc_copy = kmemdup(alloc, osb->local_alloc_bh->b_size, GFP_NOFS); if (!alloc_copy) { status = -ENOMEM; mlog_errno(status); goto bail; } - memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode), From 598a0717a816abc8f5d3c4598628338b9190d127 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 11 Jul 2019 20:53:23 -0700 Subject: [PATCH 025/147] mm/slab: validate cache membership under freelist hardening Patch series "mm/slab: Improved sanity checking". This adds defenses against slab cache confusion (as seen in real-world exploits[1]) and gracefully handles type confusions when trying to look up slab caches from an arbitrary page. (Also is patch 3: new LKDTM tests for these defenses as well as for the existing double-free detection. 
This patch (of 3): When building under CONFIG_SLAB_FREELIST_HARDENING, it makes sense to perform sanity-checking on the assumed slab cache during kmem_cache_free() to make sure the kernel doesn't mix freelists across slab caches and corrupt memory (as seen in the exploitation of flaws like CVE-2018-9568[1]). Note that the prior code might WARN() but still corrupt memory (i.e. return the assumed cache instead of the owned cache). There is no noticeable performance impact (changes are within noise). Measuring parallel kernel builds, I saw the following with CONFIG_SLAB_FREELIST_HARDENED, before and after this patch: before: Run times: 288.85 286.53 287.09 287.07 287.21 Min: 286.53 Max: 288.85 Mean: 287.35 Std Dev: 0.79 after: Run times: 289.58 287.40 286.97 287.20 287.01 Min: 286.97 Max: 289.58 Mean: 287.63 Std Dev: 0.99 Delta: 0.1% which is well below the standard deviation [1] https://github.com/ThomasKing2014/slides/raw/master/Building%20universal%20Android%20rooting%20with%20a%20type%20confusion%20vulnerability.pdf Link: http://lkml.kernel.org/r/20190530045017.15252-2-keescook@chromium.org Signed-off-by: Kees Cook Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Alexander Popov Cc: Alexander Potapenko Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 43ac818b8592..4dafae2c8620 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -310,7 +310,7 @@ static inline bool is_root_cache(struct kmem_cache *s) static inline bool slab_equal_or_root(struct kmem_cache *s, struct kmem_cache *p) { - return true; + return s == p; } static inline const char *cache_name(struct kmem_cache *s) @@ -363,18 +363,16 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) * will also be a constant. */ if (!memcg_kmem_enabled() && + !IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) return s; page = virt_to_head_page(x); cachep = page->slab_cache; - if (slab_equal_or_root(cachep, s)) - return cachep; - - pr_err("%s: Wrong slab cache. %s but object is from %s\n", - __func__, s->name, cachep->name); - WARN_ON_ONCE(1); - return s; + WARN_ONCE(!slab_equal_or_root(cachep, s), + "%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name); + return cachep; } static inline size_t slab_ksize(const struct kmem_cache *s) From a64b53780ec35b77daf817210c88aa42d172c98f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 11 Jul 2019 20:53:26 -0700 Subject: [PATCH 026/147] mm/slab: sanity-check page type when looking up cache This avoids any possible type confusion when looking up an object. For example, if a non-slab were to be passed to kfree(), the invalid slab_cache pointer (i.e. overlapped with some other value from the struct page union) would be used for subsequent slab manipulations that could lead to further memory corruption. Since the page is already in cache, adding the PageSlab() check will have nearly zero cost, so add a check and WARN() to virt_to_cache(). Additionally replaces an open-coded virt_to_cache(). To support the failure mode this also updates all callers of virt_to_cache() and cache_from_obj() to handle a NULL cache pointer return value (though note that several already handle this case gracefully). 
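As a hedged sketch of the caller pattern this enables (the helper name example_object_size is made up; the real callers updated in the diff are cache_from_obj(), kmem_cache_free_bulk(), kfree() and ksize()):

static size_t example_object_size(const void *objp)
{
	struct kmem_cache *c = virt_to_cache(objp);

	/* After this patch, a non-PageSlab page produces a WARN and a NULL cache. */
	return c ? c->object_size : 0;
}

Callers that cannot do anything sensible with a NULL cache simply bail out, as kfree() does below.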
[dan.carpenter@oracle.com: restore IRQs in kfree()] Link: http://lkml.kernel.org/r/20190613065637.GE16334@mwanda Link: http://lkml.kernel.org/r/20190530045017.15252-3-keescook@chromium.org Signed-off-by: Kees Cook Signed-off-by: Dan Carpenter Cc: Alexander Popov Cc: Alexander Potapenko Cc: Christoph Lameter Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 16 +++++++++------- mm/slab.h | 17 +++++++++++++---- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index f7117ad9b3a3..db01e9aae31b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -371,12 +371,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) static int slab_max_order = SLAB_MAX_ORDER_LO; static bool slab_max_order_set __initdata; -static inline struct kmem_cache *virt_to_cache(const void *obj) -{ - struct page *page = virt_to_head_page(obj); - return page->slab_cache; -} - static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, unsigned int idx) { @@ -3715,6 +3709,8 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) s = virt_to_cache(objp); else s = cache_from_obj(orig_s, objp); + if (!s) + continue; debug_check_no_locks_freed(objp, s->object_size); if (!(s->flags & SLAB_DEBUG_OBJECTS)) @@ -3749,6 +3745,10 @@ void kfree(const void *objp) local_irq_save(flags); kfree_debugcheck(objp); c = virt_to_cache(objp); + if (!c) { + local_irq_restore(flags); + return; + } debug_check_no_locks_freed(objp, c->object_size); debug_check_no_obj_freed(objp, c->object_size); @@ -4219,13 +4219,15 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, */ size_t ksize(const void *objp) { + struct kmem_cache *c; size_t size; BUG_ON(!objp); if (unlikely(objp == ZERO_SIZE_PTR)) return 0; - size = virt_to_cache(objp)->object_size; + c = virt_to_cache(objp); + size = c ? c->object_size : 0; /* We assume that ksize callers could use the whole allocated area, * so we need to unpoison this area. */ diff --git a/mm/slab.h b/mm/slab.h index 4dafae2c8620..739099af6cbb 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -350,10 +350,20 @@ static inline void memcg_link_cache(struct kmem_cache *s) #endif /* CONFIG_MEMCG_KMEM */ +static inline struct kmem_cache *virt_to_cache(const void *obj) +{ + struct page *page; + + page = virt_to_head_page(obj); + if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n", + __func__)) + return NULL; + return page->slab_cache; +} + static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { struct kmem_cache *cachep; - struct page *page; /* * When kmemcg is not being used, both assignments should return the @@ -367,9 +377,8 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) return s; - page = virt_to_head_page(x); - cachep = page->slab_cache; - WARN_ONCE(!slab_equal_or_root(cachep, s), + cachep = virt_to_cache(x); + WARN_ONCE(cachep && !slab_equal_or_root(cachep, s), "%s: Wrong slab cache. %s but object is from %s\n", __func__, s->name, cachep->name); return cachep; From 966fede8e4be15bcc08e3c390080d3f9072a5367 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 11 Jul 2019 20:53:29 -0700 Subject: [PATCH 027/147] lkdtm/heap: add tests for freelist hardening This adds tests for double free and cross-cache freeing, which should both be caught by CONFIG_SLAB_FREELIST_HARDENED. 
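Assuming the standard LKDTM provoke-crash debugfs interface is enabled, the new crash types can be exercised at runtime with, for example, "echo SLAB_FREE_DOUBLE > /sys/kernel/debug/provoke-crash/DIRECT" (likewise SLAB_FREE_CROSS and SLAB_FREE_PAGE), which should trip the new sanity checks instead of silently corrupting memory.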
Link: http://lkml.kernel.org/r/20190530045017.15252-4-keescook@chromium.org Signed-off-by: Kees Cook Cc: Alexander Popov Cc: Alexander Potapenko Cc: Christoph Lameter Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/lkdtm/core.c | 5 +++ drivers/misc/lkdtm/heap.c | 72 ++++++++++++++++++++++++++++++++++++++ drivers/misc/lkdtm/lkdtm.h | 5 +++ 3 files changed, 82 insertions(+) diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c index bba49abb6750..c7a507482051 100644 --- a/drivers/misc/lkdtm/core.c +++ b/drivers/misc/lkdtm/core.c @@ -120,6 +120,9 @@ static const struct crashtype crashtypes[] = { CRASHTYPE(READ_AFTER_FREE), CRASHTYPE(WRITE_BUDDY_AFTER_FREE), CRASHTYPE(READ_BUDDY_AFTER_FREE), + CRASHTYPE(SLAB_FREE_DOUBLE), + CRASHTYPE(SLAB_FREE_CROSS), + CRASHTYPE(SLAB_FREE_PAGE), CRASHTYPE(SOFTLOCKUP), CRASHTYPE(HARDLOCKUP), CRASHTYPE(SPINLOCKUP), @@ -426,6 +429,7 @@ static int __init lkdtm_module_init(void) lkdtm_bugs_init(&recur_count); lkdtm_perms_init(); lkdtm_usercopy_init(); + lkdtm_heap_init(); /* Register debugfs interface */ lkdtm_debugfs_root = debugfs_create_dir("provoke-crash", NULL); @@ -472,6 +476,7 @@ static void __exit lkdtm_module_exit(void) debugfs_remove_recursive(lkdtm_debugfs_root); /* Handle test-specific clean-up. */ + lkdtm_heap_exit(); lkdtm_usercopy_exit(); if (lkdtm_kprobe != NULL) diff --git a/drivers/misc/lkdtm/heap.c b/drivers/misc/lkdtm/heap.c index 65026d7de130..3c5cec85edce 100644 --- a/drivers/misc/lkdtm/heap.c +++ b/drivers/misc/lkdtm/heap.c @@ -7,6 +7,10 @@ #include #include +static struct kmem_cache *double_free_cache; +static struct kmem_cache *a_cache; +static struct kmem_cache *b_cache; + /* * This tries to stay within the next largest power-of-2 kmalloc cache * to avoid actually overwriting anything important if it's not detected @@ -146,3 +150,71 @@ void lkdtm_READ_BUDDY_AFTER_FREE(void) kfree(val); } + +void lkdtm_SLAB_FREE_DOUBLE(void) +{ + int *val; + + val = kmem_cache_alloc(double_free_cache, GFP_KERNEL); + if (!val) { + pr_info("Unable to allocate double_free_cache memory.\n"); + return; + } + + /* Just make sure we got real memory. */ + *val = 0x12345678; + pr_info("Attempting double slab free ...\n"); + kmem_cache_free(double_free_cache, val); + kmem_cache_free(double_free_cache, val); +} + +void lkdtm_SLAB_FREE_CROSS(void) +{ + int *val; + + val = kmem_cache_alloc(a_cache, GFP_KERNEL); + if (!val) { + pr_info("Unable to allocate a_cache memory.\n"); + return; + } + + /* Just make sure we got real memory. */ + *val = 0x12345679; + pr_info("Attempting cross-cache slab free ...\n"); + kmem_cache_free(b_cache, val); +} + +void lkdtm_SLAB_FREE_PAGE(void) +{ + unsigned long p = __get_free_page(GFP_KERNEL); + + pr_info("Attempting non-Slab slab free ...\n"); + kmem_cache_free(NULL, (void *)p); + free_page(p); +} + +/* + * We have constructors to keep the caches distinctly separated without + * needing to boot with "slab_nomerge". 
+ */ +static void ctor_double_free(void *region) +{ } +static void ctor_a(void *region) +{ } +static void ctor_b(void *region) +{ } + +void __init lkdtm_heap_init(void) +{ + double_free_cache = kmem_cache_create("lkdtm-heap-double_free", + 64, 0, 0, ctor_double_free); + a_cache = kmem_cache_create("lkdtm-heap-a", 64, 0, 0, ctor_a); + b_cache = kmem_cache_create("lkdtm-heap-b", 64, 0, 0, ctor_b); +} + +void __exit lkdtm_heap_exit(void) +{ + kmem_cache_destroy(double_free_cache); + kmem_cache_destroy(a_cache); + kmem_cache_destroy(b_cache); +} diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h index 23dc565b4307..c5ae0b37587d 100644 --- a/drivers/misc/lkdtm/lkdtm.h +++ b/drivers/misc/lkdtm/lkdtm.h @@ -28,11 +28,16 @@ void lkdtm_STACK_GUARD_PAGE_LEADING(void); void lkdtm_STACK_GUARD_PAGE_TRAILING(void); /* lkdtm_heap.c */ +void __init lkdtm_heap_init(void); +void __exit lkdtm_heap_exit(void); void lkdtm_OVERWRITE_ALLOCATION(void); void lkdtm_WRITE_AFTER_FREE(void); void lkdtm_READ_AFTER_FREE(void); void lkdtm_WRITE_BUDDY_AFTER_FREE(void); void lkdtm_READ_BUDDY_AFTER_FREE(void); +void lkdtm_SLAB_FREE_DOUBLE(void); +void lkdtm_SLAB_FREE_CROSS(void); +void lkdtm_SLAB_FREE_PAGE(void); /* lkdtm_perms.c */ void __init lkdtm_perms_init(void); From 9cf3a8d847bd08977dc168ed243ffbef3c456d88 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 11 Jul 2019 20:53:33 -0700 Subject: [PATCH 028/147] mm/slub.c: avoid double string traverse in kmem_cache_flags() If ',' is not found, kmem_cache_flags() calls strlen() to find the end of line. We can do it in a single pass using strchrnul(). Link: http://lkml.kernel.org/r/20190501053111.7950-1-ynorov@marvell.com Signed-off-by: Yury Norov Acked-by: Aaron Tomlin Reviewed-by: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index cd04dbd2b5d0..1802c87799ff 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1313,9 +1313,7 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, char *end, *glob; size_t cmplen; - end = strchr(iter, ','); - if (!end) - end = iter + strlen(iter); + end = strchrnul(iter, ','); glob = strnchr(iter, end - iter, '*'); if (glob) From cb097cd48313575b03a5de092a04f6af8aa0739e Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 20:53:36 -0700 Subject: [PATCH 029/147] slub: don't panic for memcg kmem cache creation failure Currently for CONFIG_SLUB, if a memcg kmem cache creation is failed and the corresponding root kmem cache has SLAB_PANIC flag, the kernel will be crashed. This is unnecessary as the kernel can handle the creation failures of memcg kmem caches. Additionally CONFIG_SLAB does not implement this behavior. So, to keep the behavior consistent between SLAB and SLUB, removing the panic for memcg kmem cache creation failures. The root kmem cache creation failure for SLAB_PANIC correctly panics for both SLAB and SLUB. 
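For context, a hypothetical sketch of what still panics: SLAB_PANIC continues to apply to explicit root cache creation as below; only the kernel's internal creation of per-memcg child copies of such a cache now fails without crashing.

static struct kmem_cache *example_cachep;

static int __init example_cache_init(void)
{
	/* A root cache created with SLAB_PANIC still panics if creation fails. */
	example_cachep = kmem_cache_create("example_cache", 64, 0,
					   SLAB_PANIC, NULL);
	return 0;
}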
Link: http://lkml.kernel.org/r/20190619232514.58994-1-shakeelb@google.com Reported-by: Dave Hansen Signed-off-by: Shakeel Butt Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Johannes Weiner Cc: Christoph Lameter Cc: Roman Gushchin Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 1802c87799ff..d46a91759b96 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3650,10 +3650,6 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) free_kmem_cache_nodes(s); error: - if (flags & SLAB_PANIC) - panic("Cannot create slab %s size=%u realsize=%u order=%u offset=%u flags=%lx\n", - s->name, s->size, s->size, - oo_order(s->oo), s->offset, (unsigned long)flags); return -EINVAL; } From 6ef9056952532c3b746de46aa10d45b4d7797bd8 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 11 Jul 2019 20:53:39 -0700 Subject: [PATCH 030/147] mm/kmemleak.c: fix check for softirq context in_softirq() is a wrong predicate to check if we are in a softirq context. It also returns true if we have BH disabled, so objects are falsely stamped with "softirq" comm. The correct predicate is in_serving_softirq(). If user does cat from /sys/kernel/debug/kmemleak previously they would see this, which is clearly wrong, this is system call context (see the comm): unreferenced object 0xffff88805bd661c0 (size 64): comm "softirq", pid 0, jiffies 4294942959 (age 12.400s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 ff ff ff ff 00 00 00 00 ................ 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 ................ backtrace: [<0000000007dcb30c>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<0000000007dcb30c>] slab_post_alloc_hook mm/slab.h:439 [inline] [<0000000007dcb30c>] slab_alloc mm/slab.c:3326 [inline] [<0000000007dcb30c>] kmem_cache_alloc_trace+0x13d/0x280 mm/slab.c:3553 [<00000000969722b7>] kmalloc include/linux/slab.h:547 [inline] [<00000000969722b7>] kzalloc include/linux/slab.h:742 [inline] [<00000000969722b7>] ip_mc_add1_src net/ipv4/igmp.c:1961 [inline] [<00000000969722b7>] ip_mc_add_src+0x36b/0x400 net/ipv4/igmp.c:2085 [<00000000a4134b5f>] ip_mc_msfilter+0x22d/0x310 net/ipv4/igmp.c:2475 [<00000000d20248ad>] do_ip_setsockopt.isra.0+0x19fe/0x1c00 net/ipv4/ip_sockglue.c:957 [<000000003d367be7>] ip_setsockopt+0x3b/0xb0 net/ipv4/ip_sockglue.c:1246 [<000000003c7c76af>] udp_setsockopt+0x4e/0x90 net/ipv4/udp.c:2616 [<000000000c1aeb23>] sock_common_setsockopt+0x3e/0x50 net/core/sock.c:3130 [<000000000157b92b>] __sys_setsockopt+0x9e/0x120 net/socket.c:2078 [<00000000a9f3d058>] __do_sys_setsockopt net/socket.c:2089 [inline] [<00000000a9f3d058>] __se_sys_setsockopt net/socket.c:2086 [inline] [<00000000a9f3d058>] __x64_sys_setsockopt+0x26/0x30 net/socket.c:2086 [<000000001b8da885>] do_syscall_64+0x7c/0x1a0 arch/x86/entry/common.c:301 [<00000000ba770c62>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 now they will see this: unreferenced object 0xffff88805413c800 (size 64): comm "syz-executor.4", pid 8960, jiffies 4294994003 (age 14.350s) hex dump (first 32 bytes): 00 7a 8a 57 80 88 ff ff e0 00 00 01 00 00 00 00 .z.W............ 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 ................ 
backtrace: [<00000000c5d3be64>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<00000000c5d3be64>] slab_post_alloc_hook mm/slab.h:439 [inline] [<00000000c5d3be64>] slab_alloc mm/slab.c:3326 [inline] [<00000000c5d3be64>] kmem_cache_alloc_trace+0x13d/0x280 mm/slab.c:3553 [<0000000023865be2>] kmalloc include/linux/slab.h:547 [inline] [<0000000023865be2>] kzalloc include/linux/slab.h:742 [inline] [<0000000023865be2>] ip_mc_add1_src net/ipv4/igmp.c:1961 [inline] [<0000000023865be2>] ip_mc_add_src+0x36b/0x400 net/ipv4/igmp.c:2085 [<000000003029a9d4>] ip_mc_msfilter+0x22d/0x310 net/ipv4/igmp.c:2475 [<00000000ccd0a87c>] do_ip_setsockopt.isra.0+0x19fe/0x1c00 net/ipv4/ip_sockglue.c:957 [<00000000a85a3785>] ip_setsockopt+0x3b/0xb0 net/ipv4/ip_sockglue.c:1246 [<00000000ec13c18d>] udp_setsockopt+0x4e/0x90 net/ipv4/udp.c:2616 [<0000000052d748e3>] sock_common_setsockopt+0x3e/0x50 net/core/sock.c:3130 [<00000000512f1014>] __sys_setsockopt+0x9e/0x120 net/socket.c:2078 [<00000000181758bc>] __do_sys_setsockopt net/socket.c:2089 [inline] [<00000000181758bc>] __se_sys_setsockopt net/socket.c:2086 [inline] [<00000000181758bc>] __x64_sys_setsockopt+0x26/0x30 net/socket.c:2086 [<00000000d4b73623>] do_syscall_64+0x7c/0x1a0 arch/x86/entry/common.c:301 [<00000000c1098bec>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Link: http://lkml.kernel.org/r/20190517171507.96046-1-dvyukov@gmail.com Signed-off-by: Dmitry Vyukov Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 9dd581d11565..3e147ea83182 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -575,7 +575,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, if (in_irq()) { object->pid = 0; strncpy(object->comm, "hardirq", sizeof(object->comm)); - } else if (in_softirq()) { + } else if (in_serving_softirq()) { object->pid = 0; strncpy(object->comm, "softirq", sizeof(object->comm)); } else { From 4e4dfce2278929de4379cdcfa2335dad7a6c4aa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Thu, 11 Jul 2019 20:53:43 -0700 Subject: [PATCH 031/147] mm/kmemleak.c: change error at _write when kmemleak is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to POSIX, EBUSY means that the "device or resource is busy", and this can lead to people thinking that the file `/sys/kernel/debug/kmemleak/` is somehow locked or being used by other process. Change this error code to a more appropriate one. 
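From userspace, a write such as "echo scan > /sys/kernel/debug/kmemleak" while kmemleak is disabled now fails with "Operation not permitted" rather than "Device or resource is busy", which describes the situation more accurately.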
Link: http://lkml.kernel.org/r/20190612155231.19448-1-andrealmeid@collabora.com Signed-off-by: André Almeida Reviewed-by: Andrew Morton Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 3e147ea83182..aa8f4fa93ca3 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1866,7 +1866,7 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, } if (!kmemleak_enabled) { - ret = -EBUSY; + ret = -EPERM; goto out; } From b7c3613e685076bad6e5540501d84cc31bb30377 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Thu, 11 Jul 2019 20:53:46 -0700 Subject: [PATCH 032/147] docs: kmemleak: add more documentation details MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wikipedia now has a main article to "tracing garbage collector" topic. Change the URL and use the reStructuredText syntax for hyperlinks and add more details about the use of the tool. Add a section about how to use the kmemleak-test module to test the memory leak scanning. Link: http://lkml.kernel.org/r/20190612155231.19448-2-andrealmeid@collabora.com Signed-off-by: André Almeida Acked-by: Catalin Marinas Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kmemleak.rst | 48 +++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst index e6f51260ff32..3621cd5e1eef 100644 --- a/Documentation/dev-tools/kmemleak.rst +++ b/Documentation/dev-tools/kmemleak.rst @@ -2,8 +2,8 @@ Kernel Memory Leak Detector =========================== Kmemleak provides a way of detecting possible kernel memory leaks in a -way similar to a tracing garbage collector -(https://en.wikipedia.org/wiki/Garbage_collection_%28computer_science%29#Tracing_garbage_collectors), +way similar to a `tracing garbage collector +`_, with the difference that the orphan objects are not freed but only reported via /sys/kernel/debug/kmemleak. A similar method is used by the Valgrind tool (``memcheck --leak-check``) to detect the memory leaks in @@ -15,10 +15,13 @@ Usage CONFIG_DEBUG_KMEMLEAK in "Kernel hacking" has to be enabled. A kernel thread scans the memory every 10 minutes (by default) and prints the -number of new unreferenced objects found. To display the details of all -the possible memory leaks:: +number of new unreferenced objects found. If the ``debugfs`` isn't already +mounted, mount with:: # mount -t debugfs nodev /sys/kernel/debug/ + +To display the details of all the possible scanned memory leaks:: + # cat /sys/kernel/debug/kmemleak To trigger an intermediate memory scan:: @@ -72,6 +75,9 @@ If CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF are enabled, the kmemleak is disabled by default. Passing ``kmemleak=on`` on the kernel command line enables the function. +If you are getting errors like "Error while writing to stdout" or "write_loop: +Invalid argument", make sure kmemleak is properly enabled. + Basic Algorithm --------------- @@ -218,3 +224,37 @@ the pointer is calculated by other methods than the usual container_of macro or the pointer is stored in a location not scanned by kmemleak. Page allocations and ioremap are not tracked. 
+ +Testing with kmemleak-test +-------------------------- + +To check if you have all set up to use kmemleak, you can use the kmemleak-test +module, a module that deliberately leaks memory. Set CONFIG_DEBUG_KMEMLEAK_TEST +as module (it can't be used as bult-in) and boot the kernel with kmemleak +enabled. Load the module and perform a scan with:: + + # modprobe kmemleak-test + # echo scan > /sys/kernel/debug/kmemleak + +Note that the you may not get results instantly or on the first scanning. When +kmemleak gets results, it'll log ``kmemleak: new suspected +memory leaks``. Then read the file to see then:: + + # cat /sys/kernel/debug/kmemleak + unreferenced object 0xffff89862ca702e8 (size 32): + comm "modprobe", pid 2088, jiffies 4294680594 (age 375.486s) + hex dump (first 32 bytes): + 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk + 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5 kkkkkkkkkkkkkkk. + backtrace: + [<00000000e0a73ec7>] 0xffffffffc01d2036 + [<000000000c5d2a46>] do_one_initcall+0x41/0x1df + [<0000000046db7e0a>] do_init_module+0x55/0x200 + [<00000000542b9814>] load_module+0x203c/0x2480 + [<00000000c2850256>] __do_sys_finit_module+0xba/0xe0 + [<000000006564e7ef>] do_syscall_64+0x43/0x110 + [<000000007c873fa6>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 + ... + +Removing the module with ``rmmod kmemleak_test`` should also trigger some +kmemleak results. From e89692190065c12386bd37272ae8b7d142dd079f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:53:49 -0700 Subject: [PATCH 033/147] mm/kasan: print frame description for stack bugs This adds support for printing stack frame description on invalid stack accesses. The frame description is embedded by the compiler, which is parsed and then pretty-printed. Currently, we can only print the stack frame info for accesses to the task's own stack, but not accesses to other tasks' stacks. Example of what it looks like: page dumped because: kasan: bad access detected addr ffff8880673ef98a is located in stack of task insmod/2008 at offset 106 in frame: kasan_stack_oob+0x0/0xf5 [test_kasan] this frame has 2 objects: [32, 36) 'i' [96, 106) 'stack_array' Memory state around the buggy address: Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=198435 Link: http://lkml.kernel.org/r/20190522100048.146841-1-elver@google.com Signed-off-by: Marco Elver Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.h | 5 ++ mm/kasan/report.c | 165 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 3ce956efa0cb..1979db4763e2 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -43,6 +43,11 @@ #define KASAN_ALLOCA_REDZONE_SIZE 32 +/* + * Stack frame marker (compiler ABI). 
+ */ +#define KASAN_CURRENT_STACK_FRAME_MAGIC 0x41B58AB3 + /* Don't break randconfig/all*config builds */ #ifndef KASAN_ABI_VERSION #define KASAN_ABI_VERSION 1 diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 03a443579386..0e5f965f1882 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -181,6 +182,168 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } +static bool __must_check tokenize_frame_descr(const char **frame_descr, + char *token, size_t max_tok_len, + unsigned long *value) +{ + const char *sep = strchr(*frame_descr, ' '); + + if (sep == NULL) + sep = *frame_descr + strlen(*frame_descr); + + if (token != NULL) { + const size_t tok_len = sep - *frame_descr; + + if (tok_len + 1 > max_tok_len) { + pr_err("KASAN internal error: frame description too long: %s\n", + *frame_descr); + return false; + } + + /* Copy token (+ 1 byte for '\0'). */ + strlcpy(token, *frame_descr, tok_len + 1); + } + + /* Advance frame_descr past separator. */ + *frame_descr = sep + 1; + + if (value != NULL && kstrtoul(token, 10, value)) { + pr_err("KASAN internal error: not a valid number: %s\n", token); + return false; + } + + return true; +} + +static void print_decoded_frame_descr(const char *frame_descr) +{ + /* + * We need to parse the following string: + * "n alloc_1 alloc_2 ... alloc_n" + * where alloc_i looks like + * "offset size len name" + * or "offset size len name:line". + */ + + char token[64]; + unsigned long num_objects; + + if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), + &num_objects)) + return; + + pr_err("\n"); + pr_err("this frame has %lu %s:\n", num_objects, + num_objects == 1 ? "object" : "objects"); + + while (num_objects--) { + unsigned long offset; + unsigned long size; + + /* access offset */ + if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), + &offset)) + return; + /* access size */ + if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), + &size)) + return; + /* name length (unused) */ + if (!tokenize_frame_descr(&frame_descr, NULL, 0, NULL)) + return; + /* object name */ + if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), + NULL)) + return; + + /* Strip line number; without filename it's not very helpful. */ + strreplace(token, ':', '\0'); + + /* Finally, print object information. */ + pr_err(" [%lu, %lu) '%s'", offset, offset + size, token); + } +} + +static bool __must_check get_address_stack_frame_info(const void *addr, + unsigned long *offset, + const char **frame_descr, + const void **frame_pc) +{ + unsigned long aligned_addr; + unsigned long mem_ptr; + const u8 *shadow_bottom; + const u8 *shadow_ptr; + const unsigned long *frame; + + BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP)); + + /* + * NOTE: We currently only support printing frame information for + * accesses to the task's own stack. 
+ */ + if (!object_is_on_stack(addr)) + return false; + + aligned_addr = round_down((unsigned long)addr, sizeof(long)); + mem_ptr = round_down(aligned_addr, KASAN_SHADOW_SCALE_SIZE); + shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr); + shadow_bottom = kasan_mem_to_shadow(end_of_stack(current)); + + while (shadow_ptr >= shadow_bottom && *shadow_ptr != KASAN_STACK_LEFT) { + shadow_ptr--; + mem_ptr -= KASAN_SHADOW_SCALE_SIZE; + } + + while (shadow_ptr >= shadow_bottom && *shadow_ptr == KASAN_STACK_LEFT) { + shadow_ptr--; + mem_ptr -= KASAN_SHADOW_SCALE_SIZE; + } + + if (shadow_ptr < shadow_bottom) + return false; + + frame = (const unsigned long *)(mem_ptr + KASAN_SHADOW_SCALE_SIZE); + if (frame[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) { + pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n", + frame[0]); + return false; + } + + *offset = (unsigned long)addr - (unsigned long)frame; + *frame_descr = (const char *)frame[1]; + *frame_pc = (void *)frame[2]; + + return true; +} + +static void print_address_stack_frame(const void *addr) +{ + unsigned long offset; + const char *frame_descr; + const void *frame_pc; + + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + return; + + if (!get_address_stack_frame_info(addr, &offset, &frame_descr, + &frame_pc)) + return; + + /* + * get_address_stack_frame_info only returns true if the given addr is + * on the current task's stack. + */ + pr_err("\n"); + pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n", + addr, current->comm, task_pid_nr(current), offset); + pr_err(" %pS\n", frame_pc); + + if (!frame_descr) + return; + + print_decoded_frame_descr(frame_descr); +} + static void print_address_description(void *addr) { struct page *page = addr_to_page(addr); @@ -204,6 +367,8 @@ static void print_address_description(void *addr) pr_err("The buggy address belongs to the page:\n"); dump_page(page, "kasan: bad access detected"); } + + print_address_stack_frame(addr); } static bool row_is_guilty(const void *row, const void *guilty) From 19a33ca6c209f5e22c73b6beb4b1974153e93050 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:53:52 -0700 Subject: [PATCH 034/147] lib/test_kasan: add bitops tests Patch series "Bitops instrumentation for KASAN", v5. This patch (of 3): This adds bitops tests to the test_kasan module. In a follow-up patch, support for bitops instrumentation will be added. Link: http://lkml.kernel.org/r/20190613125950.197667-2-elver@google.com Signed-off-by: Marco Elver Acked-by: Mark Rutland Reviewed-by: Andrey Ryabinin Cc: Peter Zijlstra (Intel) Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: "H. 
Peter Anvin" Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Arnd Bergmann Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 3 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index e3c593c38eff..d85f25c65b0a 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -7,16 +7,17 @@ #define pr_fmt(fmt) "kasan test: %s " fmt, __func__ +#include #include +#include #include -#include #include +#include +#include #include #include #include #include -#include -#include /* * Note: test functions are marked noinline so that their names appear in @@ -619,6 +620,79 @@ static noinline void __init kasan_strings(void) strnlen(ptr, 1); } +static noinline void __init kasan_bitops(void) +{ + /* + * Allocate 1 more byte, which causes kzalloc to round up to 16-bytes; + * this way we do not actually corrupt other memory. + */ + long *bits = kzalloc(sizeof(*bits) + 1, GFP_KERNEL); + if (!bits) + return; + + /* + * Below calls try to access bit within allocated memory; however, the + * below accesses are still out-of-bounds, since bitops are defined to + * operate on the whole long the bit is in. + */ + pr_info("out-of-bounds in set_bit\n"); + set_bit(BITS_PER_LONG, bits); + + pr_info("out-of-bounds in __set_bit\n"); + __set_bit(BITS_PER_LONG, bits); + + pr_info("out-of-bounds in clear_bit\n"); + clear_bit(BITS_PER_LONG, bits); + + pr_info("out-of-bounds in __clear_bit\n"); + __clear_bit(BITS_PER_LONG, bits); + + pr_info("out-of-bounds in clear_bit_unlock\n"); + clear_bit_unlock(BITS_PER_LONG, bits); + + pr_info("out-of-bounds in __clear_bit_unlock\n"); + __clear_bit_unlock(BITS_PER_LONG, bits); + + pr_info("out-of-bounds in change_bit\n"); + change_bit(BITS_PER_LONG, bits); + + pr_info("out-of-bounds in __change_bit\n"); + __change_bit(BITS_PER_LONG, bits); + + /* + * Below calls try to access bit beyond allocated memory. 
+ */ + pr_info("out-of-bounds in test_and_set_bit\n"); + test_and_set_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); + + pr_info("out-of-bounds in __test_and_set_bit\n"); + __test_and_set_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); + + pr_info("out-of-bounds in test_and_set_bit_lock\n"); + test_and_set_bit_lock(BITS_PER_LONG + BITS_PER_BYTE, bits); + + pr_info("out-of-bounds in test_and_clear_bit\n"); + test_and_clear_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); + + pr_info("out-of-bounds in __test_and_clear_bit\n"); + __test_and_clear_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); + + pr_info("out-of-bounds in test_and_change_bit\n"); + test_and_change_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); + + pr_info("out-of-bounds in __test_and_change_bit\n"); + __test_and_change_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); + + pr_info("out-of-bounds in test_bit\n"); + (void)test_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); + +#if defined(clear_bit_unlock_is_negative_byte) + pr_info("out-of-bounds in clear_bit_unlock_is_negative_byte\n"); + clear_bit_unlock_is_negative_byte(BITS_PER_LONG + BITS_PER_BYTE, bits); +#endif + kfree(bits); +} + static int __init kmalloc_tests_init(void) { /* @@ -660,6 +734,7 @@ static int __init kmalloc_tests_init(void) kasan_memchr(); kasan_memcmp(); kasan_strings(); + kasan_bitops(); kasan_restore_multi_shot(multishot); From ff66135015726696568e998720d9b6afe2d04642 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:53:56 -0700 Subject: [PATCH 035/147] x86: use static_cpu_has in uaccess region to avoid instrumentation This patch is a pre-requisite for enabling KASAN bitops instrumentation; using static_cpu_has instead of boot_cpu_has avoids instrumentation of test_bit inside the uaccess region. With instrumentation, the KASAN check would otherwise be flagged by objtool. For consistency, kernel/signal.c was changed to mirror this change, however, is never instrumented with KASAN (currently unsupported under x86 32bit). Link: http://lkml.kernel.org/r/20190613125950.197667-3-elver@google.com Signed-off-by: Marco Elver Suggested-by: H. Peter Anvin Acked-by: Peter Zijlstra (Intel) Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Mark Rutland Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/kernel/signal.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 629d1ee05599..1cee10091b9f 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -358,7 +358,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, put_user_ex(ptr_to_compat(&frame->uc), &frame->puc); /* Create the ucontext. */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) + if (static_cpu_has(X86_FEATURE_XSAVE)) put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); else put_user_ex(0, &frame->uc.uc_flags); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 7cf508f78c8c..8eb7193e158d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -391,7 +391,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, put_user_ex(&frame->uc, &frame->puc); /* Create the ucontext. 
*/ - if (boot_cpu_has(X86_FEATURE_XSAVE)) + if (static_cpu_has(X86_FEATURE_XSAVE)) put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); else put_user_ex(0, &frame->uc.uc_flags); From 751ad98d5f881df91ba47e013b82422912381e8e Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:00 -0700 Subject: [PATCH 036/147] asm-generic, x86: add bitops instrumentation for KASAN This adds a new header to asm-generic to allow optionally instrumenting architecture-specific asm implementations of bitops. This change includes the required change for x86 as reference and changes the kernel API doc to point to bitops-instrumented.h instead. Rationale: the functions in x86's bitops.h are no longer the kernel API functions, but instead the arch_ prefixed functions, which are then instrumented via bitops-instrumented.h. Other architectures can similarly add support for asm implementations of bitops. The documentation text was derived from x86 and existing bitops asm-generic versions: 1) references to x86 have been removed; 2) as a result, some of the text had to be reworded for clarity and consistency. Tested using lib/test_kasan with bitops tests (pre-requisite patch). Bugzilla ref: https://bugzilla.kernel.org/show_bug.cgi?id=198439 Link: http://lkml.kernel.org/r/20190613125950.197667-4-elver@google.com Signed-off-by: Marco Elver Acked-by: Mark Rutland Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/kernel-api.rst | 2 +- arch/x86/include/asm/bitops.h | 189 ++++------------ include/asm-generic/bitops-instrumented.h | 263 ++++++++++++++++++++++ 3 files changed, 302 insertions(+), 152 deletions(-) create mode 100644 include/asm-generic/bitops-instrumented.h diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 824f24ccf401..08af5caf036d 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -54,7 +54,7 @@ The Linux kernel provides more basic utility functions. Bit Operations -------------- -.. kernel-doc:: arch/x86/include/asm/bitops.h +.. kernel-doc:: include/asm-generic/bitops-instrumented.h :internal: Bitmap Operations diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 8e790ec219a5..ba15d53c1ca7 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -49,23 +49,8 @@ #define CONST_MASK_ADDR(nr, addr) WBYTE_ADDR((void *)(addr) + ((nr)>>3)) #define CONST_MASK(nr) (1 << ((nr) & 7)) -/** - * set_bit - Atomically set a bit in memory - * @nr: the bit to set - * @addr: the address to start counting from - * - * This function is atomic and may not be reordered. See __set_bit() - * if you do not require the atomic guarantees. - * - * Note: there are no guarantees that this function will not be reordered - * on non x86 architectures, so if you are writing portable code, - * make sure not to rely on its reordering guarantees. - * - * Note that @nr may be almost arbitrarily large; this function is not - * restricted to acting on a single-word quantity. 
- */ static __always_inline void -set_bit(long nr, volatile unsigned long *addr) +arch_set_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "orb %1,%0" @@ -78,32 +63,14 @@ set_bit(long nr, volatile unsigned long *addr) } } -/** - * __set_bit - Set a bit in memory - * @nr: the bit to set - * @addr: the address to start counting from - * - * Unlike set_bit(), this function is non-atomic and may be reordered. - * If it's called on the same region of memory simultaneously, the effect - * may be that only one operation succeeds. - */ -static __always_inline void __set_bit(long nr, volatile unsigned long *addr) +static __always_inline void +arch___set_bit(long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } -/** - * clear_bit - Clears a bit in memory - * @nr: Bit to clear - * @addr: Address to start counting from - * - * clear_bit() is atomic and may not be reordered. However, it does - * not contain a memory barrier, so if it is used for locking purposes, - * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic() - * in order to ensure changes are visible on other processors. - */ static __always_inline void -clear_bit(long nr, volatile unsigned long *addr) +arch_clear_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "andb %1,%0" @@ -115,26 +82,21 @@ clear_bit(long nr, volatile unsigned long *addr) } } -/* - * clear_bit_unlock - Clears a bit in memory - * @nr: Bit to clear - * @addr: Address to start counting from - * - * clear_bit() is atomic and implies release semantics before the memory - * operation. It can be used for an unlock. - */ -static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr) +static __always_inline void +arch_clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); - clear_bit(nr, addr); + arch_clear_bit(nr, addr); } -static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) +static __always_inline void +arch___clear_bit(long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } -static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) +static __always_inline bool +arch_clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) { bool negative; asm volatile(LOCK_PREFIX "andb %2,%1" @@ -143,48 +105,23 @@ static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile : "ir" ((char) ~(1 << nr)) : "memory"); return negative; } +#define arch_clear_bit_unlock_is_negative_byte \ + arch_clear_bit_unlock_is_negative_byte -// Let everybody know we have it -#define clear_bit_unlock_is_negative_byte clear_bit_unlock_is_negative_byte - -/* - * __clear_bit_unlock - Clears a bit in memory - * @nr: Bit to clear - * @addr: Address to start counting from - * - * __clear_bit() is non-atomic and implies release semantics before the memory - * operation. It can be used for an unlock if no other CPUs can concurrently - * modify other bits in the word. 
- */ -static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) +static __always_inline void +arch___clear_bit_unlock(long nr, volatile unsigned long *addr) { - __clear_bit(nr, addr); + arch___clear_bit(nr, addr); } -/** - * __change_bit - Toggle a bit in memory - * @nr: the bit to change - * @addr: the address to start counting from - * - * Unlike change_bit(), this function is non-atomic and may be reordered. - * If it's called on the same region of memory simultaneously, the effect - * may be that only one operation succeeds. - */ -static __always_inline void __change_bit(long nr, volatile unsigned long *addr) +static __always_inline void +arch___change_bit(long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } -/** - * change_bit - Toggle a bit in memory - * @nr: Bit to change - * @addr: Address to start counting from - * - * change_bit() is atomic and may not be reordered. - * Note that @nr may be almost arbitrarily large; this function is not - * restricted to acting on a single-word quantity. - */ -static __always_inline void change_bit(long nr, volatile unsigned long *addr) +static __always_inline void +arch_change_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "xorb %1,%0" @@ -196,42 +133,20 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr) } } -/** - * test_and_set_bit - Set a bit and return its old value - * @nr: Bit to set - * @addr: Address to count from - * - * This operation is atomic and cannot be reordered. - * It also implies a memory barrier. - */ -static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline bool +arch_test_and_set_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, c, "Ir", nr); } -/** - * test_and_set_bit_lock - Set a bit and return its old value for lock - * @nr: Bit to set - * @addr: Address to count from - * - * This is the same as test_and_set_bit on x86. - */ static __always_inline bool -test_and_set_bit_lock(long nr, volatile unsigned long *addr) +arch_test_and_set_bit_lock(long nr, volatile unsigned long *addr) { - return test_and_set_bit(nr, addr); + return arch_test_and_set_bit(nr, addr); } -/** - * __test_and_set_bit - Set a bit and return its old value - * @nr: Bit to set - * @addr: Address to count from - * - * This operation is non-atomic and can be reordered. - * If two examples of this operation race, one can appear to succeed - * but actually fail. You must protect multiple accesses with a lock. - */ -static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline bool +arch___test_and_set_bit(long nr, volatile unsigned long *addr) { bool oldbit; @@ -242,28 +157,13 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * return oldbit; } -/** - * test_and_clear_bit - Clear a bit and return its old value - * @nr: Bit to clear - * @addr: Address to count from - * - * This operation is atomic and cannot be reordered. - * It also implies a memory barrier. 
- */ -static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline bool +arch_test_and_clear_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), *addr, c, "Ir", nr); } -/** - * __test_and_clear_bit - Clear a bit and return its old value - * @nr: Bit to clear - * @addr: Address to count from - * - * This operation is non-atomic and can be reordered. - * If two examples of this operation race, one can appear to succeed - * but actually fail. You must protect multiple accesses with a lock. - * +/* * Note: the operation is performed atomically with respect to * the local CPU, but not other CPUs. Portable code should not * rely on this behaviour. @@ -271,7 +171,8 @@ static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long * * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ -static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline bool +arch___test_and_clear_bit(long nr, volatile unsigned long *addr) { bool oldbit; @@ -282,8 +183,8 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long return oldbit; } -/* WARNING: non atomic and it can be reordered! */ -static __always_inline bool __test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline bool +arch___test_and_change_bit(long nr, volatile unsigned long *addr) { bool oldbit; @@ -295,15 +196,8 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon return oldbit; } -/** - * test_and_change_bit - Change a bit and return its old value - * @nr: Bit to change - * @addr: Address to count from - * - * This operation is atomic and cannot be reordered. - * It also implies a memory barrier. - */ -static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline bool +arch_test_and_change_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr); } @@ -326,16 +220,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l return oldbit; } -#if 0 /* Fool kernel-doc since it doesn't do macros yet */ -/** - * test_bit - Determine whether a bit is set - * @nr: bit number to test - * @addr: Address to start counting from - */ -static bool test_bit(int nr, const volatile unsigned long *addr); -#endif - -#define test_bit(nr, addr) \ +#define arch_test_bit(nr, addr) \ (__builtin_constant_p((nr)) \ ? constant_test_bit((nr), (addr)) \ : variable_test_bit((nr), (addr))) @@ -504,6 +389,8 @@ static __always_inline int fls64(__u64 x) #include +#include + #include #include diff --git a/include/asm-generic/bitops-instrumented.h b/include/asm-generic/bitops-instrumented.h new file mode 100644 index 000000000000..ddd1c6d9d8db --- /dev/null +++ b/include/asm-generic/bitops-instrumented.h @@ -0,0 +1,263 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * This file provides wrappers with sanitizer instrumentation for bit + * operations. + * + * To use this functionality, an arch's bitops.h file needs to define each of + * the below bit operations with an arch_ prefix (e.g. arch_set_bit(), + * arch___set_bit(), etc.). 
+ */ +#ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_H +#define _ASM_GENERIC_BITOPS_INSTRUMENTED_H + +#include + +/** + * set_bit - Atomically set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * This is a relaxed atomic operation (no implied memory barriers). + * + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static inline void set_bit(long nr, volatile unsigned long *addr) +{ + kasan_check_write(addr + BIT_WORD(nr), sizeof(long)); + arch_set_bit(nr, addr); +} + +/** + * __set_bit - Set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * Unlike set_bit(), this function is non-atomic. If it is called on the same + * region of memory concurrently, the effect may be that only one operation + * succeeds. + */ +static inline void __set_bit(long nr, volatile unsigned long *addr) +{ + kasan_check_write(addr + BIT_WORD(nr), sizeof(long)); + arch___set_bit(nr, addr); +} + +/** + * clear_bit - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * This is a relaxed atomic operation (no implied memory barriers). + */ +static inline void clear_bit(long nr, volatile unsigned long *addr) +{ + kasan_check_write(addr + BIT_WORD(nr), sizeof(long)); + arch_clear_bit(nr, addr); +} + +/** + * __clear_bit - Clears a bit in memory + * @nr: the bit to clear + * @addr: the address to start counting from + * + * Unlike clear_bit(), this function is non-atomic. If it is called on the same + * region of memory concurrently, the effect may be that only one operation + * succeeds. + */ +static inline void __clear_bit(long nr, volatile unsigned long *addr) +{ + kasan_check_write(addr + BIT_WORD(nr), sizeof(long)); + arch___clear_bit(nr, addr); +} + +/** + * clear_bit_unlock - Clear a bit in memory, for unlock + * @nr: the bit to set + * @addr: the address to start counting from + * + * This operation is atomic and provides release barrier semantics. + */ +static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) +{ + kasan_check_write(addr + BIT_WORD(nr), sizeof(long)); + arch_clear_bit_unlock(nr, addr); +} + +/** + * __clear_bit_unlock - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * This is a non-atomic operation but implies a release barrier before the + * memory operation. It can be used for an unlock if no other CPUs can + * concurrently modify other bits in the word. + */ +static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) +{ + kasan_check_write(addr + BIT_WORD(nr), sizeof(long)); + arch___clear_bit_unlock(nr, addr); +} + +/** + * change_bit - Toggle a bit in memory + * @nr: Bit to change + * @addr: Address to start counting from + * + * This is a relaxed atomic operation (no implied memory barriers). + * + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static inline void change_bit(long nr, volatile unsigned long *addr) +{ + kasan_check_write(addr + BIT_WORD(nr), sizeof(long)); + arch_change_bit(nr, addr); +} + +/** + * __change_bit - Toggle a bit in memory + * @nr: the bit to change + * @addr: the address to start counting from + * + * Unlike change_bit(), this function is non-atomic. If it is called on the same + * region of memory concurrently, the effect may be that only one operation + * succeeds. 
+ */ +static inline void __change_bit(long nr, volatile unsigned long *addr) +{ + kasan_check_write(addr + BIT_WORD(nr), sizeof(long)); + arch___change_bit(nr, addr); +} + +/** + * test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This is an atomic fully-ordered operation (implied full memory barrier). + */ +static inline bool test_and_set_bit(long nr, volatile unsigned long *addr) +{ + kasan_check_write(addr + BIT_WORD(nr), sizeof(long)); + return arch_test_and_set_bit(nr, addr); +} + +/** + * __test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is non-atomic. If two instances of this operation race, one + * can appear to succeed but actually fail. + */ +static inline bool __test_and_set_bit(long nr, volatile unsigned long *addr) +{ + kasan_check_write(addr + BIT_WORD(nr), sizeof(long)); + return arch___test_and_set_bit(nr, addr); +} + +/** + * test_and_set_bit_lock - Set a bit and return its old value, for lock + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and provides acquire barrier semantics if + * the returned value is 0. + * It can be used to implement bit locks. + */ +static inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr) +{ + kasan_check_write(addr + BIT_WORD(nr), sizeof(long)); + return arch_test_and_set_bit_lock(nr, addr); +} + +/** + * test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This is an atomic fully-ordered operation (implied full memory barrier). + */ +static inline bool test_and_clear_bit(long nr, volatile unsigned long *addr) +{ + kasan_check_write(addr + BIT_WORD(nr), sizeof(long)); + return arch_test_and_clear_bit(nr, addr); +} + +/** + * __test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This operation is non-atomic. If two instances of this operation race, one + * can appear to succeed but actually fail. + */ +static inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr) +{ + kasan_check_write(addr + BIT_WORD(nr), sizeof(long)); + return arch___test_and_clear_bit(nr, addr); +} + +/** + * test_and_change_bit - Change a bit and return its old value + * @nr: Bit to change + * @addr: Address to count from + * + * This is an atomic fully-ordered operation (implied full memory barrier). + */ +static inline bool test_and_change_bit(long nr, volatile unsigned long *addr) +{ + kasan_check_write(addr + BIT_WORD(nr), sizeof(long)); + return arch_test_and_change_bit(nr, addr); +} + +/** + * __test_and_change_bit - Change a bit and return its old value + * @nr: Bit to change + * @addr: Address to count from + * + * This operation is non-atomic. If two instances of this operation race, one + * can appear to succeed but actually fail. 
+ */ +static inline bool __test_and_change_bit(long nr, volatile unsigned long *addr) +{ + kasan_check_write(addr + BIT_WORD(nr), sizeof(long)); + return arch___test_and_change_bit(nr, addr); +} + +/** + * test_bit - Determine whether a bit is set + * @nr: bit number to test + * @addr: Address to start counting from + */ +static inline bool test_bit(long nr, const volatile unsigned long *addr) +{ + kasan_check_read(addr + BIT_WORD(nr), sizeof(long)); + return arch_test_bit(nr, addr); +} + +#if defined(arch_clear_bit_unlock_is_negative_byte) +/** + * clear_bit_unlock_is_negative_byte - Clear a bit in memory and test if bottom + * byte is negative, for unlock. + * @nr: the bit to clear + * @addr: the address to start counting from + * + * This operation is atomic and provides release barrier semantics. + * + * This is a bit of a one-trick-pony for the filemap code, which clears + * PG_locked and tests PG_waiters, + */ +static inline bool +clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) +{ + kasan_check_write(addr + BIT_WORD(nr), sizeof(long)); + return arch_clear_bit_unlock_is_negative_byte(nr, addr); +} +/* Let everybody know we have it. */ +#define clear_bit_unlock_is_negative_byte clear_bit_unlock_is_negative_byte +#endif + +#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_H */ From 7d8ad890dad00f6cd64bfb44d9be4fceb10cf819 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:03 -0700 Subject: [PATCH 037/147] mm/kasan: introduce __kasan_check_{read,write} Patch series "mm/kasan: Add object validation in ksize()", v3. This patch (of 5): This introduces __kasan_check_{read,write}. __kasan_check functions may be used from anywhere, even compilation units that disable instrumentation selectively. This change eliminates the need for the __KASAN_INTERNAL definition. [elver@google.com: v5] Link: http://lkml.kernel.org/r/20190708170706.174189-2-elver@google.com Link: http://lkml.kernel.org/r/20190626142014.141844-2-elver@google.com Signed-off-by: Marco Elver Acked-by: Mark Rutland Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan-checks.h | 25 ++++++++++++++++++++++--- mm/kasan/common.c | 10 ++++------ 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index a61dc075e2ce..221f05fbddd7 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -2,9 +2,28 @@ #ifndef _LINUX_KASAN_CHECKS_H #define _LINUX_KASAN_CHECKS_H -#if defined(__SANITIZE_ADDRESS__) || defined(__KASAN_INTERNAL) -void kasan_check_read(const volatile void *p, unsigned int size); -void kasan_check_write(const volatile void *p, unsigned int size); +/* + * __kasan_check_*: Always available when KASAN is enabled. This may be used + * even in compilation units that selectively disable KASAN, but must use KASAN + * to validate access to an address. Never use these in header files! 
+ */ +#ifdef CONFIG_KASAN +void __kasan_check_read(const volatile void *p, unsigned int size); +void __kasan_check_write(const volatile void *p, unsigned int size); +#else +static inline void __kasan_check_read(const volatile void *p, unsigned int size) +{ } +static inline void __kasan_check_write(const volatile void *p, unsigned int size) +{ } +#endif + +/* + * kasan_check_*: Only available when the particular compilation unit has KASAN + * instrumentation enabled. May be used in header files. + */ +#ifdef __SANITIZE_ADDRESS__ +#define kasan_check_read __kasan_check_read +#define kasan_check_write __kasan_check_write #else static inline void kasan_check_read(const volatile void *p, unsigned int size) { } diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 242fdc01aaa9..6bada42cc152 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -14,8 +14,6 @@ * */ -#define __KASAN_INTERNAL - #include #include #include @@ -89,17 +87,17 @@ void kasan_disable_current(void) current->kasan_depth--; } -void kasan_check_read(const volatile void *p, unsigned int size) +void __kasan_check_read(const volatile void *p, unsigned int size) { check_memory_region((unsigned long)p, size, false, _RET_IP_); } -EXPORT_SYMBOL(kasan_check_read); +EXPORT_SYMBOL(__kasan_check_read); -void kasan_check_write(const volatile void *p, unsigned int size) +void __kasan_check_write(const volatile void *p, unsigned int size) { check_memory_region((unsigned long)p, size, true, _RET_IP_); } -EXPORT_SYMBOL(kasan_check_write); +EXPORT_SYMBOL(__kasan_check_write); #undef memset void *memset(void *addr, int c, size_t len) From b5f6e0fc7d60e0234dac82498e90dfe9027bad1f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:07 -0700 Subject: [PATCH 038/147] mm/kasan: change kasan_check_{read,write} to return boolean This changes {,__}kasan_check_{read,write} functions to return a boolean denoting if the access was valid or not. [sfr@canb.auug.org.au: include types.h for "bool"] Link: http://lkml.kernel.org/r/20190705184949.13cdd021@canb.auug.org.au Link: http://lkml.kernel.org/r/20190626142014.141844-3-elver@google.com Signed-off-by: Marco Elver Signed-off-by: Stephen Rothwell Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mark Rutland Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan-checks.h | 30 ++++++++++++++++++++---------- mm/kasan/common.c | 8 ++++---- mm/kasan/generic.c | 13 +++++++------ mm/kasan/kasan.h | 10 +++++++++- mm/kasan/tags.c | 12 +++++++----- 5 files changed, 47 insertions(+), 26 deletions(-) diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index 221f05fbddd7..ac6aba632f2d 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -2,19 +2,25 @@ #ifndef _LINUX_KASAN_CHECKS_H #define _LINUX_KASAN_CHECKS_H +#include + /* * __kasan_check_*: Always available when KASAN is enabled. This may be used * even in compilation units that selectively disable KASAN, but must use KASAN * to validate access to an address. Never use these in header files! 
*/ #ifdef CONFIG_KASAN -void __kasan_check_read(const volatile void *p, unsigned int size); -void __kasan_check_write(const volatile void *p, unsigned int size); +bool __kasan_check_read(const volatile void *p, unsigned int size); +bool __kasan_check_write(const volatile void *p, unsigned int size); #else -static inline void __kasan_check_read(const volatile void *p, unsigned int size) -{ } -static inline void __kasan_check_write(const volatile void *p, unsigned int size) -{ } +static inline bool __kasan_check_read(const volatile void *p, unsigned int size) +{ + return true; +} +static inline bool __kasan_check_write(const volatile void *p, unsigned int size) +{ + return true; +} #endif /* @@ -25,10 +31,14 @@ static inline void __kasan_check_write(const volatile void *p, unsigned int size #define kasan_check_read __kasan_check_read #define kasan_check_write __kasan_check_write #else -static inline void kasan_check_read(const volatile void *p, unsigned int size) -{ } -static inline void kasan_check_write(const volatile void *p, unsigned int size) -{ } +static inline bool kasan_check_read(const volatile void *p, unsigned int size) +{ + return true; +} +static inline bool kasan_check_write(const volatile void *p, unsigned int size) +{ + return true; +} #endif #endif diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6bada42cc152..2277b82902d8 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -87,15 +87,15 @@ void kasan_disable_current(void) current->kasan_depth--; } -void __kasan_check_read(const volatile void *p, unsigned int size) +bool __kasan_check_read(const volatile void *p, unsigned int size) { - check_memory_region((unsigned long)p, size, false, _RET_IP_); + return check_memory_region((unsigned long)p, size, false, _RET_IP_); } EXPORT_SYMBOL(__kasan_check_read); -void __kasan_check_write(const volatile void *p, unsigned int size) +bool __kasan_check_write(const volatile void *p, unsigned int size) { - check_memory_region((unsigned long)p, size, true, _RET_IP_); + return check_memory_region((unsigned long)p, size, true, _RET_IP_); } EXPORT_SYMBOL(__kasan_check_write); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 504c79363a34..616f9dd82d12 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -166,29 +166,30 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) return memory_is_poisoned_n(addr, size); } -static __always_inline void check_memory_region_inline(unsigned long addr, +static __always_inline bool check_memory_region_inline(unsigned long addr, size_t size, bool write, unsigned long ret_ip) { if (unlikely(size == 0)) - return; + return true; if (unlikely((void *)addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { kasan_report(addr, size, write, ret_ip); - return; + return false; } if (likely(!memory_is_poisoned(addr, size))) - return; + return true; kasan_report(addr, size, write, ret_ip); + return false; } -void check_memory_region(unsigned long addr, size_t size, bool write, +bool check_memory_region(unsigned long addr, size_t size, bool write, unsigned long ret_ip) { - check_memory_region_inline(addr, size, write, ret_ip); + return check_memory_region_inline(addr, size, write, ret_ip); } void kasan_cache_shrink(struct kmem_cache *cache) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 1979db4763e2..014f19e76247 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -128,7 +128,15 @@ static inline bool addr_has_shadow(const void *addr) void kasan_poison_shadow(const void *address, size_t size, u8 
value); -void check_memory_region(unsigned long addr, size_t size, bool write, +/** + * check_memory_region - Check memory region, and report if invalid access. + * @addr: the accessed address + * @size: the accessed size + * @write: true if access is a write access + * @ret_ip: return address + * @return: true if access was valid, false if invalid + */ +bool check_memory_region(unsigned long addr, size_t size, bool write, unsigned long ret_ip); void *find_first_bad_addr(void *addr, size_t size); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 63fca3172659..0e987c9ca052 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -76,7 +76,7 @@ void *kasan_reset_tag(const void *addr) return reset_tag(addr); } -void check_memory_region(unsigned long addr, size_t size, bool write, +bool check_memory_region(unsigned long addr, size_t size, bool write, unsigned long ret_ip) { u8 tag; @@ -84,7 +84,7 @@ void check_memory_region(unsigned long addr, size_t size, bool write, void *untagged_addr; if (unlikely(size == 0)) - return; + return true; tag = get_tag((const void *)addr); @@ -106,22 +106,24 @@ void check_memory_region(unsigned long addr, size_t size, bool write, * set to KASAN_TAG_KERNEL (0xFF)). */ if (tag == KASAN_TAG_KERNEL) - return; + return true; untagged_addr = reset_tag((const void *)addr); if (unlikely(untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { kasan_report(addr, size, write, ret_ip); - return; + return false; } shadow_first = kasan_mem_to_shadow(untagged_addr); shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1); for (shadow = shadow_first; shadow <= shadow_last; shadow++) { if (*shadow != tag) { kasan_report(addr, size, write, ret_ip); - return; + return false; } } + + return true; } #define DEFINE_HWASAN_LOAD_STORE(size) \ From bb104ed78552147bed3a981fdada622afd2084b6 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:11 -0700 Subject: [PATCH 039/147] lib/test_kasan: Add test for double-kzfree detection Add a simple test that checks if double-kzfree is being detected correctly. 
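For context, kzfree() determines the allocation size via ksize(), zeroes that many bytes, and then frees the object, so a second call on the same pointer is both a double free and a use of freed metadata. A simplified sketch of that flow, consistent with the ksize() description later in this series (not code added by this patch):

	void kzfree(const void *p)
	{
		size_t ks;
		void *mem = (void *)p;

		if (unlikely(ZERO_OR_NULL_PTR(mem)))
			return;
		ks = ksize(mem);	/* size of the region to wipe */
		memset(mem, 0, ks);
		kfree(mem);
	}

The new test performs exactly that second call and expects KASAN to report it.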
Link: http://lkml.kernel.org/r/20190626142014.141844-4-elver@google.com Signed-off-by: Marco Elver Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mark Rutland Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index d85f25c65b0a..b63b367a94e8 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -693,6 +693,22 @@ static noinline void __init kasan_bitops(void) kfree(bits); } +static noinline void __init kmalloc_double_kzfree(void) +{ + char *ptr; + size_t size = 16; + + pr_info("double-free (kzfree)\n"); + ptr = kmalloc(size, GFP_KERNEL); + if (!ptr) { + pr_err("Allocation failed\n"); + return; + } + + kzfree(ptr); + kzfree(ptr); +} + static int __init kmalloc_tests_init(void) { /* @@ -735,6 +751,7 @@ static int __init kmalloc_tests_init(void) kasan_memcmp(); kasan_strings(); kasan_bitops(); + kmalloc_double_kzfree(); kasan_restore_multi_shot(multishot); From 10d1f8cb3965a6f633bf23eb984cda552927e3a5 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:14 -0700 Subject: [PATCH 040/147] mm/slab: refactor common ksize KASAN logic into slab_common.c This refactors common code of ksize() between the various allocators into slab_common.c: __ksize() is the allocator-specific implementation without instrumentation, whereas ksize() includes the required KASAN logic. Link: http://lkml.kernel.org/r/20190626142014.141844-5-elver@google.com Signed-off-by: Marco Elver Acked-by: Christoph Lameter Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mark Rutland Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 1 + mm/slab.c | 22 +++++----------------- mm/slab_common.c | 26 ++++++++++++++++++++++++++ mm/slob.c | 4 ++-- mm/slub.c | 14 ++------------ 5 files changed, 36 insertions(+), 31 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 9449b19c5f10..98c3d12b7275 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -184,6 +184,7 @@ void * __must_check __krealloc(const void *, size_t, gfp_t); void * __must_check krealloc(const void *, size_t, gfp_t); void kfree(const void *); void kzfree(const void *); +size_t __ksize(const void *); size_t ksize(const void *); #ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR diff --git a/mm/slab.c b/mm/slab.c index db01e9aae31b..3521a351ceb5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4204,20 +4204,12 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, #endif /* CONFIG_HARDENED_USERCOPY */ /** - * ksize - get the actual amount of memory allocated for a given object - * @objp: Pointer to the object + * __ksize -- Uninstrumented ksize. * - * kmalloc may internally round up allocations and return more memory - * than requested. ksize() can be used to determine the actual amount of - * memory allocated. The caller may use this additional memory, even though - * a smaller amount of memory was initially specified with the kmalloc call. - * The caller must guarantee that objp points to a valid object previously - * allocated with either kmalloc() or kmem_cache_alloc(). The object - * must not be freed during the duration of the call. 
- * - * Return: size of the actual memory used by @objp in bytes + * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same + * safety checks as ksize() with KASAN instrumentation enabled. */ -size_t ksize(const void *objp) +size_t __ksize(const void *objp) { struct kmem_cache *c; size_t size; @@ -4228,11 +4220,7 @@ size_t ksize(const void *objp) c = virt_to_cache(objp); size = c ? c->object_size : 0; - /* We assume that ksize callers could use the whole allocated area, - * so we need to unpoison this area. - */ - kasan_unpoison_shadow(objp, size); return size; } -EXPORT_SYMBOL(ksize); +EXPORT_SYMBOL(__ksize); diff --git a/mm/slab_common.c b/mm/slab_common.c index 58251ba63e4a..b7c6a40e436a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1597,6 +1597,32 @@ void kzfree(const void *p) } EXPORT_SYMBOL(kzfree); +/** + * ksize - get the actual amount of memory allocated for a given object + * @objp: Pointer to the object + * + * kmalloc may internally round up allocations and return more memory + * than requested. ksize() can be used to determine the actual amount of + * memory allocated. The caller may use this additional memory, even though + * a smaller amount of memory was initially specified with the kmalloc call. + * The caller must guarantee that objp points to a valid object previously + * allocated with either kmalloc() or kmem_cache_alloc(). The object + * must not be freed during the duration of the call. + * + * Return: size of the actual memory used by @objp in bytes + */ +size_t ksize(const void *objp) +{ + size_t size = __ksize(objp); + /* + * We assume that ksize callers could use whole allocated area, + * so we need to unpoison this area. + */ + kasan_unpoison_shadow(objp, size); + return size; +} +EXPORT_SYMBOL(ksize); + /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); diff --git a/mm/slob.c b/mm/slob.c index 84aefd9b91ee..7f421d0ca9ab 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -527,7 +527,7 @@ void kfree(const void *block) EXPORT_SYMBOL(kfree); /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ -size_t ksize(const void *block) +size_t __ksize(const void *block) { struct page *sp; int align; @@ -545,7 +545,7 @@ size_t ksize(const void *block) m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; } -EXPORT_SYMBOL(ksize); +EXPORT_SYMBOL(__ksize); int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) { diff --git a/mm/slub.c b/mm/slub.c index d46a91759b96..5e217653286c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3895,7 +3895,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, } #endif /* CONFIG_HARDENED_USERCOPY */ -static size_t __ksize(const void *object) +size_t __ksize(const void *object) { struct page *page; @@ -3911,17 +3911,7 @@ static size_t __ksize(const void *object) return slab_ksize(page->slab_cache); } - -size_t ksize(const void *object) -{ - size_t size = __ksize(object); - /* We assume that ksize callers could use whole allocated area, - * so we need to unpoison this area. - */ - kasan_unpoison_shadow(object, size); - return size; -} -EXPORT_SYMBOL(ksize); +EXPORT_SYMBOL(__ksize); void kfree(const void *x) { From 0d4ca4c9bab397b525c9a4f875d31410ce4bc738 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:18 -0700 Subject: [PATCH 041/147] mm/kasan: add object validation in ksize() ksize() has been unconditionally unpoisoning the whole shadow memory region associated with an allocation. 
This can lead to various undetected bugs, for example, double-kzfree(). Specifically, kzfree() uses ksize() to determine the actual allocation size, and subsequently zeroes the memory. Since ksize() used to just unpoison the whole shadow memory region, no invalid free was detected. This patch addresses this as follows: 1. Add a check in ksize(), and only then unpoison the memory region. 2. Preserve kasan_unpoison_slab() semantics by explicitly unpoisoning the shadow memory region using the size obtained from __ksize(). Tested: 1. With SLAB allocator: a) normal boot without warnings; b) verified the added double-kzfree() is detected. 2. With SLUB allocator: a) normal boot without warnings; b) verified the added double-kzfree() is detected. [elver@google.com: s/BUG_ON/WARN_ON_ONCE/, per Kees] Link: http://lkml.kernel.org/r/20190627094445.216365-6-elver@google.com Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=199359 Link: http://lkml.kernel.org/r/20190626142014.141844-6-elver@google.com Signed-off-by: Marco Elver Acked-by: Kees Cook Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 7 +++++-- mm/slab_common.c | 22 +++++++++++++++++++++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b40ea104dd36..cc8a03cc9674 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -76,8 +76,11 @@ void kasan_free_shadow(const struct vm_struct *vm); int kasan_add_zero_shadow(void *start, unsigned long size); void kasan_remove_zero_shadow(void *start, unsigned long size); -size_t ksize(const void *); -static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); } +size_t __ksize(const void *); +static inline void kasan_unpoison_slab(const void *ptr) +{ + kasan_unpoison_shadow(ptr, __ksize(ptr)); +} size_t kasan_metadata_size(struct kmem_cache *cache); bool kasan_save_enable_multi_shot(void); diff --git a/mm/slab_common.c b/mm/slab_common.c index b7c6a40e436a..a09bb10aa026 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1613,7 +1613,27 @@ EXPORT_SYMBOL(kzfree); */ size_t ksize(const void *objp) { - size_t size = __ksize(objp); + size_t size; + + if (WARN_ON_ONCE(!objp)) + return 0; + /* + * We need to check that the pointed to object is valid, and only then + * unpoison the shadow memory below. We use __kasan_check_read(), to + * generate a more useful report at the time ksize() is called (rather + * than later where behaviour is undefined due to potential + * use-after-free or double-free). + * + * If the pointed to memory is invalid we return 0, to avoid users of + * ksize() writing to and potentially corrupting the memory region. + * + * We want to perform the check before __ksize(), to avoid potentially + * crashing in __ksize() due to accessing invalid metadata. + */ + if (unlikely(objp == ZERO_SIZE_PTR) || !__kasan_check_read(objp, 1)) + return 0; + + size = __ksize(objp); /* * We assume that ksize callers could use whole allocated area, * so we need to unpoison this area. 
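From a caller's point of view the change looks as follows (a hypothetical snippet, not taken from the patch): once ksize() validates its argument, handing it a freed pointer produces a KASAN report and a zero return instead of silently re-unpoisoning the freed object's shadow memory.

	size_t sz;
	char *p = kmalloc(32, GFP_KERNEL);

	kfree(p);
	sz = ksize(p);	/* KASAN reports the invalid access; sz == 0 */

This is what allows the double-kzfree added to lib/test_kasan.c earlier in the series to be detected.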
From 2236b99d6a33df72befa7205c2d8381aca7ae701 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 11 Jul 2019 20:54:21 -0700 Subject: [PATCH 042/147] include/linux/pfn_t.h: remove pfn_t_to_virt() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It has no callers and there is no virt_to_pfn_t(). Reported-by: Anshuman Khandual Cc: Dan Williams Cc: Jérôme Glisse Cc: Laurent Dufour Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pfn_t.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 3c202a11a79e..01e8037023f7 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -66,13 +66,6 @@ static inline phys_addr_t pfn_t_to_phys(pfn_t pfn) return PFN_PHYS(pfn_t_to_pfn(pfn)); } -static inline void *pfn_t_to_virt(pfn_t pfn) -{ - if (pfn_t_has_page(pfn) && !is_device_private_page(pfn_t_to_page(pfn))) - return __va(pfn_t_to_phys(pfn)); - return NULL; -} - static inline pfn_t page_to_pfn_t(struct page *page) { return pfn_to_pfn_t(page_to_pfn(page)); From 03069bb0b19f8c3b6e079099dd8ac5e974756381 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 11 Jul 2019 20:54:24 -0700 Subject: [PATCH 043/147] arm: remove ARCH_SELECT_MEMORY_MODEL Patch series "remove ARCH_SELECT_MEMORY_MODEL where it has no effect". For several architectures the ARCH_SELECT_MEMORY_MODEL has no real effect because the dependencies for the memory model are always evaluated to a single value. Remove the ARCH_SELECT_MEMORY_MODEL from the Kconfigs for these architectures. This patch (of 3): The ARCH_SELECT_MEMORY_MODEL in arch/arm/Kconfig is enabled only when ARCH_SPARSEMEM_ENABLE=y. But in this case, ARCH_SPARSEMEM_DEFAULT is also enabled and this in turn enables SPARSEMEM_MANUAL. Since there is no definition of ARCH_FLATMEM_ENABLE in arch/arm/Kconfig, SPARSEMEM_MANUAL is the only enabled memory model, hence the final selection will evaluate to SPARSEMEM=y. Since ARCH_SPARSEMEM_ENABLE is set to 'y' only by several sub-arch configurations, the default for must sub-arches would be the falback to FLATMEM regardless of ARCH_SELECT_MEMORY_MODEL. Link: http://lkml.kernel.org/r/1556740577-4140-2-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Heiko Carstens Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index d850feb5cc0a..0c55aa199c67 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1622,9 +1622,6 @@ config ARCH_SPARSEMEM_ENABLE config ARCH_SPARSEMEM_DEFAULT def_bool ARCH_SPARSEMEM_ENABLE -config ARCH_SELECT_MEMORY_MODEL - def_bool ARCH_SPARSEMEM_ENABLE - config HAVE_ARCH_PFN_VALID def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM From a9d8777e397e3d1630377fa1d0f0dea0329057c3 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 11 Jul 2019 20:54:27 -0700 Subject: [PATCH 044/147] s390: remove ARCH_SELECT_MEMORY_MODEL The only reason s390 has ARCH_SELECT_MEMORY_MODEL option in arch/s390/Kconfig is an ancient compile error with allnoconfig which was fixed by commit 97195d6b411f ("[S390] fix sparsemem related compile error with allnoconfig on s390") by adding the ARCH_SELECT_MEMORY_MODEL option. Since then a lot have changed and now allnoconfig builds just fine without ARCH_SELECT_MEMORY_MODEL, so it can be removed. 
Link: http://lkml.kernel.org/r/1556740577-4140-3-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Heiko Carstens Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/Kconfig | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index fdb4246265a5..f089ae375f6b 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -641,9 +641,6 @@ config ARCH_SPARSEMEM_ENABLE config ARCH_SPARSEMEM_DEFAULT def_bool y -config ARCH_SELECT_MEMORY_MODEL - def_bool y - config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y if SPARSEMEM From 445676071776de492624a063923736637f9eb3e8 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 11 Jul 2019 20:54:31 -0700 Subject: [PATCH 045/147] sparc: remove ARCH_SELECT_MEMORY_MODEL The ARCH_SELECT_MEMORY_MODEL option is enabled only for 64-bit. However, 64-bit configuration also enables ARCH_SPARSEMEM_DEFAULT and there is no ARCH_FLATMEM_ENABLE in arch/sparc/Kconfig. With such settings, the dependencies in mm/Kconfig are always evaluated to SPARSEMEM=y for 64-bit and to FLATMEM=y for 32-bit. The ARCH_SELECT_MEMORY_MODEL option in arch/sparc/Kconfig does not affect anything and can be removed. Link: http://lkml.kernel.org/r/1556740577-4140-4-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Heiko Carstens Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/Kconfig | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 26ab6f5bbaaf..8fc95c7809ef 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -300,9 +300,6 @@ config NODES_SPAN_OTHER_NODES def_bool y depends on NEED_MULTIPLE_NODES -config ARCH_SELECT_MEMORY_MODEL - def_bool y if SPARC64 - config ARCH_SPARSEMEM_ENABLE def_bool y if SPARC64 select SPARSEMEM_VMEMMAP_ENABLE From a7030aea20d9191c271607aa1478f72a41948b0a Mon Sep 17 00:00:00 2001 From: Bharath Vedartham Date: Thu, 11 Jul 2019 20:54:34 -0700 Subject: [PATCH 046/147] mm/gup.c: make follow_page_mask() static follow_page_mask() is only used in gup.c, make it static. Link: http://lkml.kernel.org/r/20190510190831.GA4061@bharath12345-Inspiron-5559 Signed-off-by: Bharath Vedartham Reviewed-by: Ira Weiny Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index ddde097cf9e4..f1c1daebc425 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -515,7 +515,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, * an error pointer if there is a mapping to something not represented * by a page descriptor (see also vm_normal_page()). */ -struct page *follow_page_mask(struct vm_area_struct *vma, +static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct follow_page_context *ctx) { From 465fc3a9b3129722b0df395529b3894b0b90d2de Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 11 Jul 2019 20:54:37 -0700 Subject: [PATCH 047/147] mm/memory.c: trivial clean up in insert_page() Make the success case use the same cleanup path as the failure case. 
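The target style is the usual kernel single-exit pattern, sketched generically below (an illustration only, assuming <linux/mutex.h> and <linux/errno.h>; this is not the insert_page() code itself):

	/* Success and failure both leave through the same unlock path. */
	static int do_work(struct mutex *lock, int arg)
	{
		int ret = -EINVAL;

		mutex_lock(lock);
		if (arg < 0)
			goto out_unlock;	/* failure path */

		ret = 0;	/* success falls through to the shared cleanup */
	out_unlock:
		mutex_unlock(lock);
		return ret;
	}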
Link: http://lkml.kernel.org/r/20190523134024.GC24093@localhost.localdomain Signed-off-by: Miklos Szeredi Reviewed-by: Andrew Morton Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ddf20bd0c317..ced4bedc660d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1475,8 +1475,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, set_pte_at(mm, addr, pte, mk_pte(page, prot)); retval = 0; - pte_unmap_unlock(pte, ptl); - return retval; out_unlock: pte_unmap_unlock(pte, ptl); out: From 442a5a9a9295bfd9b0cffd0691ef8a6ce81db7c4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Jul 2019 20:54:40 -0700 Subject: [PATCH 048/147] mm: make !CONFIG_HUGE_PAGE wrappers into static inlines Instead of using defines, which loses type safety and provokes unused variable warnings from gcc, put the constants into static inlines. Link: http://lkml.kernel.org/r/20190522235102.GA15370@mellanox.com Signed-off-by: Jason Gunthorpe Suggested-by: Andrew Morton Reviewed-by: Mike Kravetz Cc: Jerome Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 102 +++++++++++++++++++++++++++++++++------- 1 file changed, 86 insertions(+), 16 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index edf476c8cfb9..f895a79c6f5c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -608,22 +608,92 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; -#define alloc_huge_page(v, a, r) NULL -#define alloc_huge_page_node(h, nid) NULL -#define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL -#define alloc_huge_page_vma(h, vma, address) NULL -#define alloc_bootmem_huge_page(h) NULL -#define hstate_file(f) NULL -#define hstate_sizelog(s) NULL -#define hstate_vma(v) NULL -#define hstate_inode(i) NULL -#define page_hstate(page) NULL -#define huge_page_size(h) PAGE_SIZE -#define huge_page_mask(h) PAGE_MASK -#define vma_kernel_pagesize(v) PAGE_SIZE -#define vma_mmu_pagesize(v) PAGE_SIZE -#define huge_page_order(h) 0 -#define huge_page_shift(h) PAGE_SHIFT + +static inline struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, + int avoid_reserve) +{ + return NULL; +} + +static inline struct page *alloc_huge_page_node(struct hstate *h, int nid) +{ + return NULL; +} + +static inline struct page * +alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask) +{ + return NULL; +} + +static inline struct page *alloc_huge_page_vma(struct hstate *h, + struct vm_area_struct *vma, + unsigned long address) +{ + return NULL; +} + +static inline int __alloc_bootmem_huge_page(struct hstate *h) +{ + return 0; +} + +static inline struct hstate *hstate_file(struct file *f) +{ + return NULL; +} + +static inline struct hstate *hstate_sizelog(int page_size_log) +{ + return NULL; +} + +static inline struct hstate *hstate_vma(struct vm_area_struct *vma) +{ + return NULL; +} + +static inline struct hstate *hstate_inode(struct inode *i) +{ + return NULL; +} + +static inline struct hstate *page_hstate(struct page *page) +{ + return NULL; +} + +static inline unsigned long huge_page_size(struct hstate *h) +{ + return PAGE_SIZE; +} + +static inline unsigned long huge_page_mask(struct hstate *h) +{ + return PAGE_MASK; +} + +static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) +{ + 
return PAGE_SIZE; +} + +static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + return PAGE_SIZE; +} + +static inline unsigned int huge_page_order(struct hstate *h) +{ + return 0; +} + +static inline unsigned int huge_page_shift(struct hstate *h) +{ + return PAGE_SHIFT; +} + static inline bool hstate_is_gigantic(struct hstate *h) { return false; From 219f8a2e25f0abbe222b170a0de2fd38c22d43ad Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 11 Jul 2019 20:54:43 -0700 Subject: [PATCH 049/147] include/linux/mm_types.h: ifdef struct vm_area_struct::swap_readahead_info The field is only used in swap code. Link: http://lkml.kernel.org/r/20190503190500.GA30589@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8ec38b11b361..1d1093474c1a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -329,7 +329,9 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ +#ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; +#endif #ifndef CONFIG_MMU struct vm_region *vm_region; /* NOMMU mapping region */ #endif From ac1c3e49a9a734150b33297eeca5b43d92fd5be8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:54:46 -0700 Subject: [PATCH 050/147] mm: remove the account_page_dirtied export account_page_dirtied() is only used by our set_page_dirty() helpers and should not be used anywhere else. Link: http://lkml.kernel.org/r/20190605183702.30572-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bdbe8b6b1225..1804f64ff43c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2429,7 +2429,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) this_cpu_inc(bdp_ratelimits); } } -EXPORT_SYMBOL(account_page_dirtied); /* * Helper function for deaccounting dirty page without writeback. From 1fcf0a561cd09d7fb7f7afa2ddfe05f72f32050e Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Thu, 11 Jul 2019 20:54:49 -0700 Subject: [PATCH 051/147] mm/page_isolation.c: change the prototype of undo_isolate_page_range() undo_isolate_page_range() never fails, so no need to return value. Link: http://lkml.kernel.org/r/1562075604-8979-1-git-send-email-kernelfans@gmail.com Signed-off-by: Pingfan Liu Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Reviewed-by: Anshuman Khandual Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 2 +- mm/page_isolation.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 280ae96dc4c3..1099c2fee20f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -50,7 +50,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. 
* target range is [start_pfn, end_pfn) */ -int +void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index e3638a5bafff..89c19c0feadb 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -230,7 +230,7 @@ undo: /* * Make isolated pages available again. */ -int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, +void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype) { unsigned long pfn; @@ -247,7 +247,6 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, continue; unset_migratetype_isolate(page, migratetype); } - return 0; } /* * Test all pages in the range is free(means isolated) or not. From 51b176290496518d6701bc40e63f70e4b6870198 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 11 Jul 2019 20:54:52 -0700 Subject: [PATCH 052/147] include/linux/vmpressure.h: use spinlock_t instead of struct spinlock For spinlocks the type spinlock_t should be used instead of "struct spinlock". Use spinlock_t for spinlock's definition. Link: http://lkml.kernel.org/r/20190704153803.12739-3-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmpressure.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 61e6fddfb26f..6d28bc433c1c 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -17,7 +17,7 @@ struct vmpressure { unsigned long tree_scanned; unsigned long tree_reclaimed; /* The lock is used to keep the scanned/reclaimed above in sync. */ - struct spinlock sr_lock; + spinlock_t sr_lock; /* The list of vmpressure_event structs. */ struct list_head events; From 98ef2046f28b642ad35935972a173194ea58a21d Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Thu, 11 Jul 2019 20:54:56 -0700 Subject: [PATCH 053/147] mm: remove the exporting of totalram_pages Previously totalram_pages was the global variable. Currently, totalram_pages is the static inline function from the include/linux/mm.h However, the function is also marked as EXPORT_SYMBOL, which is at best an odd combination. Because there is no point for the static inline function from a public header to be exported, this commit removes the EXPORT_SYMBOL() marking. It will be still possible to use the function in modules because all the symbols it depends on are exported. 
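For reference, the helper in question looks roughly like this (a simplified sketch of the
include/linux/mm.h definition introduced by ca79b0c211af, not a verbatim copy); since it
only reads the already-exported atomic counter, modules can keep using it without any
extra export:

    extern atomic_long_t _totalram_pages;

    static inline unsigned long totalram_pages(void)
    {
        /* only reads the exported counter, so the inline itself
         * needs no EXPORT_SYMBOL */
        return (unsigned long)atomic_long_read(&_totalram_pages);
    }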
Link: http://lkml.kernel.org/r/20190710141031.15642-1-efremov@linux.com Fixes: ca79b0c211af6 ("mm: convert totalram_pages and totalhigh_pages variables to atomic") Signed-off-by: Denis Efremov Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Mel Gorman Cc: Mike Rapoport Cc: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e3bc949ebcc..060303496094 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -224,8 +224,6 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { [ZONE_MOVABLE] = 0, }; -EXPORT_SYMBOL(totalram_pages); - static char * const zone_names[MAX_NR_ZONES] = { #ifdef CONFIG_ZONE_DMA "DMA", From f445884562dd8bc51eb4136bd21f014403d1813d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 11 Jul 2019 20:54:59 -0700 Subject: [PATCH 054/147] include/linux/pagemap.h: document trylock_page() return value Cc: Henry Burns Cc: Jonathan Adams Cc: David Rientjes Cc: Mike Rapoport Cc: Vitaly Wool Cc: Xidong Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fe0b29bf2df7..6fd0d3aa492c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -452,6 +452,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags); extern void unlock_page(struct page *page); +/* + * Return true if the page was successfully locked + */ static inline int trylock_page(struct page *page) { page = compound_head(page); From a9659476d4b391aec3f5357f176b63b9f8c46231 Mon Sep 17 00:00:00 2001 From: Nicolas Boichat Date: Thu, 11 Jul 2019 20:55:03 -0700 Subject: [PATCH 055/147] mm/failslab.c: by default, do not fail allocations with direct reclaim only When failslab was originally written, the intention of the "ignore-gfp-wait" flag default value ("N") was to fail GFP_ATOMIC allocations. Those were defined as (__GFP_HIGH), and the code would test for __GFP_WAIT (0x10u). However, since then, __GFP_WAIT was replaced by __GFP_RECLAIM (___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM), and GFP_ATOMIC is now defined as (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM). This means that when the flag is false, almost no allocation ever fails (as even GFP_ATOMIC allocations contain ___GFP_KSWAPD_RECLAIM). Restore the original intent of the code, by ignoring calls that directly reclaim only (__GFP_DIRECT_RECLAIM), and thus, failing GFP_ATOMIC calls again by default. 
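To make the flag arithmetic concrete, the intended behaviour can be sketched as the
following hypothetical predicate (the real check lives in __should_failslab(), shown in
the hunk below):

    static bool failslab_exempt(gfp_t gfpflags)
    {
        /*
         * GFP_KERNEL contains __GFP_DIRECT_RECLAIM (via __GFP_RECLAIM)
         * and so stays exempt from fault injection by default, while
         * GFP_ATOMIC only carries __GFP_KSWAPD_RECLAIM and therefore
         * becomes failable again.
         */
        return !!(gfpflags & __GFP_DIRECT_RECLAIM);
    }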
Link: http://lkml.kernel.org/r/20190520214514.81360-1-drinkcat@chromium.org Fixes: 71baba4b92dc1fa1 ("mm, page_alloc: rename __GFP_WAIT to __GFP_RECLAIM") Signed-off-by: Nicolas Boichat Reviewed-by: Akinobu Mita Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/failslab.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/failslab.c b/mm/failslab.c index ec5aad211c5b..f92fed91ac23 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -23,7 +23,8 @@ bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) if (gfpflags & __GFP_NOFAIL) return false; - if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM)) + if (failslab.ignore_gfp_reclaim && + (gfpflags & __GFP_DIRECT_RECLAIM)) return false; if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB)) From 96a2b03f281d3a3b29c27028164f43090d6495b9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 11 Jul 2019 20:55:06 -0700 Subject: [PATCH 056/147] mm, debug_pagelloc: use static keys to enable debugging Patch series "debug_pagealloc improvements". I have been recently debugging some pcplist corruptions, where it would be useful to perform struct page checks immediately as pages are allocated from and freed to pcplists, which is now only possible by rebuilding the kernel with CONFIG_DEBUG_VM (details in Patch 2 changelog). To make this kind of debugging simpler in future on a distro kernel, I have improved CONFIG_DEBUG_PAGEALLOC so that it has even smaller overhead when not enabled at boot time (Patch 1) and also when enabled (Patch 3), and extended it to perform the struct page checks more often when enabled (Patch 2). Now it can be configured in when building a distro kernel without extra overhead, and debugging page use after free or double free can be enabled simply by rebooting with debug_pagealloc=on. This patch (of 3): CONFIG_DEBUG_PAGEALLOC has been redesigned by 031bc5743f15 ("mm/debug-pagealloc: make debug-pagealloc boottime configurable") to allow being always enabled in a distro kernel, but only perform its expensive functionality when booted with debug_pagelloc=on. We can further reduce the overhead when not boot-enabled (including page allocator fast paths) using static keys. This patch introduces one for debug_pagealloc core functionality, and another for the optional guard page functionality (enabled by booting with debug_guardpage_minorder=X). Link: http://lkml.kernel.org/r/20190603143451.27353-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Andrew Morton Cc: Joonsoo Kim Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Vlastimil Babka Cc: Matthew Wilcox Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +++++++++++---- mm/page_alloc.c | 23 +++++++++++++++++------ 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0a6dae2f2b84..2c2e98cae2d1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2701,11 +2701,18 @@ static inline void kernel_poison_pages(struct page *page, int numpages, int enable) { } #endif -extern bool _debug_pagealloc_enabled; +#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT +DECLARE_STATIC_KEY_TRUE(_debug_pagealloc_enabled); +#else +DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); +#endif static inline bool debug_pagealloc_enabled(void) { - return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled; + if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) + return false; + + return static_branch_unlikely(&_debug_pagealloc_enabled); } #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) @@ -2859,7 +2866,7 @@ extern struct page_ext_operations debug_guardpage_ops; #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; -extern bool _debug_guardpage_enabled; +DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); static inline unsigned int debug_guardpage_minorder(void) { @@ -2868,7 +2875,7 @@ static inline unsigned int debug_guardpage_minorder(void) static inline bool debug_guardpage_enabled(void) { - return _debug_guardpage_enabled; + return static_branch_unlikely(&_debug_guardpage_enabled); } static inline bool page_is_guard(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 060303496094..3180d79be20c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -644,16 +644,27 @@ void prep_compound_page(struct page *page, unsigned int order) #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; -bool _debug_pagealloc_enabled __read_mostly - = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); + +#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT +DEFINE_STATIC_KEY_TRUE(_debug_pagealloc_enabled); +#else +DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); +#endif EXPORT_SYMBOL(_debug_pagealloc_enabled); -bool _debug_guardpage_enabled __read_mostly; + +DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); static int __init early_debug_pagealloc(char *buf) { - if (!buf) + bool enable = false; + + if (kstrtobool(buf, &enable)) return -EINVAL; - return kstrtobool(buf, &_debug_pagealloc_enabled); + + if (enable) + static_branch_enable(&_debug_pagealloc_enabled); + + return 0; } early_param("debug_pagealloc", early_debug_pagealloc); @@ -677,7 +688,7 @@ static void init_debug_guardpage(void) if (!debug_guardpage_minorder()) return; - _debug_guardpage_enabled = true; + static_branch_enable(&_debug_guardpage_enabled); } struct page_ext_operations debug_guardpage_ops = { From 4462b32c9285b521ef378907aa66a5ca485aae41 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 11 Jul 2019 20:55:09 -0700 Subject: [PATCH 057/147] mm, page_alloc: more extensive free page checking with debug_pagealloc The page allocator checks struct pages for expected state (mapcount, flags etc) as pages are being allocated (check_new_page()) and freed (free_pages_check()) to provide some defense against errors in page allocator users. 
Prior commits 479f854a207c ("mm, page_alloc: defer debugging checks of pages allocated from the PCP") and 4db7548ccbd9 ("mm, page_alloc: defer debugging checks of freed pages until a PCP drain") this has happened for order-0 pages as they were allocated from or freed to the per-cpu caches (pcplists). Since those are fast paths, the checks are now performed only when pages are moved between pcplists and global free lists. This however lowers the chances of catching errors soon enough. In order to increase the chances of the checks to catch errors, the kernel has to be rebuilt with CONFIG_DEBUG_VM, which also enables multiple other internal debug checks (VM_BUG_ON() etc), which is suboptimal when the goal is to catch errors in mm users, not in mm code itself. To catch some wrong users of the page allocator we have CONFIG_DEBUG_PAGEALLOC, which is designed to have virtually no overhead unless enabled at boot time. Memory corruptions when writing to freed pages have often the same underlying errors (use-after-free, double free) as corrupting the corresponding struct pages, so this existing debugging functionality is a good fit to extend by also perform struct page checks at least as often as if CONFIG_DEBUG_VM was enabled. Specifically, after this patch, when debug_pagealloc is enabled on boot, and CONFIG_DEBUG_VM disabled, pages are checked when allocated from or freed to the pcplists *in addition* to being moved between pcplists and free lists. When both debug_pagealloc and CONFIG_DEBUG_VM are enabled, pages are checked when being moved between pcplists and free lists *in addition* to when allocated from or freed to the pcplists. When debug_pagealloc is not enabled on boot, the overhead in fast paths should be virtually none thanks to the use of static key. Link: http://lkml.kernel.org/r/20190603143451.27353-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Andrew Morton Cc: Mel Gorman Cc: Joonsoo Kim Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig.debug | 13 ++++++++---- mm/page_alloc.c | 53 +++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index fa6d79281368..a35ab6c55192 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -19,12 +19,17 @@ config DEBUG_PAGEALLOC Depending on runtime enablement, this results in a small or large slowdown, but helps to find certain types of memory corruption. + Also, the state of page tracking structures is checked more often as + pages are being allocated and freed, as unexpected state changes + often happen for same reasons as memory corruption (e.g. double free, + use-after-free). + For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, fill the pages with poison patterns after free_pages() and verify - the patterns before alloc_pages(). Additionally, - this option cannot be enabled in combination with hibernation as - that would result in incorrect warnings of memory corruption after - a resume because free pages are not saved to the suspend image. + the patterns before alloc_pages(). Additionally, this option cannot + be enabled in combination with hibernation as that would result in + incorrect warnings of memory corruption after a resume because free + pages are not saved to the suspend image. By default this option will have a small overhead, e.g. 
by not allowing the kernel mapping to be backed by large pages on some diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3180d79be20c..26b6ad8b065d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1160,19 +1160,36 @@ static __always_inline bool free_pages_prepare(struct page *page, } #ifdef CONFIG_DEBUG_VM -static inline bool free_pcp_prepare(struct page *page) +/* + * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed + * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when + * moved from pcp lists to free lists. + */ +static bool free_pcp_prepare(struct page *page) { return free_pages_prepare(page, 0, true); } -static inline bool bulkfree_pcp_prepare(struct page *page) +static bool bulkfree_pcp_prepare(struct page *page) { - return false; + if (debug_pagealloc_enabled()) + return free_pages_check(page); + else + return false; } #else +/* + * With DEBUG_VM disabled, order-0 pages being freed are checked only when + * moving from pcp lists to free list in order to reduce overhead. With + * debug_pagealloc enabled, they are checked also immediately when being freed + * to the pcp lists. + */ static bool free_pcp_prepare(struct page *page) { - return free_pages_prepare(page, 0, false); + if (debug_pagealloc_enabled()) + return free_pages_prepare(page, 0, true); + else + return free_pages_prepare(page, 0, false); } static bool bulkfree_pcp_prepare(struct page *page) @@ -2035,23 +2052,39 @@ static inline bool free_pages_prezeroed(void) } #ifdef CONFIG_DEBUG_VM -static bool check_pcp_refill(struct page *page) +/* + * With DEBUG_VM enabled, order-0 pages are checked for expected state when + * being allocated from pcp lists. With debug_pagealloc also enabled, they are + * also checked when pcp lists are refilled from the free lists. + */ +static inline bool check_pcp_refill(struct page *page) { - return false; + if (debug_pagealloc_enabled()) + return check_new_page(page); + else + return false; } -static bool check_new_pcp(struct page *page) +static inline bool check_new_pcp(struct page *page) { return check_new_page(page); } #else -static bool check_pcp_refill(struct page *page) +/* + * With DEBUG_VM disabled, free order-0 pages are checked for expected state + * when pcp lists are being refilled from the free lists. With debug_pagealloc + * enabled, they are also checked when being allocated from the pcp lists. + */ +static inline bool check_pcp_refill(struct page *page) { return check_new_page(page); } -static bool check_new_pcp(struct page *page) +static inline bool check_new_pcp(struct page *page) { - return false; + if (debug_pagealloc_enabled()) + return check_new_page(page); + else + return false; } #endif /* CONFIG_DEBUG_VM */ From 3972f6bb1c6ae1d32dcf2e4ff635d24b77f26dcb Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 11 Jul 2019 20:55:13 -0700 Subject: [PATCH 058/147] mm, debug_pagealloc: use a page type instead of page_ext flag When debug_pagealloc is enabled, we currently allocate the page_ext array to mark guard pages with the PAGE_EXT_DEBUG_GUARD flag. Now that we have the page_type field in struct page, we can use that instead, as guard pages are neither PageSlab nor mapped to userspace. This reduces memory overhead when debug_pagealloc is enabled and there are no other features requiring the page_ext array. Link: http://lkml.kernel.org/r/20190603143451.27353-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: "Kirill A. 
Shutemov" Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/kernel-parameters.txt | 10 ++--- include/linux/mm.h | 10 +---- include/linux/page-flags.h | 6 +++ include/linux/page_ext.h | 1 - mm/Kconfig.debug | 1 - mm/page_alloc.c | 40 +++---------------- mm/page_ext.c | 3 -- 7 files changed, 17 insertions(+), 54 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f1c433daef6b..aa4e7e7b87c2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -805,12 +805,10 @@ tracking down these problems. debug_pagealloc= - [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this - parameter enables the feature at boot time. In - default, it is disabled. We can avoid allocating huge - chunk of memory for debug pagealloc if we don't enable - it at boot time and the system will work mostly same - with the kernel built without CONFIG_DEBUG_PAGEALLOC. + [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this parameter + enables the feature at boot time. By default, it is + disabled and the system will work mostly the same as a + kernel built without CONFIG_DEBUG_PAGEALLOC. on: enable the feature debugpat [X86] Enable PAT debugging diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c2e98cae2d1..cb8d413d635e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2862,8 +2862,6 @@ extern long copy_huge_page_from_user(struct page *dst_page, bool allow_pagefault); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -extern struct page_ext_operations debug_guardpage_ops; - #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); @@ -2880,16 +2878,10 @@ static inline bool debug_guardpage_enabled(void) static inline bool page_is_guard(struct page *page) { - struct page_ext *page_ext; - if (!debug_guardpage_enabled()) return false; - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return false; - - return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); + return PageGuard(page); } #else static inline unsigned int debug_guardpage_minorder(void) { return 0; } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 9f8712a4b1a5..b848517da64c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -703,6 +703,7 @@ PAGEFLAG_FALSE(DoubleMap) #define PG_offline 0x00000100 #define PG_kmemcg 0x00000200 #define PG_table 0x00000400 +#define PG_guard 0x00000800 #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) @@ -754,6 +755,11 @@ PAGE_TYPE_OPS(Kmemcg, kmemcg) */ PAGE_TYPE_OPS(Table, table) +/* + * Marks guardpages used with debug_pagealloc. 
+ */ +PAGE_TYPE_OPS(Guard, guard) + extern bool is_free_buddy_page(struct page *page); __PAGEFLAG(Isolated, isolated, PF_ANY); diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index f84f167ec04c..09592951725c 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -17,7 +17,6 @@ struct page_ext_operations { #ifdef CONFIG_PAGE_EXTENSION enum page_ext_flags { - PAGE_EXT_DEBUG_GUARD, PAGE_EXT_OWNER, #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index a35ab6c55192..82b6a20898bd 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -12,7 +12,6 @@ config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC - select PAGE_EXTENSION select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 26b6ad8b065d..ae56e8feec0c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -668,18 +667,6 @@ static int __init early_debug_pagealloc(char *buf) } early_param("debug_pagealloc", early_debug_pagealloc); -static bool need_debug_guardpage(void) -{ - /* If we don't use debug_pagealloc, we don't need guard page */ - if (!debug_pagealloc_enabled()) - return false; - - if (!debug_guardpage_minorder()) - return false; - - return true; -} - static void init_debug_guardpage(void) { if (!debug_pagealloc_enabled()) @@ -691,11 +678,6 @@ static void init_debug_guardpage(void) static_branch_enable(&_debug_guardpage_enabled); } -struct page_ext_operations debug_guardpage_ops = { - .need = need_debug_guardpage, - .init = init_debug_guardpage, -}; - static int __init debug_guardpage_minorder_setup(char *buf) { unsigned long res; @@ -713,20 +695,13 @@ early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { - struct page_ext *page_ext; - if (!debug_guardpage_enabled()) return false; if (order >= debug_guardpage_minorder()) return false; - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return false; - - __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); - + __SetPageGuard(page); INIT_LIST_HEAD(&page->lru); set_page_private(page, order); /* Guard pages are not available for any usage */ @@ -738,23 +713,16 @@ static inline bool set_page_guard(struct zone *zone, struct page *page, static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { - struct page_ext *page_ext; - if (!debug_guardpage_enabled()) return; - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); + __ClearPageGuard(page); set_page_private(page, 0); if (!is_migrate_isolate(migratetype)) __mod_zone_freepage_state(zone, (1 << order), migratetype); } #else -struct page_ext_operations debug_guardpage_ops; static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, @@ -1930,6 +1898,10 @@ void __init page_alloc_init_late(void) for_each_populated_zone(zone) set_zone_contiguous(zone); + +#ifdef CONFIG_DEBUG_PAGEALLOC + init_debug_guardpage(); +#endif } 
#ifdef CONFIG_CMA diff --git a/mm/page_ext.c b/mm/page_ext.c index d8f1aca4ad43..5f5769c7db3b 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -59,9 +59,6 @@ */ static struct page_ext_operations *page_ext_ops[] = { -#ifdef CONFIG_DEBUG_PAGEALLOC - &debug_guardpage_ops, -#endif #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif From d322a8e5e3e9742fa6b76a207e5df57e03f318f9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:55:17 -0700 Subject: [PATCH 059/147] mm/filemap.c: fix an overly long line in read_cache_page Patch series "fix filler_t callback type mismatches", v2. Casting mapping->a_ops->readpage to filler_t causes an indirect call type mismatch with Control-Flow Integrity checking. This change fixes the mismatch in read_cache_page_gfp and read_mapping_page by adding using a NULL filler argument as an indication to call ->readpage directly, and by passing the right parameter callbacks in nfs and jffs2. This patch (of 4): Code cleanup. Link: http://lkml.kernel.org/r/20190520055731.24538-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook Cc: Nick Desaulniers Cc: Sami Tolvanen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index f1aa20ab8434..d6f7596f148f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2915,7 +2915,8 @@ struct page *read_cache_page(struct address_space *mapping, int (*filler)(void *, struct page *), void *data) { - return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); + return do_read_cache_page(mapping, index, filler, data, + mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_page); From 6c45b454191b330c8bc21d1ed3cf39bb6da1a4eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:55:20 -0700 Subject: [PATCH 060/147] mm/filemap: don't cast ->readpage to filler_t for do_read_cache_page We can just pass a NULL filler and do the right thing inside of do_read_cache_page based on the NULL parameter. 
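The mismatch being fixed can be summarised as follows (prototypes paraphrased rather
than copied verbatim):

    /* what read_cache_page() expects as a filler ... */
    typedef int filler_t(void *data, struct page *page);

    /* ... versus what address_space_operations provides */
    int (*readpage)(struct file *file, struct page *page);

With a NULL filler acting as the "call ->readpage directly" signal, do_read_cache_page()
can dispatch explicitly instead of going through a mismatched cast, roughly as in the
hunk below:

    if (filler)
        err = filler(data, page);
    else
        err = mapping->a_ops->readpage(data, page);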
Link: http://lkml.kernel.org/r/20190520055731.24538-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook Cc: Nick Desaulniers Cc: Sami Tolvanen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 +-- mm/filemap.c | 10 ++++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 6fd0d3aa492c..c7552459a15f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -383,8 +383,7 @@ extern int read_cache_pages(struct address_space *mapping, static inline struct page *read_mapping_page(struct address_space *mapping, pgoff_t index, void *data) { - filler_t *filler = (filler_t *)mapping->a_ops->readpage; - return read_cache_page(mapping, index, filler, data); + return read_cache_page(mapping, index, NULL, data); } /* diff --git a/mm/filemap.c b/mm/filemap.c index d6f7596f148f..1e5e006b8557 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2825,7 +2825,11 @@ repeat: } filler: - err = filler(data, page); + if (filler) + err = filler(data, page); + else + err = mapping->a_ops->readpage(data, page); + if (err < 0) { put_page(page); return ERR_PTR(err); @@ -2937,9 +2941,7 @@ struct page *read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { - filler_t *filler = (filler_t *)mapping->a_ops->readpage; - - return do_read_cache_page(mapping, index, filler, NULL, gfp); + return do_read_cache_page(mapping, index, NULL, NULL, gfp); } EXPORT_SYMBOL(read_cache_page_gfp); From 265de8ce3d5b3c70644a1a45457580bb07548b56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:55:23 -0700 Subject: [PATCH 061/147] jffs2: pass the correct prototype to read_cache_page Fix the callback jffs2 passes to read_cache_page to actually have the proper type expected. Casting around function pointers can easily hide typing bugs, and defeats control flow protection. 
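The same idea applies on the callback side: keep the prototype exactly what
read_cache_page() expects and convert the opaque pointer inside the callback, rather
than casting the function pointer at the call site. Schematically (hypothetical helper
names, not the jffs2 code itself):

    static int example_readpage_cb(void *data, struct page *pg)
    {
        struct inode *inode = data;    /* the caller passed the inode */

        return example_readpage_locked(inode, pg);
    }

    /* ... at the call site, no function-pointer cast is needed: */
    pg = read_cache_page(mapping, index, example_readpage_cb, inode);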
Link: http://lkml.kernel.org/r/20190520055731.24538-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook Cc: Nick Desaulniers Cc: Sami Tolvanen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jffs2/file.c | 4 ++-- fs/jffs2/fs.c | 2 +- fs/jffs2/os-linux.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 7d8654a1472e..f8fb89b10227 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -109,9 +109,9 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) return ret; } -int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg) +int jffs2_do_readpage_unlock(void *data, struct page *pg) { - int ret = jffs2_do_readpage_nolock(inode, pg); + int ret = jffs2_do_readpage_nolock(data, pg); unlock_page(pg); return ret; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 112d85849db1..8a20ddd25f2d 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -687,7 +687,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c, struct page *pg; pg = read_cache_page(inode->i_mapping, offset >> PAGE_SHIFT, - (void *)jffs2_do_readpage_unlock, inode); + jffs2_do_readpage_unlock, inode); if (IS_ERR(pg)) return (void *)pg; diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index a2dbbb3f4c74..bd3d5f0ddc34 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -155,7 +155,7 @@ extern const struct file_operations jffs2_file_operations; extern const struct inode_operations jffs2_file_inode_operations; extern const struct address_space_operations jffs2_file_address_operations; int jffs2_fsync(struct file *, loff_t, loff_t, int); -int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); +int jffs2_do_readpage_unlock(void *data, struct page *pg); /* ioctl.c */ long jffs2_ioctl(struct file *, unsigned int, unsigned long); From f053cbd4366051d7eb6ba1b8d529d20f719c2963 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:55:26 -0700 Subject: [PATCH 062/147] 9p: pass the correct prototype to read_cache_page Fix the callback 9p passes to read_cache_page to actually have the proper type expected. Casting around function pointers can easily hide typing bugs, and defeats control flow protection. 
Link: http://lkml.kernel.org/r/20190520055731.24538-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook Cc: Sami Tolvanen Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_addr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index bc57ae9e2963..cce9ace651a2 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -35,8 +35,9 @@ * @page: structure to page * */ -static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) +static int v9fs_fid_readpage(void *data, struct page *page) { + struct p9_fid *fid = data; struct inode *inode = page->mapping->host; struct bio_vec bvec = {.bv_page = page, .bv_len = PAGE_SIZE}; struct iov_iter to; @@ -107,7 +108,8 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) return ret; - ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp); + ret = read_cache_pages(mapping, pages, v9fs_fid_readpage, + filp->private_data); p9_debug(P9_DEBUG_VFS, " = %d\n", ret); return ret; } From a4985833885b8f568bab90d5dc1886ae68dc82cf Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 11 Jul 2019 20:55:29 -0700 Subject: [PATCH 063/147] mm/filemap.c: correct the comment about VM_FAULT_RETRY Commit 6b4c9f446981 ("filemap: drop the mmap_sem for all blocking operations") changed when mmap_sem is dropped during filemap page fault and when returning VM_FAULT_RETRY. Correct the comment to reflect the change. Link: http://lkml.kernel.org/r/1556234531-108228-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Reviewed-by: Josef Bacik Acked-by: Song Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 1e5e006b8557..d0cf700bf201 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2504,10 +2504,8 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, * * vma->vm_mm->mmap_sem must be held on entry. * - * If our return value has VM_FAULT_RETRY set, it's because - * lock_page_or_retry() returned 0. - * The mmap_sem has usually been released in this case. - * See __lock_page_or_retry() for the exception. + * If our return value has VM_FAULT_RETRY set, it's because the mmap_sem + * may be dropped before doing I/O or by lock_page_maybe_drop_mmap(). * * If our return value does not have VM_FAULT_RETRY set, the mmap_sem * has not been released. From eb085574a7526c4375965c5fbf7e5b0c19cdd336 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 11 Jul 2019 20:55:33 -0700 Subject: [PATCH 064/147] mm, swap: fix race between swapoff and some swap operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When swapin is performed, after getting the swap entry information from the page table, system will swap in the swap entry, without any lock held to prevent the swap device from being swapoff. This may cause the race like below, CPU 1 CPU 2 ----- ----- do_swap_page swapin_readahead __read_swap_cache_async swapoff swapcache_prepare p->swap_map = NULL __swap_duplicate p->swap_map[?] /* !!! NULL pointer access */ Because swapoff is usually done when system shutdown only, the race may not hit many people in practice. But it is still a race need to be fixed. To fix the race, get_swap_device() is added to check whether the specified swap entry is valid in its swap device. 
If so, it will keep the swap entry valid via preventing the swap device from being swapoff, until put_swap_device() is called. Because swapoff() is very rare code path, to make the normal path runs as fast as possible, rcu_read_lock/unlock() and synchronize_rcu() instead of reference count is used to implement get/put_swap_device(). >From get_swap_device() to put_swap_device(), RCU reader side is locked, so synchronize_rcu() in swapoff() will wait until put_swap_device() is called. In addition to swap_map, cluster_info, etc. data structure in the struct swap_info_struct, the swap cache radix tree will be freed after swapoff, so this patch fixes the race between swap cache looking up and swapoff too. Races between some other swap cache usages and swapoff are fixed too via calling synchronize_rcu() between clearing PageSwapCache() and freeing swap cache data structure. Another possible method to fix this is to use preempt_off() + stop_machine() to prevent the swap device from being swapoff when its data structure is being accessed. The overhead in hot-path of both methods is similar. The advantages of RCU based method are, 1. stop_machine() may disturb the normal execution code path on other CPUs. 2. File cache uses RCU to protect its radix tree. If the similar mechanism is used for swap cache too, it is easier to share code between them. 3. RCU is used to protect swap cache in total_swapcache_pages() and exit_swap_address_space() already. The two mechanisms can be merged to simplify the logic. Link: http://lkml.kernel.org/r/20190522015423.14418-1-ying.huang@intel.com Fixes: 235b62176712 ("mm/swap: add cluster lock") Signed-off-by: "Huang, Ying" Reviewed-by: Andrea Parri Not-nacked-by: Hugh Dickins Cc: Andrea Arcangeli Cc: Paul E. McKenney Cc: Daniel Jordan Cc: Michal Hocko Cc: Minchan Kim Cc: Johannes Weiner Cc: Tim Chen Cc: Mel Gorman Cc: Jérôme Glisse Cc: Yang Shi Cc: David Rientjes Cc: Rik van Riel Cc: Jan Kara Cc: Dave Jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 13 +++- mm/memory.c | 2 +- mm/swap_state.c | 16 ++++- mm/swapfile.c | 154 ++++++++++++++++++++++++++++++++++--------- 4 files changed, 146 insertions(+), 39 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 4bfb5c4ac108..6358a6185634 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -175,8 +175,9 @@ enum { SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ + SWP_VALID = (1 << 13), /* swap is valid to be operated on? */ /* add others here before... 
*/ - SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL @@ -460,7 +461,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); -extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry); +extern int __swap_count(swp_entry_t entry); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); @@ -470,6 +471,12 @@ extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); +extern struct swap_info_struct *get_swap_device(swp_entry_t entry); + +static inline void put_swap_device(struct swap_info_struct *si) +{ + rcu_read_unlock(); +} #else /* CONFIG_SWAP */ @@ -576,7 +583,7 @@ static inline int page_swapcount(struct page *page) return 0; } -static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +static inline int __swap_count(swp_entry_t entry) { return 0; } diff --git a/mm/memory.c b/mm/memory.c index ced4bedc660d..b47e4e56448a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2805,7 +2805,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) struct swap_info_struct *si = swp_swap_info(entry); if (si->flags & SWP_SYNCHRONOUS_IO && - __swap_count(si, entry) == 1) { + __swap_count(entry) == 1) { /* skip swapcache */ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); diff --git a/mm/swap_state.c b/mm/swap_state.c index 85245fdec8d9..61453f1faf72 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -310,8 +310,13 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { struct page *page; + struct swap_info_struct *si; + si = get_swap_device(entry); + if (!si) + return NULL; page = find_get_page(swap_address_space(entry), swp_offset(entry)); + put_swap_device(si); INC_CACHE_INFO(find_total); if (page) { @@ -354,8 +359,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) { - struct page *found_page, *new_page = NULL; - struct address_space *swapper_space = swap_address_space(entry); + struct page *found_page = NULL, *new_page = NULL; + struct swap_info_struct *si; int err; *new_page_allocated = false; @@ -365,7 +370,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. 
*/ - found_page = find_get_page(swapper_space, swp_offset(entry)); + si = get_swap_device(entry); + if (!si) + break; + found_page = find_get_page(swap_address_space(entry), + swp_offset(entry)); + put_swap_device(si); if (found_page) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index 596ac98051c5..dbab16ddefa6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1079,12 +1079,11 @@ fail: static struct swap_info_struct *__swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; - unsigned long offset, type; + unsigned long offset; if (!entry.val) goto out; - type = swp_type(entry); - p = swap_type_to_swap_info(type); + p = swp_swap_info(entry); if (!p) goto bad_nofile; if (!(p->flags & SWP_USED)) @@ -1187,6 +1186,69 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, return usage; } +/* + * Check whether swap entry is valid in the swap device. If so, + * return pointer to swap_info_struct, and keep the swap entry valid + * via preventing the swap device from being swapoff, until + * put_swap_device() is called. Otherwise return NULL. + * + * The entirety of the RCU read critical section must come before the + * return from or after the call to synchronize_rcu() in + * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is + * true, the si->map, si->cluster_info, etc. must be valid in the + * critical section. + * + * Notice that swapoff or swapoff+swapon can still happen before the + * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock() + * in put_swap_device() if there isn't any other way to prevent + * swapoff, such as page lock, page table lock, etc. The caller must + * be prepared for that. For example, the following situation is + * possible. + * + * CPU1 CPU2 + * do_swap_page() + * ... swapoff+swapon + * __read_swap_cache_async() + * swapcache_prepare() + * __swap_duplicate() + * // check swap_map + * // verify PTE not changed + * + * In __swap_duplicate(), the swap_map need to be checked before + * changing partly because the specified swap entry may be for another + * swap device which has been swapoff. And in do_swap_page(), after + * the page is read from the swap device, the PTE is verified not + * changed with the page table locked to check whether the swap device + * has been swapoff or swapoff+swapon. 
+ */ +struct swap_info_struct *get_swap_device(swp_entry_t entry) +{ + struct swap_info_struct *si; + unsigned long offset; + + if (!entry.val) + goto out; + si = swp_swap_info(entry); + if (!si) + goto bad_nofile; + + rcu_read_lock(); + if (!(si->flags & SWP_VALID)) + goto unlock_out; + offset = swp_offset(entry); + if (offset >= si->max) + goto unlock_out; + + return si; +bad_nofile: + pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); +out: + return NULL; +unlock_out: + rcu_read_unlock(); + return NULL; +} + static unsigned char __swap_entry_free(struct swap_info_struct *p, swp_entry_t entry, unsigned char usage) { @@ -1358,11 +1420,18 @@ int page_swapcount(struct page *page) return count; } -int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +int __swap_count(swp_entry_t entry) { + struct swap_info_struct *si; pgoff_t offset = swp_offset(entry); + int count = 0; - return swap_count(si->swap_map[offset]); + si = get_swap_device(entry); + if (si) { + count = swap_count(si->swap_map[offset]); + put_swap_device(si); + } + return count; } static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) @@ -1387,9 +1456,11 @@ int __swp_swapcount(swp_entry_t entry) int count = 0; struct swap_info_struct *si; - si = __swap_info_get(entry); - if (si) + si = get_swap_device(entry); + if (si) { count = swap_swapcount(si, entry); + put_swap_device(si); + } return count; } @@ -2335,9 +2406,9 @@ static int swap_node(struct swap_info_struct *p) return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; } -static void _enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map, - struct swap_cluster_info *cluster_info) +static void setup_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) { int i; @@ -2362,7 +2433,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, } p->swap_map = swap_map; p->cluster_info = cluster_info; - p->flags |= SWP_WRITEOK; +} + +static void _enable_swap_info(struct swap_info_struct *p) +{ + p->flags |= SWP_WRITEOK | SWP_VALID; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; @@ -2389,7 +2464,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, frontswap_init(p->type, frontswap_map); spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, prio, swap_map, cluster_info); + setup_swap_info(p, prio, swap_map, cluster_info); + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + /* + * Guarantee swap_map, cluster_info, etc. 
fields are valid + * between get/put_swap_device() if SWP_VALID bit is set + */ + synchronize_rcu(); + spin_lock(&swap_lock); + spin_lock(&p->lock); + _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -2398,7 +2483,8 @@ static void reinsert_swap_info(struct swap_info_struct *p) { spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); + setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); + _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -2501,6 +2587,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) reenable_swap_slots_cache_unlock(); + spin_lock(&swap_lock); + spin_lock(&p->lock); + p->flags &= ~SWP_VALID; /* mark swap device as invalid */ + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + /* + * wait for swap operations protected by get/put_swap_device() + * to complete + */ + synchronize_rcu(); + flush_work(&p->discard_work); destroy_swap_extents(p); @@ -3265,17 +3362,11 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unsigned char has_cache; int err = -EINVAL; - if (non_swap_entry(entry)) - goto out; - - p = swp_swap_info(entry); + p = get_swap_device(entry); if (!p) - goto bad_file; + goto out; offset = swp_offset(entry); - if (unlikely(offset >= p->max)) - goto out; - ci = lock_cluster_or_swap_info(p, offset); count = p->swap_map[offset]; @@ -3321,11 +3412,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unlock_out: unlock_cluster_or_swap_info(p, ci); out: + if (p) + put_swap_device(p); return err; - -bad_file: - pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); - goto out; } /* @@ -3417,6 +3506,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) struct page *list_page; pgoff_t offset; unsigned char count; + int ret = 0; /* * When debugging, it's easier to use __GFP_ZERO here; but it's better @@ -3424,15 +3514,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) */ page = alloc_page(gfp_mask | __GFP_HIGHMEM); - si = swap_info_get(entry); + si = get_swap_device(entry); if (!si) { /* * An acceptable race has occurred since the failing - * __swap_duplicate(): the swap entry has been freed, - * perhaps even the whole swap_map cleared for swapoff. + * __swap_duplicate(): the swap device may be swapoff */ goto outer; } + spin_lock(&si->lock); offset = swp_offset(entry); @@ -3450,9 +3540,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } if (!page) { - unlock_cluster(ci); - spin_unlock(&si->lock); - return -ENOMEM; + ret = -ENOMEM; + goto out; } /* @@ -3504,10 +3593,11 @@ out_unlock_cont: out: unlock_cluster(ci); spin_unlock(&si->lock); + put_swap_device(si); outer: if (page) __free_page(page); - return 0; + return ret; } /* From 054f1d1faaed6a7930b77286d607ae45c01d0443 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 11 Jul 2019 20:55:37 -0700 Subject: [PATCH 065/147] mm/swap_state.c: simplify total_swapcache_pages() with get_swap_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit total_swapcache_pages() may race with swapper_spaces[] allocation and freeing. Previously, this is protected with a swapper_spaces[] specific RCU mechanism. To simplify the logic/code complexity, it is replaced with get/put_swap_device(). The code line number is reduced too. Although not so important, the swapoff() performance improves too because one synchronize_rcu() call during swapoff() is deleted. 
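The resulting calling convention, used here and by the preceding patch, is roughly the
following (a sketch with error handling trimmed; the real call sites appear in this and
the previous patch):

    struct swap_info_struct *si;
    struct page *page = NULL;

    si = get_swap_device(entry);    /* NULL if we raced with swapoff */
    if (si) {
        /*
         * swap_map, cluster_info and the swap cache address_space
         * stay valid until put_swap_device() ends the RCU read-side
         * critical section.
         */
        page = find_get_page(swap_address_space(entry),
                             swp_offset(entry));
        put_swap_device(si);
    }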
[ying.huang@intel.com: fix bad swap file entry warning] Link: http://lkml.kernel.org/r/20190531024102.21723-1-ying.huang@intel.com Link: http://lkml.kernel.org/r/20190527082714.12151-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Andrew Morton Tested-by: Mike Kravetz Cc: Hugh Dickins Cc: Paul E. McKenney Cc: Minchan Kim Cc: Johannes Weiner Cc: Tim Chen Cc: Mel Gorman Cc: Jérôme Glisse Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Yang Shi Cc: David Rientjes Cc: Rik van Riel Cc: Jan Kara Cc: Dave Jiang Cc: Daniel Jordan Cc: Andrea Parri Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 61453f1faf72..8368621a0fc7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -73,23 +73,24 @@ unsigned long total_swapcache_pages(void) unsigned int i, j, nr; unsigned long ret = 0; struct address_space *spaces; + struct swap_info_struct *si; - rcu_read_lock(); for (i = 0; i < MAX_SWAPFILES; i++) { - /* - * The corresponding entries in nr_swapper_spaces and - * swapper_spaces will be reused only after at least - * one grace period. So it is impossible for them - * belongs to different usage. - */ - nr = nr_swapper_spaces[i]; - spaces = rcu_dereference(swapper_spaces[i]); - if (!nr || !spaces) + swp_entry_t entry = swp_entry(i, 1); + + /* Avoid get_swap_device() to warn for bad swap entry */ + if (!swp_swap_info(entry)) continue; + /* Prevent swapoff to free swapper_spaces */ + si = get_swap_device(entry); + if (!si) + continue; + nr = nr_swapper_spaces[i]; + spaces = swapper_spaces[i]; for (j = 0; j < nr; j++) ret += spaces[j].nrpages; + put_swap_device(si); } - rcu_read_unlock(); return ret; } @@ -611,20 +612,16 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) mapping_set_no_writeback_tags(space); } nr_swapper_spaces[type] = nr; - rcu_assign_pointer(swapper_spaces[type], spaces); + swapper_spaces[type] = spaces; return 0; } void exit_swap_address_space(unsigned int type) { - struct address_space *spaces; - - spaces = swapper_spaces[type]; + kvfree(swapper_spaces[type]); nr_swapper_spaces[type] = 0; - rcu_assign_pointer(swapper_spaces[type], NULL); - synchronize_rcu(); - kvfree(spaces); + swapper_spaces[type] = NULL; } static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, From 4efaceb1c5f8136d5fec3f26549d294b8e898bd7 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 11 Jul 2019 20:55:41 -0700 Subject: [PATCH 066/147] mm, swap: use rbtree for swap_extent swap_extent is used to map swap page offset to backing device's block offset. For a continuous block range, one swap_extent is used and all these swap_extents are managed in a linked list. These swap_extents are used by map_swap_entry() during swap's read and write path. To find out the backing device's block offset for a page offset, the swap_extent list will be traversed linearly, with curr_swap_extent being used as a cache to speed up the search. This works well as long as swap_extents are not huge or when the number of processes that access swap device are few, but when the swap device has many extents and there are a number of processes accessing the swap device concurrently, it can be a problem. On one of our servers, the disk's remaining size is tight: $df -h Filesystem Size Used Avail Use% Mounted on ... ... /dev/nvme0n1p1 1.8T 1.3T 504G 72% /home/t4 When creating a 80G swapfile there, there are as many as 84656 swap extents. 
The end result is, kernel spends abou 30% time in map_swap_entry() and swap throughput is only 70MB/s. As a comparison, when I used smaller sized swapfile, like 4G whose swap_extent dropped to 2000, swap throughput is back to 400-500MB/s and map_swap_entry() is about 3%. One downside of using rbtree for swap_extent is, 'struct rbtree' takes 24 bytes while 'struct list_head' takes 16 bytes, that's 8 bytes more for each swap_extent. For a swapfile that has 80k swap_extents, that means 625KiB more memory consumed. Test: Since it's not possible to reboot that server, I can not test this patch diretly there. Instead, I tested it on another server with NVMe disk. I created a 20G swapfile on an NVMe backed XFS fs. By default, the filesystem is quite clean and the created swapfile has only 2 extents. Testing vanilla and this patch shows no obvious performance difference when swapfile is not fragmented. To see the patch's effects, I used some tweaks to manually fragment the swapfile by breaking the extent at 1M boundary. This made the swapfile have 20K extents. nr_task=4 kernel swapout(KB/s) map_swap_entry(perf) swapin(KB/s) map_swap_entry(perf) vanilla 165191 90.77% 171798 90.21% patched 858993 +420% 2.16% 715827 +317% 0.77% nr_task=8 kernel swapout(KB/s) map_swap_entry(perf) swapin(KB/s) map_swap_entry(perf) vanilla 306783 92.19% 318145 87.76% patched 954437 +211% 2.35% 1073741 +237% 1.57% swapout: the throughput of swap out, in KB/s, higher is better 1st map_swap_entry: cpu cycles percent sampled by perf swapin: the throughput of swap in, in KB/s, higher is better. 2nd map_swap_entry: cpu cycles percent sampled by perf nr_task=1 doesn't show any difference, this is due to the curr_swap_extent can be effectively used to cache the correct swap extent for single task workload. [akpm@linux-foundation.org: s/BUG_ON(1)/BUG()/] Link: http://lkml.kernel.org/r/20190523142404.GA181@aaronlu Signed-off-by: Aaron Lu Cc: Huang Ying Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 +- mm/page_io.c | 2 +- mm/swapfile.c | 131 ++++++++++++++++++++++++------------------- 3 files changed, 75 insertions(+), 63 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 6358a6185634..de2c67a33b7e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -148,7 +148,7 @@ struct zone; * We always assume that blocks are of size PAGE_SIZE. */ struct swap_extent { - struct list_head list; + struct rb_node rb_node; pgoff_t start_page; pgoff_t nr_pages; sector_t start_block; @@ -248,8 +248,7 @@ struct swap_info_struct { unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ - struct swap_extent *curr_swap_extent; - struct swap_extent first_swap_extent; + struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */ diff --git a/mm/page_io.c b/mm/page_io.c index a39aac2f8c8d..24ee600f9131 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -163,7 +163,7 @@ int generic_swapfile_activate(struct swap_info_struct *sis, blocks_per_page = PAGE_SIZE >> blkbits; /* - * Map all the blocks into the extent list. This code doesn't try + * Map all the blocks into the extent tree. This code doesn't try * to be very smart. 
*/ probe_block = 0; diff --git a/mm/swapfile.c b/mm/swapfile.c index dbab16ddefa6..0789a762ce2f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -152,6 +152,18 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, return ret; } +static inline struct swap_extent *first_se(struct swap_info_struct *sis) +{ + struct rb_node *rb = rb_first(&sis->swap_extent_root); + return rb_entry(rb, struct swap_extent, rb_node); +} + +static inline struct swap_extent *next_se(struct swap_extent *se) +{ + struct rb_node *rb = rb_next(&se->rb_node); + return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL; +} + /* * swapon tell device that all the old swap contents can be discarded, * to allow the swap device to optimize its wear-levelling. @@ -164,7 +176,7 @@ static int discard_swap(struct swap_info_struct *si) int err = 0; /* Do not discard the swap header page! */ - se = &si->first_swap_extent; + se = first_se(si); start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { @@ -175,7 +187,7 @@ static int discard_swap(struct swap_info_struct *si) cond_resched(); } - list_for_each_entry(se, &si->first_swap_extent.list, list) { + for (se = next_se(se); se; se = next_se(se)) { start_block = se->start_block << (PAGE_SHIFT - 9); nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); @@ -189,6 +201,26 @@ static int discard_swap(struct swap_info_struct *si) return err; /* That will often be -EOPNOTSUPP */ } +static struct swap_extent * +offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) +{ + struct swap_extent *se; + struct rb_node *rb; + + rb = sis->swap_extent_root.rb_node; + while (rb) { + se = rb_entry(rb, struct swap_extent, rb_node); + if (offset < se->start_page) + rb = rb->rb_left; + else if (offset >= se->start_page + se->nr_pages) + rb = rb->rb_right; + else + return se; + } + /* It *must* be present */ + BUG(); +} + /* * swap allocation tell device that a cluster of swap can now be discarded, * to allow the swap device to optimize its wear-levelling. 
@@ -196,32 +228,25 @@ static int discard_swap(struct swap_info_struct *si) static void discard_swap_cluster(struct swap_info_struct *si, pgoff_t start_page, pgoff_t nr_pages) { - struct swap_extent *se = si->curr_swap_extent; - int found_extent = 0; + struct swap_extent *se = offset_to_swap_extent(si, start_page); while (nr_pages) { - if (se->start_page <= start_page && - start_page < se->start_page + se->nr_pages) { - pgoff_t offset = start_page - se->start_page; - sector_t start_block = se->start_block + offset; - sector_t nr_blocks = se->nr_pages - offset; + pgoff_t offset = start_page - se->start_page; + sector_t start_block = se->start_block + offset; + sector_t nr_blocks = se->nr_pages - offset; - if (nr_blocks > nr_pages) - nr_blocks = nr_pages; - start_page += nr_blocks; - nr_pages -= nr_blocks; + if (nr_blocks > nr_pages) + nr_blocks = nr_pages; + start_page += nr_blocks; + nr_pages -= nr_blocks; - if (!found_extent++) - si->curr_swap_extent = se; + start_block <<= PAGE_SHIFT - 9; + nr_blocks <<= PAGE_SHIFT - 9; + if (blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_NOIO, 0)) + break; - start_block <<= PAGE_SHIFT - 9; - nr_blocks <<= PAGE_SHIFT - 9; - if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO, 0)) - break; - } - - se = list_next_entry(se, list); + se = next_se(se); } } @@ -1755,7 +1780,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) return type; } if (bdev == sis->bdev) { - struct swap_extent *se = &sis->first_swap_extent; + struct swap_extent *se = first_se(sis); if (se->start_block == offset) { if (bdev_p) @@ -2232,7 +2257,6 @@ static void drain_mmlist(void) static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) { struct swap_info_struct *sis; - struct swap_extent *start_se; struct swap_extent *se; pgoff_t offset; @@ -2240,18 +2264,8 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) *bdev = sis->bdev; offset = swp_offset(entry); - start_se = sis->curr_swap_extent; - se = start_se; - - for ( ; ; ) { - if (se->start_page <= offset && - offset < (se->start_page + se->nr_pages)) { - return se->start_block + (offset - se->start_page); - } - se = list_next_entry(se, list); - sis->curr_swap_extent = se; - BUG_ON(se == start_se); /* It *must* be present */ - } + se = offset_to_swap_extent(sis, offset); + return se->start_block + (offset - se->start_page); } /* @@ -2269,12 +2283,11 @@ sector_t map_swap_page(struct page *page, struct block_device **bdev) */ static void destroy_swap_extents(struct swap_info_struct *sis) { - while (!list_empty(&sis->first_swap_extent.list)) { - struct swap_extent *se; + while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { + struct rb_node *rb = sis->swap_extent_root.rb_node; + struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node); - se = list_first_entry(&sis->first_swap_extent.list, - struct swap_extent, list); - list_del(&se->list); + rb_erase(rb, &sis->swap_extent_root); kfree(se); } @@ -2290,7 +2303,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis) /* * Add a block range (and the corresponding page range) into this swapdev's - * extent list. The extent list is kept sorted in page order. + * extent tree. * * This function rather assumes that it is called in ascending page order. 
*/ @@ -2298,20 +2311,21 @@ int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { + struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; struct swap_extent *se; struct swap_extent *new_se; - struct list_head *lh; - if (start_page == 0) { - se = &sis->first_swap_extent; - sis->curr_swap_extent = se; - se->start_page = 0; - se->nr_pages = nr_pages; - se->start_block = start_block; - return 1; - } else { - lh = sis->first_swap_extent.list.prev; /* Highest extent */ - se = list_entry(lh, struct swap_extent, list); + /* + * place the new node at the right most since the + * function is called in ascending page order. + */ + while (*link) { + parent = *link; + link = &parent->rb_right; + } + + if (parent) { + se = rb_entry(parent, struct swap_extent, rb_node); BUG_ON(se->start_page + se->nr_pages != start_page); if (se->start_block + se->nr_pages == start_block) { /* Merge it */ @@ -2320,9 +2334,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, } } - /* - * No merge. Insert a new extent, preserving ordering. - */ + /* No merge, insert a new extent. */ new_se = kmalloc(sizeof(*se), GFP_KERNEL); if (new_se == NULL) return -ENOMEM; @@ -2330,7 +2342,8 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, new_se->nr_pages = nr_pages; new_se->start_block = start_block; - list_add_tail(&new_se->list, &sis->first_swap_extent.list); + rb_link_node(&new_se->rb_node, parent, link); + rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); return 1; } EXPORT_SYMBOL_GPL(add_swap_extent); @@ -2846,7 +2859,7 @@ static struct swap_info_struct *alloc_swap_info(void) * would be relying on p->type to remain valid. */ } - INIT_LIST_HEAD(&p->first_swap_extent.list); + p->swap_extent_root = RB_ROOT; plist_node_init(&p->list, 0); for_each_node(i) plist_node_init(&p->avail_lists[i], 0); From aeb309b81c6bada783c3695528a3e10748e97285 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 11 Jul 2019 20:55:44 -0700 Subject: [PATCH 067/147] mm/mincore.c: fix race between swapoff and mincore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Via commit 4b3ef9daa4fc ("mm/swap: split swap cache into 64MB trunks"), after swapoff, the address_space associated with the swap device will be freed. So swap_address_space() users which touch the address_space need some kind of mechanism to prevent the address_space from being freed during accessing. When mincore processes an unmapped range for swapped shmem pages, it doesn't hold the lock to prevent swap device from being swapped off. So the following race is possible: CPU1 CPU2 do_mincore() swapoff() walk_page_range() mincore_unmapped_range() __mincore_unmapped_range mincore_page as = swap_address_space() ... exit_swap_address_space() ... kvfree(spaces) find_get_page(as) The address space may be accessed after being freed. To fix the race, get_swap_device()/put_swap_device() is used to enclose find_get_page() to check whether the swap entry is valid and prevent the swap device from being swapoff during accessing. Link: http://lkml.kernel.org/r/20190611020510.28251-1-ying.huang@intel.com Fixes: 4b3ef9daa4fc ("mm/swap: split swap cache into 64MB trunks") Signed-off-by: "Huang, Ying" Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Paul E. 
McKenney Cc: Minchan Kim Cc: Johannes Weiner Cc: Tim Chen Cc: Mel Gorman Cc: Jérôme Glisse Cc: Andrea Arcangeli Cc: Yang Shi Cc: David Rientjes Cc: Rik van Riel Cc: Jan Kara Cc: Dave Jiang Cc: Daniel Jordan Cc: Andrea Parri Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mincore.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mm/mincore.c b/mm/mincore.c index c3f058bd0faf..4fe91d497436 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -68,8 +68,16 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) */ if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); - page = find_get_page(swap_address_space(swp), - swp_offset(swp)); + struct swap_info_struct *si; + + /* Prevent swap device to being swapoff under us */ + si = get_swap_device(swp); + if (si) { + page = find_get_page(swap_address_space(swp), + swp_offset(swp)); + put_swap_device(si); + } else + page = NULL; } } else page = find_get_page(mapping, pgoff); From 38d384932ed1b58f0ed8dd17c66e0cd89acb354a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 20:55:48 -0700 Subject: [PATCH 068/147] memcg, oom: no oom-kill for __GFP_RETRY_MAYFAIL The documentation of __GFP_RETRY_MAYFAIL clearly mentioned that the OOM killer will not be triggered and indeed the page alloc does not invoke OOM killer for such allocations. However we do trigger memcg OOM killer for __GFP_RETRY_MAYFAIL. Fix that. This flag will used later to not trigger oom-killer in the charging path for fanotify and inotify event allocations. Link: http://lkml.kernel.org/r/20190514212259.156585-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Amir Goldstein Cc: Jan Kara Cc: Johannes Weiner Cc: Roman Gushchin Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 591eafafbd8c..2ad94d0ce22f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2279,7 +2279,6 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long nr_reclaimed; bool may_swap = true; bool drained = false; - bool oomed = false; enum oom_status oom_status; if (mem_cgroup_is_root(memcg)) @@ -2366,7 +2365,7 @@ retry: if (nr_retries--) goto retry; - if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed) + if (gfp_mask & __GFP_RETRY_MAYFAIL) goto nomem; if (gfp_mask & __GFP_NOFAIL) @@ -2385,7 +2384,6 @@ retry: switch (oom_status) { case OOM_SUCCESS: nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - oomed = true; goto retry; case OOM_FAILED: goto force; From ec165450968b26298bd1c373de37b0ab6d826b33 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 20:55:52 -0700 Subject: [PATCH 069/147] memcg, fsnotify: no oom-kill for remote memcg charging Commit d46eb14b735b ("fs: fsnotify: account fsnotify metadata to kmemcg") added remote memcg charging for fanotify and inotify event objects. The aim was to charge the memory to the listener who is interested in the events but without triggering the OOM killer. Otherwise there would be security concerns for the listener. At the time, oom-kill trigger was not in the charging path. A parallel work added the oom-kill back to charging path i.e. commit 29ef680ae7c2 ("memcg, oom: move out_of_memory back to the charge path"). So to not trigger oom-killer in the remote memcg, explicitly add __GFP_RETRY_MAYFAIL to the fanotigy and inotify event allocations. 
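The combined allocation path after this patch looks roughly like the sketch below. This is a hedged paraphrase of the diffs that follow together with the memalloc_use_memcg() plumbing that already exists; the sizeof() and the helper name are illustrative, not the real fanotify event sizing.

static struct fanotify_event *alloc_event_sketch(struct fsnotify_group *group)
{
	struct fanotify_event *event;
	gfp_t gfp = GFP_KERNEL_ACCOUNT;

	/* Unlimited queues must not drop events; bounded queues must not OOM-kill. */
	if (group->max_events == UINT_MAX)
		gfp |= __GFP_NOFAIL;
	else
		gfp |= __GFP_RETRY_MAYFAIL;

	/* Whoever listens for the event pays for it: charge the listener's memcg. */
	memalloc_use_memcg(group->memcg);
	event = kmalloc(sizeof(*event), gfp);
	memalloc_unuse_memcg();

	return event;	/* may now be NULL instead of triggering a memcg OOM kill */
}
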
Link: http://lkml.kernel.org/r/20190514212259.156585-2-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Jan Kara Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Michal Hocko Cc: Amir Goldstein Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify.c | 5 ++++- fs/notify/inotify/inotify_fsnotify.c | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index b428c295d13f..5778d1347b35 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -288,10 +288,13 @@ struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, /* * For queues with unlimited length lost events are not expected and * can possibly have security implications. Avoid losing events when - * memory is short. + * memory is short. For the limited size queues, avoid OOM killer in the + * target monitoring memcg as it may have security repercussion. */ if (group->max_events == UINT_MAX) gfp |= __GFP_NOFAIL; + else + gfp |= __GFP_RETRY_MAYFAIL; /* Whoever is interested in the event, pays for the allocation. */ memalloc_use_memcg(group->memcg); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 2fda08b2b885..d510223d302c 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -90,9 +90,13 @@ int inotify_handle_event(struct fsnotify_group *group, i_mark = container_of(inode_mark, struct inotify_inode_mark, fsn_mark); - /* Whoever is interested in the event, pays for the allocation. */ + /* + * Whoever is interested in the event, pays for the allocation. Do not + * trigger OOM killer in the target monitoring memcg as it may have + * security repercussion. + */ memalloc_use_memcg(group->memcg); - event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT); + event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); memalloc_unuse_memcg(); if (unlikely(!event)) { From 1e577f970f66a53d429cbee37b36177c9712f488 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 20:55:55 -0700 Subject: [PATCH 070/147] mm, memcg: introduce memory.events.local The memory controller in cgroup v2 exposes memory.events file for each memcg which shows the number of times events like low, high, max, oom and oom_kill have happened for the whole tree rooted at that memcg. Users can also poll or register notification to monitor the changes in that file. Any event at any level of the tree rooted at memcg will notify all the listeners along the path till root_mem_cgroup. There are existing users which depend on this behavior. However there are users which are only interested in the events happening at a specific level of the memcg tree and not in the events in the underlying tree rooted at that memcg. One such use-case is a centralized resource monitor which can dynamically adjust the limits of the jobs running on a system. The jobs can create their sub-hierarchy for their own sub-tasks. The centralized monitor is only interested in the events at the top level memcgs of the jobs as it can then act and adjust the limits of the jobs. Using the current memory.events for such centralized monitor is very inconvenient. The monitor will keep receiving events which it is not interested and to find if the received event is interesting, it has to read memory.event files of the next level and compare it with the top level one. 
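As a hedged illustration of that monitoring use-case (not part of this patch): with a per-level file such as the memory.events.local introduced below, the monitor can watch only the job's top-level cgroup and re-read the counters on each notification. The cgroup path here is hypothetical and error handling is omitted; cgroup v2 announces changes to these files as file-modified events, which inotify's IN_MODIFY observes.

#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/jobs/job1/memory.events.local";
	char buf[4096];
	FILE *f;
	int ifd = inotify_init();

	inotify_add_watch(ifd, path, IN_MODIFY);
	for (;;) {
		/* Block until the kernel signals a local event on this cgroup. */
		if (read(ifd, buf, sizeof(buf)) <= 0)
			break;
		/* Re-read the flat-keyed counters, e.g. "oom_kill 3", and react. */
		f = fopen(path, "r");
		if (!f)
			break;
		while (fgets(buf, sizeof(buf), f))
			fputs(buf, stdout);
		fclose(f);
	}
	return 0;
}
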
So, let's introduce memory.events.local to the memcg which shows and notify for the events at the memcg level. Now, does memory.stat and memory.pressure need their local versions. IMHO no due to the no internal process contraint of the cgroup v2. The memory.stat file of the top level memcg of a job shows the stats and vmevents of the whole tree. The local stats or vmevents of the top level memcg will only change if there is a process running in that memcg but v2 does not allow that. Similarly for memory.pressure there will not be any process in the internal nodes and thus no chance of local pressure. Link: http://lkml.kernel.org/r/20190527174643.209172-1-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Chris Down Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 10 ++++++++ include/linux/memcontrol.h | 7 ++++- mm/memcontrol.c | 34 +++++++++++++++++-------- 3 files changed, 40 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index a5c845338d6d..a9548de56ac9 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1146,6 +1146,11 @@ PAGE_SIZE multiple when read back. otherwise, a value change in this file generates a file modified event. + Note that all fields in this file are hierarchical and the + file modified event can be generated due to an event down the + hierarchy. For for the local events at the cgroup level see + memory.events.local. + low The number of times the cgroup is reclaimed due to high memory pressure even though its usage is under @@ -1185,6 +1190,11 @@ PAGE_SIZE multiple when read back. The number of processes belonging to this cgroup killed by any kind of OOM killer. + memory.events.local + Similar to memory.events but the fields in the file are local + to the cgroup i.e. not hierarchical. The file modified event + generated on this file reflects only the local events. + memory.stat A read-only flat-keyed file which exists on non-root cgroups. 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1dcb763bb610..22141ebc5e15 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -233,8 +233,9 @@ struct mem_cgroup { /* OOM-Killer disable */ int oom_kill_disable; - /* memory.events */ + /* memory.events and memory.events.local */ struct cgroup_file events_file; + struct cgroup_file events_local_file; /* handle for "memory.swap.events" */ struct cgroup_file swap_events_file; @@ -281,6 +282,7 @@ struct mem_cgroup { /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; + atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS]; unsigned long socket_pressure; @@ -747,6 +749,9 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { + atomic_long_inc(&memcg->memory_events_local[event]); + cgroup_file_notify(&memcg->events_local_file); + do { atomic_long_inc(&memcg->memory_events[event]); cgroup_file_notify(&memcg->events_file); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2ad94d0ce22f..0a9bd604aa15 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5624,21 +5624,29 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, return nbytes; } +static void __memory_events_show(struct seq_file *m, atomic_long_t *events) +{ + seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); + seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); + seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX])); + seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); + seq_printf(m, "oom_kill %lu\n", + atomic_long_read(&events[MEMCG_OOM_KILL])); +} + static int memory_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - seq_printf(m, "low %lu\n", - atomic_long_read(&memcg->memory_events[MEMCG_LOW])); - seq_printf(m, "high %lu\n", - atomic_long_read(&memcg->memory_events[MEMCG_HIGH])); - seq_printf(m, "max %lu\n", - atomic_long_read(&memcg->memory_events[MEMCG_MAX])); - seq_printf(m, "oom %lu\n", - atomic_long_read(&memcg->memory_events[MEMCG_OOM])); - seq_printf(m, "oom_kill %lu\n", - atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); + __memory_events_show(m, memcg->memory_events); + return 0; +} +static int memory_events_local_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + __memory_events_show(m, memcg->memory_events_local); return 0; } @@ -5800,6 +5808,12 @@ static struct cftype memory_files[] = { .file_offset = offsetof(struct mem_cgroup, events_file), .seq_show = memory_events_show, }, + { + .name = "events.local", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, events_local_file), + .seq_show = memory_events_local_show, + }, { .name = "stat", .flags = CFTYPE_NOT_ON_ROOT, From c8713d0b23123759c9d86b0421243c2c309505d7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 11 Jul 2019 20:55:59 -0700 Subject: [PATCH 071/147] mm: memcontrol: dump memory.stat during cgroup OOM The current cgroup OOM memory info dump doesn't include all the memory we are tracking, nor does it give insight into what the VM tried to do leading up to the OOM. All that useful info is in memory.stat. Furthermore, the recursive printing for every child cgroup can generate absurd amounts of data on the console for larger cgroup trees, and it's not like we provide a per-cgroup breakdown during global OOM kills. 
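Implementation note (hedged): the change below builds the whole report with the seq_buf API, formatting into a single kmalloc()'d page and then printing it once, instead of issuing many small printks at OOM time. The idiom, with an illustrative counter rather than the real memcg statistics:

#include <linux/seq_buf.h>
#include <linux/slab.h>

static void print_oom_report_sketch(void)
{
	struct seq_buf s;

	seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
	if (!s.buffer)
		return;

	seq_buf_printf(&s, "anon %llu\n", 0ULL);
	/* ... one seq_buf_printf() per counter ... */

	WARN_ON_ONCE(seq_buf_has_overflowed(&s));	/* it should all fit in one page */
	pr_info("%s", s.buffer);
	kfree(s.buffer);
}
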
When an OOM kill is triggered, print one set of recursive memory.stat items at the level whose limit triggered the OOM condition. Example output: stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0 CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014 Call Trace: dump_stack+0x46/0x60 dump_header+0x4c/0x2d0 oom_kill_process.cold.10+0xb/0x10 out_of_memory+0x200/0x270 ? try_to_free_mem_cgroup_pages+0xdf/0x130 mem_cgroup_out_of_memory+0xb7/0xc0 try_charge+0x680/0x6f0 mem_cgroup_try_charge+0xb5/0x160 __add_to_page_cache_locked+0xc6/0x300 ? list_lru_destroy+0x80/0x80 add_to_page_cache_lru+0x45/0xc0 pagecache_get_page+0x11b/0x290 filemap_fault+0x458/0x6d0 ext4_filemap_fault+0x27/0x36 __do_fault+0x2f/0xb0 __handle_mm_fault+0x9c5/0x1140 ? apic_timer_interrupt+0xa/0x20 handle_mm_fault+0xc5/0x180 __do_page_fault+0x1ab/0x440 ? page_fault+0x8/0x30 page_fault+0x1e/0x30 RIP: 0033:0x55c32167fc10 Code: Bad RIP value. RSP: 002b:00007fff1d031c50 EFLAGS: 00010206 RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010 RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000 RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000 R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000 memory: usage 1024kB, limit 1024kB, failcnt 75131 swap: usage 0kB, limit 9007199254740988kB, failcnt 0 Memory cgroup stats for /foo: anon 0 file 0 kernel_stack 36864 slab 274432 sock 0 shmem 0 file_mapped 0 file_dirty 0 file_writeback 0 anon_thp 0 inactive_anon 126976 active_anon 0 inactive_file 0 active_file 0 unevictable 0 slab_reclaimable 0 slab_unreclaimable 274432 pgfault 59466 pgmajfault 1617 workingset_refault 2145 workingset_activate 0 workingset_nodereclaim 0 pgrefill 98952 pgscan 200060 pgsteal 59340 pgactivate 40095 pgdeactivate 96787 pglazyfree 0 pglazyfreed 0 thp_fault_alloc 0 thp_collapse_alloc 0 Tasks state (memory values in pages): [ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name [ 200] 0 200 1121 884 53248 29 0 bash [ 209] 0 209 905 246 45056 19 0 stress [ 210] 0 210 66442 56 499712 56349 0 stress oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0 Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB [hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal] Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 289 ++++++++++++++++++++++++++---------------------- 1 file changed, 157 insertions(+), 132 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0a9bd604aa15..6de79ec3cd21 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -57,6 +57,7 @@ #include #include #include +#include #include "internal.h" #include #include @@ -1356,27 +1357,114 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } -static const unsigned int memcg1_stats[] = { - MEMCG_CACHE, - MEMCG_RSS, - MEMCG_RSS_HUGE, - NR_SHMEM, - NR_FILE_MAPPED, - NR_FILE_DIRTY, - 
NR_WRITEBACK, - MEMCG_SWAP, -}; +static char *memory_stat_format(struct mem_cgroup *memcg) +{ + struct seq_buf s; + int i; -static const char *const memcg1_stat_names[] = { - "cache", - "rss", - "rss_huge", - "shmem", - "mapped_file", - "dirty", - "writeback", - "swap", -}; + seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE); + if (!s.buffer) + return NULL; + + /* + * Provide statistics on the state of the memory subsystem as + * well as cumulative event counters that show past behavior. + * + * This list is ordered following a combination of these gradients: + * 1) generic big picture -> specifics and details + * 2) reflecting userspace activity -> reflecting kernel heuristics + * + * Current memory state: + */ + + seq_buf_printf(&s, "anon %llu\n", + (u64)memcg_page_state(memcg, MEMCG_RSS) * + PAGE_SIZE); + seq_buf_printf(&s, "file %llu\n", + (u64)memcg_page_state(memcg, MEMCG_CACHE) * + PAGE_SIZE); + seq_buf_printf(&s, "kernel_stack %llu\n", + (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * + 1024); + seq_buf_printf(&s, "slab %llu\n", + (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) + + memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) * + PAGE_SIZE); + seq_buf_printf(&s, "sock %llu\n", + (u64)memcg_page_state(memcg, MEMCG_SOCK) * + PAGE_SIZE); + + seq_buf_printf(&s, "shmem %llu\n", + (u64)memcg_page_state(memcg, NR_SHMEM) * + PAGE_SIZE); + seq_buf_printf(&s, "file_mapped %llu\n", + (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * + PAGE_SIZE); + seq_buf_printf(&s, "file_dirty %llu\n", + (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * + PAGE_SIZE); + seq_buf_printf(&s, "file_writeback %llu\n", + (u64)memcg_page_state(memcg, NR_WRITEBACK) * + PAGE_SIZE); + + /* + * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter + * with the NR_ANON_THP vm counter, but right now it's a pain in the + * arse because it requires migrating the work out of rmap to a place + * where the page->mem_cgroup is set up and stable. 
+ */ + seq_buf_printf(&s, "anon_thp %llu\n", + (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * + PAGE_SIZE); + + for (i = 0; i < NR_LRU_LISTS; i++) + seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i], + (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); + + seq_buf_printf(&s, "slab_reclaimable %llu\n", + (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) * + PAGE_SIZE); + seq_buf_printf(&s, "slab_unreclaimable %llu\n", + (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) * + PAGE_SIZE); + + /* Accumulated memory events */ + + seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT)); + seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT)); + + seq_buf_printf(&s, "workingset_refault %lu\n", + memcg_page_state(memcg, WORKINGSET_REFAULT)); + seq_buf_printf(&s, "workingset_activate %lu\n", + memcg_page_state(memcg, WORKINGSET_ACTIVATE)); + seq_buf_printf(&s, "workingset_nodereclaim %lu\n", + memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); + + seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL)); + seq_buf_printf(&s, "pgscan %lu\n", + memcg_events(memcg, PGSCAN_KSWAPD) + + memcg_events(memcg, PGSCAN_DIRECT)); + seq_buf_printf(&s, "pgsteal %lu\n", + memcg_events(memcg, PGSTEAL_KSWAPD) + + memcg_events(memcg, PGSTEAL_DIRECT)); + seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE)); + seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE)); + seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE)); + seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED)); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + seq_buf_printf(&s, "thp_fault_alloc %lu\n", + memcg_events(memcg, THP_FAULT_ALLOC)); + seq_buf_printf(&s, "thp_collapse_alloc %lu\n", + memcg_events(memcg, THP_COLLAPSE_ALLOC)); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + + /* The above should easily fit into one page */ + WARN_ON_ONCE(seq_buf_has_overflowed(&s)); + + return s.buffer; +} #define K(x) ((x) << (PAGE_SHIFT-10)) /** @@ -1411,39 +1499,32 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct * */ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { - struct mem_cgroup *iter; - unsigned int i; + char *buf; pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), K((u64)memcg->memory.max), memcg->memory.failcnt); - pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", - K((u64)page_counter_read(&memcg->memsw)), - K((u64)memcg->memsw.max), memcg->memsw.failcnt); - pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", - K((u64)page_counter_read(&memcg->kmem)), - K((u64)memcg->kmem.max), memcg->kmem.failcnt); - - for_each_mem_cgroup_tree(iter, memcg) { - pr_info("Memory cgroup stats for "); - pr_cont_cgroup_path(iter->css.cgroup); - pr_cont(":"); - - for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { - if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account) - continue; - pr_cont(" %s:%luKB", memcg1_stat_names[i], - K(memcg_page_state_local(iter, - memcg1_stats[i]))); - } - - for (i = 0; i < NR_LRU_LISTS; i++) - pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], - K(memcg_page_state_local(iter, - NR_LRU_BASE + i))); - - pr_cont("\n"); + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->swap)), + K((u64)memcg->swap.max), memcg->swap.failcnt); + else { + pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", + 
K((u64)page_counter_read(&memcg->memsw)), + K((u64)memcg->memsw.max), memcg->memsw.failcnt); + pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->kmem)), + K((u64)memcg->kmem.max), memcg->kmem.failcnt); } + + pr_info("Memory cgroup stats for "); + pr_cont_cgroup_path(memcg->css.cgroup); + pr_cont(":"); + buf = memory_stat_format(memcg); + if (!buf) + return; + pr_info("%s", buf); + kfree(buf); } /* @@ -3470,6 +3551,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) } #endif /* CONFIG_NUMA */ +static const unsigned int memcg1_stats[] = { + MEMCG_CACHE, + MEMCG_RSS, + MEMCG_RSS_HUGE, + NR_SHMEM, + NR_FILE_MAPPED, + NR_FILE_DIRTY, + NR_WRITEBACK, + MEMCG_SWAP, +}; + +static const char *const memcg1_stat_names[] = { + "cache", + "rss", + "rss_huge", + "shmem", + "mapped_file", + "dirty", + "writeback", + "swap", +}; + /* Universal VM events cgroup1 shows, original sort order */ static const unsigned int memcg1_events[] = { PGPGIN, @@ -5653,91 +5756,13 @@ static int memory_events_local_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - int i; - - /* - * Provide statistics on the state of the memory subsystem as - * well as cumulative event counters that show past behavior. - * - * This list is ordered following a combination of these gradients: - * 1) generic big picture -> specifics and details - * 2) reflecting userspace activity -> reflecting kernel heuristics - * - * Current memory state: - */ - - seq_printf(m, "anon %llu\n", - (u64)memcg_page_state(memcg, MEMCG_RSS) * PAGE_SIZE); - seq_printf(m, "file %llu\n", - (u64)memcg_page_state(memcg, MEMCG_CACHE) * PAGE_SIZE); - seq_printf(m, "kernel_stack %llu\n", - (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * 1024); - seq_printf(m, "slab %llu\n", - (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) + - memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) * - PAGE_SIZE); - seq_printf(m, "sock %llu\n", - (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE); - - seq_printf(m, "shmem %llu\n", - (u64)memcg_page_state(memcg, NR_SHMEM) * PAGE_SIZE); - seq_printf(m, "file_mapped %llu\n", - (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * PAGE_SIZE); - seq_printf(m, "file_dirty %llu\n", - (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * PAGE_SIZE); - seq_printf(m, "file_writeback %llu\n", - (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE); - - /* - * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter - * with the NR_ANON_THP vm counter, but right now it's a pain in the - * arse because it requires migrating the work out of rmap to a place - * where the page->mem_cgroup is set up and stable. 
- */ - seq_printf(m, "anon_thp %llu\n", - (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * PAGE_SIZE); - - for (i = 0; i < NR_LRU_LISTS; i++) - seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], - (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * - PAGE_SIZE); - - seq_printf(m, "slab_reclaimable %llu\n", - (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) * - PAGE_SIZE); - seq_printf(m, "slab_unreclaimable %llu\n", - (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) * - PAGE_SIZE); - - /* Accumulated memory events */ - - seq_printf(m, "pgfault %lu\n", memcg_events(memcg, PGFAULT)); - seq_printf(m, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT)); - - seq_printf(m, "workingset_refault %lu\n", - memcg_page_state(memcg, WORKINGSET_REFAULT)); - seq_printf(m, "workingset_activate %lu\n", - memcg_page_state(memcg, WORKINGSET_ACTIVATE)); - seq_printf(m, "workingset_nodereclaim %lu\n", - memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); - - seq_printf(m, "pgrefill %lu\n", memcg_events(memcg, PGREFILL)); - seq_printf(m, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) + - memcg_events(memcg, PGSCAN_DIRECT)); - seq_printf(m, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) + - memcg_events(memcg, PGSTEAL_DIRECT)); - seq_printf(m, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE)); - seq_printf(m, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE)); - seq_printf(m, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE)); - seq_printf(m, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED)); - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - seq_printf(m, "thp_fault_alloc %lu\n", - memcg_events(memcg, THP_FAULT_ALLOC)); - seq_printf(m, "thp_collapse_alloc %lu\n", - memcg_events(memcg, THP_COLLAPSE_ALLOC)); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + char *buf; + buf = memory_stat_format(memcg); + if (!buf) + return -ENOMEM; + seq_puts(m, buf); + kfree(buf); return 0; } From c03914b7aa319fb2b6701a6427c13752c7418b9b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:02 -0700 Subject: [PATCH 072/147] mm: memcg/slab: postpone kmem_cache memcg pointer initialization to memcg_link_cache() Patch series "mm: reparent slab memory on cgroup removal", v7. # Why do we need this? We've noticed that the number of dying cgroups is steadily growing on most of our hosts in production. The following investigation revealed an issue in the userspace memory reclaim code [1], accounting of kernel stacks [2], and also the main reason: slab objects. The underlying problem is quite simple: any page charged to a cgroup holds a reference to it, so the cgroup can't be reclaimed unless all charged pages are gone. If a slab object is actively used by other cgroups, it won't be reclaimed, and will prevent the origin cgroup from being reclaimed. Slab objects, and first of all vfs cache, is shared between cgroups, which are using the same underlying fs, and what's even more important, it's shared between multiple generations of the same workload. So if something is running periodically every time in a new cgroup (like how systemd works), we do accumulate multiple dying cgroups. Strictly speaking pagecache isn't different here, but there is a key difference: we disable protection and apply some extra pressure on LRUs of dying cgroups, and these LRUs contain all charged pages. 
My experiments show that with the disabled kernel memory accounting the number of dying cgroups stabilizes at a relatively small number (~100, depends on memory pressure and cgroup creation rate), and with kernel memory accounting it grows pretty steadily up to several thousands. Memory cgroups are quite complex and big objects (mostly due to percpu stats), so it leads to noticeable memory losses. Memory occupied by dying cgroups is measured in hundreds of megabytes. I've even seen a host with more than 100Gb of memory wasted for dying cgroups. It leads to a degradation of performance with the uptime, and generally limits the usage of cgroups. My previous attempt [3] to fix the problem by applying extra pressure on slab shrinker lists caused a regressions with xfs and ext4, and has been reverted [4]. The following attempts to find the right balance [5, 6] were not successful. So instead of trying to find a maybe non-existing balance, let's do reparent accounted slab caches to the parent cgroup on cgroup removal. # Implementation approach There is however a significant problem with reparenting of slab memory: there is no list of charged pages. Some of them are in shrinker lists, but not all. Introducing of a new list is really not an option. But fortunately there is a way forward: every slab page has a stable pointer to the corresponding kmem_cache. So the idea is to reparent kmem_caches instead of slab pages. It's actually simpler and cheaper, but requires some underlying changes: 1) Make kmem_caches to hold a single reference to the memory cgroup, instead of a separate reference per every slab page. 2) Stop setting page->mem_cgroup pointer for memcg slab pages and use page->kmem_cache->memcg indirection instead. It's used only on slab page release, so performance overhead shouldn't be a big issue. 3) Introduce a refcounter for non-root slab caches. It's required to be able to destroy kmem_caches when they become empty and release the associated memory cgroup. There is a bonus: currently we release all memcg kmem_caches all together with the memory cgroup itself. This patchset allows individual kmem_caches to be released as soon as they become inactive and free. Some additional implementation details are provided in corresponding commit messages. # Results Below is the average number of dying cgroups on two groups of our production hosts. They do run some sort of web frontend workload, the memory pressure is moderate. As we can see, with the kernel memory reparenting the number stabilizes in 60s range; however with the original version it grows almost linearly and doesn't show any signs of plateauing. The difference in slab and percpu usage between patched and unpatched versions also grows linearly. In 7 days it exceeded 200Mb. day 0 1 2 3 4 5 6 7 original 56 362 628 752 1070 1250 1490 1560 patched 23 46 51 55 60 57 67 69 mem diff(Mb) 22 74 123 152 164 182 214 241 # Links [1]: commit 68600f623d69 ("mm: don't miss the last page because of round-off error") [2]: commit 9b6f7e163cd0 ("mm: rework memcg kernel stack accounting") [3]: commit 172b06c32b94 ("mm: slowly shrink slabs with a relatively small number of objects") [4]: commit a9a238e83fbb ("Revert "mm: slowly shrink slabs with a relatively small number of objects") [5]: https://lkml.org/lkml/2019/1/28/1865 [6]: https://marc.info/?l=linux-mm&m=155064763626437&w=2 This patch (of 10): Initialize kmem_cache->memcg_params.memcg pointer in memcg_link_cache() rather than in init_memcg_params(). 
Once kmem_cache will hold a reference to the memory cgroup, it will simplify the refcounting. For non-root kmem_caches memcg_link_cache() is always called before the kmem_cache becomes visible to a user, so it's safe. Link: http://lkml.kernel.org/r/20190611231813.3148843-2-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Waiman Long Cc: Michal Hocko Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- mm/slab.h | 5 +++-- mm/slab_common.c | 14 +++++++------- mm/slub.c | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 3521a351ceb5..badd98f7e2f1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1239,7 +1239,7 @@ void __init kmem_cache_init(void) nr_node_ids * sizeof(struct kmem_cache_node *), SLAB_HWCACHE_ALIGN, 0, 0); list_add(&kmem_cache->list, &slab_caches); - memcg_link_cache(kmem_cache); + memcg_link_cache(kmem_cache, NULL); slab_state = PARTIAL; /* diff --git a/mm/slab.h b/mm/slab.h index 739099af6cbb..86f7ede21203 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -289,7 +289,7 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, } extern void slab_init_memcg_params(struct kmem_cache *); -extern void memcg_link_cache(struct kmem_cache *s); +extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg); extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, void (*deact_fn)(struct kmem_cache *)); @@ -344,7 +344,8 @@ static inline void slab_init_memcg_params(struct kmem_cache *s) { } -static inline void memcg_link_cache(struct kmem_cache *s) +static inline void memcg_link_cache(struct kmem_cache *s, + struct mem_cgroup *memcg) { } diff --git a/mm/slab_common.c b/mm/slab_common.c index a09bb10aa026..07ee4189b40c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -140,13 +140,12 @@ void slab_init_memcg_params(struct kmem_cache *s) } static int init_memcg_params(struct kmem_cache *s, - struct mem_cgroup *memcg, struct kmem_cache *root_cache) + struct kmem_cache *root_cache) { struct memcg_cache_array *arr; if (root_cache) { s->memcg_params.root_cache = root_cache; - s->memcg_params.memcg = memcg; INIT_LIST_HEAD(&s->memcg_params.children_node); INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node); return 0; @@ -221,11 +220,12 @@ int memcg_update_all_caches(int num_memcgs) return ret; } -void memcg_link_cache(struct kmem_cache *s) +void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg) { if (is_root_cache(s)) { list_add(&s->root_caches_node, &slab_root_caches); } else { + s->memcg_params.memcg = memcg; list_add(&s->memcg_params.children_node, &s->memcg_params.root_cache->memcg_params.children); list_add(&s->memcg_params.kmem_caches_node, @@ -244,7 +244,7 @@ static void memcg_unlink_cache(struct kmem_cache *s) } #else static inline int init_memcg_params(struct kmem_cache *s, - struct mem_cgroup *memcg, struct kmem_cache *root_cache) + struct kmem_cache *root_cache) { return 0; } @@ -384,7 +384,7 @@ static struct kmem_cache *create_cache(const char *name, s->useroffset = useroffset; s->usersize = usersize; - err = init_memcg_params(s, memcg, root_cache); + err = init_memcg_params(s, root_cache); if (err) goto out_free_cache; @@ -394,7 +394,7 @@ static struct kmem_cache *create_cache(const char *name, s->refcount = 1; list_add(&s->list, &slab_caches); - memcg_link_cache(s); + 
memcg_link_cache(s, memcg); out: if (err) return ERR_PTR(err); @@ -997,7 +997,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, create_boot_cache(s, name, size, flags, useroffset, usersize); list_add(&s->list, &slab_caches); - memcg_link_cache(s); + memcg_link_cache(s, NULL); s->refcount = 1; return s; } diff --git a/mm/slub.c b/mm/slub.c index 5e217653286c..e1402ed19e74 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4199,7 +4199,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) } slab_init_memcg_params(s); list_add(&s->list, &slab_caches); - memcg_link_cache(s); + memcg_link_cache(s, NULL); return s; } From 0b14e8aa68223c2c124d408aa4b110b364d13c53 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:06 -0700 Subject: [PATCH 073/147] mm: memcg/slab: rename slab delayed deactivation functions and fields The delayed work/rcu deactivation infrastructure of non-root kmem_caches can be also used for asynchronous release of these objects. Let's get rid of the word "deactivation" in corresponding names to make the code look better after generalization. It's easier to make the renaming first, so that the generalized code will look consistent from scratch. Let's rename struct memcg_cache_params fields: deact_fn -> work_fn deact_rcu_head -> rcu_head deact_work -> work And RCU/delayed work callbacks in slab common code: kmemcg_deactivate_rcufn -> kmemcg_rcufn kmemcg_deactivate_workfn -> kmemcg_workfn This patch contains no functional changes, only renamings. Link: http://lkml.kernel.org/r/20190611231813.3148843-3-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Vladimir Davydov Reviewed-by: Shakeel Butt Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 6 +++--- mm/slab.h | 2 +- mm/slab_common.c | 30 +++++++++++++++--------------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 98c3d12b7275..6008d884e621 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -643,10 +643,10 @@ struct memcg_cache_params { struct list_head children_node; struct list_head kmem_caches_node; - void (*deact_fn)(struct kmem_cache *); + void (*work_fn)(struct kmem_cache *); union { - struct rcu_head deact_rcu_head; - struct work_struct deact_work; + struct rcu_head rcu_head; + struct work_struct work; }; }; }; diff --git a/mm/slab.h b/mm/slab.h index 86f7ede21203..7ef695b91919 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -291,7 +291,7 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, extern void slab_init_memcg_params(struct kmem_cache *); extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg); extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, - void (*deact_fn)(struct kmem_cache *)); + void (*work_fn)(struct kmem_cache *)); #else /* CONFIG_MEMCG_KMEM */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 07ee4189b40c..f4dd9f75751c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -691,17 +691,17 @@ out_unlock: put_online_cpus(); } -static void kmemcg_deactivate_workfn(struct work_struct *work) +static void kmemcg_workfn(struct work_struct *work) { struct kmem_cache *s = container_of(work, struct kmem_cache, - memcg_params.deact_work); + memcg_params.work); get_online_cpus(); get_online_mems(); 
mutex_lock(&slab_mutex); - s->memcg_params.deact_fn(s); + s->memcg_params.work_fn(s); mutex_unlock(&slab_mutex); @@ -712,36 +712,36 @@ static void kmemcg_deactivate_workfn(struct work_struct *work) css_put(&s->memcg_params.memcg->css); } -static void kmemcg_deactivate_rcufn(struct rcu_head *head) +static void kmemcg_rcufn(struct rcu_head *head) { struct kmem_cache *s = container_of(head, struct kmem_cache, - memcg_params.deact_rcu_head); + memcg_params.rcu_head); /* - * We need to grab blocking locks. Bounce to ->deact_work. The + * We need to grab blocking locks. Bounce to ->work. The * work item shares the space with the RCU head and can't be * initialized eariler. */ - INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn); - queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work); + INIT_WORK(&s->memcg_params.work, kmemcg_workfn); + queue_work(memcg_kmem_cache_wq, &s->memcg_params.work); } /** * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a * sched RCU grace period * @s: target kmem_cache - * @deact_fn: deactivation function to call + * @work_fn: deactivation function to call * - * Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex + * Schedule @work_fn to be invoked with online cpus, mems and slab_mutex * held after a sched RCU grace period. The slab is guaranteed to stay - * alive until @deact_fn is finished. This is to be used from + * alive until @work_fn is finished. This is to be used from * __kmemcg_cache_deactivate(). */ void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, - void (*deact_fn)(struct kmem_cache *)) + void (*work_fn)(struct kmem_cache *)) { if (WARN_ON_ONCE(is_root_cache(s)) || - WARN_ON_ONCE(s->memcg_params.deact_fn)) + WARN_ON_ONCE(s->memcg_params.work_fn)) return; if (s->memcg_params.root_cache->memcg_params.dying) @@ -750,8 +750,8 @@ void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, /* pin memcg so that @s doesn't get destroyed in the middle */ css_get(&s->memcg_params.memcg->css); - s->memcg_params.deact_fn = deact_fn; - call_rcu(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn); + s->memcg_params.work_fn = work_fn; + call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn); } void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) From 434866947564b954409c2fe561605e22f7b49f64 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:09 -0700 Subject: [PATCH 074/147] mm: memcg/slab: generalize postponed non-root kmem_cache deactivation Currently SLUB uses a work scheduled after an RCU grace period to deactivate a non-root kmem_cache. This mechanism can be reused for kmem_caches release, but requires generalization for SLAB case. Introduce kmemcg_cache_deactivate() function, which calls allocator-specific __kmem_cache_deactivate() and schedules execution of __kmem_cache_deactivate_after_rcu() with all necessary locks in a worker context after an rcu grace period. Here is the new calling scheme: kmemcg_cache_deactivate() __kmemcg_cache_deactivate() SLAB/SLUB-specific kmemcg_rcufn() rcu kmemcg_workfn() work __kmemcg_cache_deactivate_after_rcu() SLAB/SLUB-specific instead of: __kmemcg_cache_deactivate() SLAB/SLUB-specific slab_deactivate_memcg_cache_rcu_sched() SLUB-only kmemcg_rcufn() rcu kmemcg_workfn() work kmemcg_cache_deact_after_rcu() SLUB-only For consistency, all allocator-specific functions start with "__". 
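The reason for the rcu-then-work bounce in the scheme above is that an RCU callback runs in a context that cannot take blocking locks such as slab_mutex, so it only queues a work item and the locked part runs from a workqueue. A generic sketch of the two-stage deferral follows; the names are illustrative, not the patch's, but the real code shares the rcu_head/work_struct storage in a union the same way.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>

struct deferred_release {
	void (*final_fn)(struct deferred_release *d);
	union {
		struct rcu_head rcu;		/* stage 1: wait for a grace period */
		struct work_struct work;	/* stage 2: sleepable worker context */
	};
};

static void deferred_workfn(struct work_struct *work)
{
	struct deferred_release *d = container_of(work, struct deferred_release, work);

	/* Blocking locks (slab_mutex, cpu/mem hotplug) may be taken here. */
	d->final_fn(d);
}

static void deferred_rcufn(struct rcu_head *rcu)
{
	struct deferred_release *d = container_of(rcu, struct deferred_release, rcu);

	/*
	 * Softirq context: can't block, so reuse the storage as a work item
	 * and finish in process context.
	 */
	INIT_WORK(&d->work, deferred_workfn);
	queue_work(system_wq, &d->work);
}

/* Kick it off with: call_rcu(&d->rcu, deferred_rcufn); */
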
Link: http://lkml.kernel.org/r/20190611231813.3148843-4-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Vladimir Davydov Reviewed-by: Shakeel Butt Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 ++++ mm/slab.h | 3 +-- mm/slab_common.c | 27 ++++++++------------------- mm/slub.c | 8 +------- 4 files changed, 14 insertions(+), 28 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index badd98f7e2f1..30347bd3f19c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2252,6 +2252,10 @@ void __kmemcg_cache_deactivate(struct kmem_cache *cachep) { __kmem_cache_shrink(cachep); } + +void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s) +{ +} #endif int __kmem_cache_shutdown(struct kmem_cache *cachep) diff --git a/mm/slab.h b/mm/slab.h index 7ef695b91919..dc83583ee9dd 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -172,6 +172,7 @@ int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); int __kmem_cache_shrink(struct kmem_cache *); void __kmemcg_cache_deactivate(struct kmem_cache *s); +void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; @@ -290,8 +291,6 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, extern void slab_init_memcg_params(struct kmem_cache *); extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg); -extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, - void (*work_fn)(struct kmem_cache *)); #else /* CONFIG_MEMCG_KMEM */ diff --git a/mm/slab_common.c b/mm/slab_common.c index f4dd9f75751c..62733bbcc971 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -708,7 +708,7 @@ static void kmemcg_workfn(struct work_struct *work) put_online_mems(); put_online_cpus(); - /* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */ + /* done, put the ref from kmemcg_cache_deactivate() */ css_put(&s->memcg_params.memcg->css); } @@ -726,31 +726,21 @@ static void kmemcg_rcufn(struct rcu_head *head) queue_work(memcg_kmem_cache_wq, &s->memcg_params.work); } -/** - * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a - * sched RCU grace period - * @s: target kmem_cache - * @work_fn: deactivation function to call - * - * Schedule @work_fn to be invoked with online cpus, mems and slab_mutex - * held after a sched RCU grace period. The slab is guaranteed to stay - * alive until @work_fn is finished. This is to be used from - * __kmemcg_cache_deactivate(). 
- */ -void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, - void (*work_fn)(struct kmem_cache *)) +static void kmemcg_cache_deactivate(struct kmem_cache *s) { if (WARN_ON_ONCE(is_root_cache(s)) || WARN_ON_ONCE(s->memcg_params.work_fn)) return; + __kmemcg_cache_deactivate(s); + if (s->memcg_params.root_cache->memcg_params.dying) return; /* pin memcg so that @s doesn't get destroyed in the middle */ css_get(&s->memcg_params.memcg->css); - s->memcg_params.work_fn = work_fn; + s->memcg_params.work_fn = __kmemcg_cache_deactivate_after_rcu; call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn); } @@ -773,7 +763,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) if (!c) continue; - __kmemcg_cache_deactivate(c); + kmemcg_cache_deactivate(c); arr->entries[idx] = NULL; } mutex_unlock(&slab_mutex); @@ -866,11 +856,10 @@ static void flush_memcg_workqueue(struct kmem_cache *s) mutex_unlock(&slab_mutex); /* - * SLUB deactivates the kmem_caches through call_rcu. Make + * SLAB and SLUB deactivate the kmem_caches through call_rcu. Make * sure all registered rcu callbacks have been invoked. */ - if (IS_ENABLED(CONFIG_SLUB)) - rcu_barrier(); + rcu_barrier(); /* * SLAB and SLUB create memcg kmem_caches through workqueue and SLUB diff --git a/mm/slub.c b/mm/slub.c index e1402ed19e74..845aeaa6c2d4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4008,7 +4008,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) } #ifdef CONFIG_MEMCG -static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s) +void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s) { /* * Called with all the locks held after a sched RCU grace period. @@ -4034,12 +4034,6 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s) */ slub_set_cpu_partial(s, 0); s->min_partial = 0; - - /* - * s->cpu_partial is checked locklessly (see put_cpu_partial), so - * we have to make sure the change is visible before shrinking. - */ - slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu); } #endif /* CONFIG_MEMCG */ From 49a18eae2e98a794477b5af5d85938e430c0be72 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:13 -0700 Subject: [PATCH 075/147] mm: memcg/slab: introduce __memcg_kmem_uncharge_memcg() Let's separate the page counter modification code out of __memcg_kmem_uncharge() in a way similar to what __memcg_kmem_charge() and __memcg_kmem_charge_memcg() work. This will allow to reuse this code later using a new memcg_kmem_uncharge_memcg() wrapper, which calls __memcg_kmem_uncharge_memcg() if memcg_kmem_enabled() check is passed. 
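A hedged sketch of the kind of caller the cover letter has in mind for this wrapper, i.e. a later patch in the series rather than code from this one: on slab page release, resolve the memcg through the kmem_cache instead of page->mem_cgroup and drop the charge against it directly.

/* Illustrative only (assumed later caller, hypothetical helper name). */
static void uncharge_slab_page_sketch(struct page *page, int order,
				      struct kmem_cache *s)
{
	if (!is_root_cache(s))
		memcg_kmem_uncharge_memcg(page, order,
					  s->memcg_params.memcg);
}
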
Link: http://lkml.kernel.org/r/20190611231813.3148843-5-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++++++++++ mm/memcontrol.c | 25 +++++++++++++++++-------- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 22141ebc5e15..68402842c337 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1278,6 +1278,8 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge(struct page *page, int order); int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct mem_cgroup *memcg); +void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg, + unsigned int nr_pages); extern struct static_key_false memcg_kmem_enabled_key; extern struct workqueue_struct *memcg_kmem_cache_wq; @@ -1319,6 +1321,14 @@ static inline int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, return __memcg_kmem_charge_memcg(page, gfp, order, memcg); return 0; } + +static inline void memcg_kmem_uncharge_memcg(struct page *page, int order, + struct mem_cgroup *memcg) +{ + if (memcg_kmem_enabled()) + __memcg_kmem_uncharge_memcg(memcg, 1 << order); +} + /* * helper for accessing a memcg's index. It will be used as an index in the * child cache array in kmem_cache, and also to derive its name. This function diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6de79ec3cd21..25e35a8b8ba2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2807,6 +2807,22 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) css_put(&memcg->css); return ret; } + +/** + * __memcg_kmem_uncharge_memcg: uncharge a kmem page + * @memcg: memcg to uncharge + * @nr_pages: number of pages to uncharge + */ +void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg, + unsigned int nr_pages) +{ + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->kmem, nr_pages); + + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_uncharge(&memcg->memsw, nr_pages); +} /** * __memcg_kmem_uncharge: uncharge a kmem page * @page: page to uncharge @@ -2821,14 +2837,7 @@ void __memcg_kmem_uncharge(struct page *page, int order) return; VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->kmem, nr_pages); - - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); - + __memcg_kmem_uncharge_memcg(memcg, nr_pages); page->mem_cgroup = NULL; /* slab pages do not have PageKmemcg flag set */ From 6cea1d569d24af6f9e95f70cb301807440ae2981 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:16 -0700 Subject: [PATCH 076/147] mm: memcg/slab: unify SLAB and SLUB page accounting Currently the page accounting code is duplicated in SLAB and SLUB internals. Let's move it into new (un)charge_slab_page helpers in the slab_common.c file. These helpers will be responsible for statistics (global and memcg-aware) and memcg charging. So they are replacing direct memcg_(un)charge_slab() calls. 
Link: http://lkml.kernel.org/r/20190611231813.3148843-6-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Christoph Lameter Acked-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 19 +++---------------- mm/slab.h | 25 +++++++++++++++++++++++++ mm/slub.c | 14 ++------------ 3 files changed, 30 insertions(+), 28 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 30347bd3f19c..e9d90b0da47b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1360,7 +1360,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct page *page; - int nr_pages; flags |= cachep->allocflags; @@ -1370,17 +1369,11 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, return NULL; } - if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) { + if (charge_slab_page(page, flags, cachep->gfporder, cachep)) { __free_pages(page, cachep->gfporder); return NULL; } - nr_pages = (1 << cachep->gfporder); - if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, nr_pages); - else - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, nr_pages); - __SetPageSlab(page); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ if (sk_memalloc_socks() && page_is_pfmemalloc(page)) @@ -1395,12 +1388,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, static void kmem_freepages(struct kmem_cache *cachep, struct page *page) { int order = cachep->gfporder; - unsigned long nr_freed = (1 << order); - - if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed); - else - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, -nr_freed); BUG_ON(!PageSlab(page)); __ClearPageSlabPfmemalloc(page); @@ -1409,8 +1396,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) page->mapping = NULL; if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += nr_freed; - memcg_uncharge_slab(page, order, cachep); + current->reclaim_state->reclaimed_slab += 1 << order; + uncharge_slab_page(page, order, cachep); __free_pages(page, order); } diff --git a/mm/slab.h b/mm/slab.h index dc83583ee9dd..46623a576a3c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -205,6 +205,12 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); +static inline int cache_vmstat_idx(struct kmem_cache *s) +{ + return (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE; +} + #ifdef CONFIG_MEMCG_KMEM /* List of all root caches. 
*/ @@ -361,6 +367,25 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) return page->slab_cache; } +static __always_inline int charge_slab_page(struct page *page, + gfp_t gfp, int order, + struct kmem_cache *s) +{ + int ret = memcg_charge_slab(page, gfp, order, s); + + if (!ret) + mod_lruvec_page_state(page, cache_vmstat_idx(s), 1 << order); + + return ret; +} + +static __always_inline void uncharge_slab_page(struct page *page, int order, + struct kmem_cache *s) +{ + mod_lruvec_page_state(page, cache_vmstat_idx(s), -(1 << order)); + memcg_uncharge_slab(page, order, s); +} + static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { struct kmem_cache *cachep; diff --git a/mm/slub.c b/mm/slub.c index 845aeaa6c2d4..c9541a480627 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1488,7 +1488,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, else page = __alloc_pages_node(node, flags, order); - if (page && memcg_charge_slab(page, flags, order, s)) { + if (page && charge_slab_page(page, flags, order, s)) { __free_pages(page, order); page = NULL; } @@ -1681,11 +1681,6 @@ out: if (!page) return NULL; - mod_lruvec_page_state(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - 1 << oo_order(oo)); - inc_slabs_node(s, page_to_nid(page), page->objects); return page; @@ -1719,18 +1714,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page) check_object(s, page, p, SLUB_RED_INACTIVE); } - mod_lruvec_page_state(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - -pages); - __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); page->mapping = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - memcg_uncharge_slab(page, order, s); + uncharge_slab_page(page, order, s); __free_pages(page, order); } From 570332978ea7fdbec86a07086a584d796a87da2c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:20 -0700 Subject: [PATCH 077/147] mm: memcg/slab: don't check the dying flag on kmem_cache creation There is no point in checking the root_cache->memcg_params.dying flag on kmem_cache creation path. New allocations shouldn't be performed using a dead root kmem_cache, so no new memcg kmem_cache creation can be scheduled after the flag is set. And if it was scheduled before, flush_memcg_workqueue() will wait for it anyway. So let's drop this check to simplify the code. Link: http://lkml.kernel.org/r/20190611231813.3148843-7-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Vladimir Davydov Reviewed-by: Shakeel Butt Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 62733bbcc971..afdd73553b88 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -640,7 +640,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, * The memory cgroup could have been offlined while the cache * creation work was pending. 
*/ - if (memcg->kmem_state != KMEM_ONLINE || root_cache->memcg_params.dying) + if (memcg->kmem_state != KMEM_ONLINE) goto out_unlock; idx = memcg_cache_id(memcg); From 63b02ef7dc4ec239df45c018ac0adbd02ba30a0c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:24 -0700 Subject: [PATCH 078/147] mm: memcg/slab: synchronize access to kmem_cache dying flag using a spinlock Currently the memcg_params.dying flag and the corresponding workqueue used for the asynchronous deactivation of kmem_caches is synchronized using the slab_mutex. It makes impossible to check this flag from the irq context, which will be required in order to implement asynchronous release of kmem_caches. So let's switch over to the irq-save flavor of the spinlock-based synchronization. Link: http://lkml.kernel.org/r/20190611231813.3148843-8-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Vladimir Davydov Reviewed-by: Shakeel Butt Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index afdd73553b88..a15557776d7d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -130,6 +130,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, #ifdef CONFIG_MEMCG_KMEM LIST_HEAD(slab_root_caches); +static DEFINE_SPINLOCK(memcg_kmem_wq_lock); void slab_init_memcg_params(struct kmem_cache *s) { @@ -734,14 +735,22 @@ static void kmemcg_cache_deactivate(struct kmem_cache *s) __kmemcg_cache_deactivate(s); + /* + * memcg_kmem_wq_lock is used to synchronize memcg_params.dying + * flag and make sure that no new kmem_cache deactivation tasks + * are queued (see flush_memcg_workqueue() ). + */ + spin_lock_irq(&memcg_kmem_wq_lock); if (s->memcg_params.root_cache->memcg_params.dying) - return; + goto unlock; /* pin memcg so that @s doesn't get destroyed in the middle */ css_get(&s->memcg_params.memcg->css); s->memcg_params.work_fn = __kmemcg_cache_deactivate_after_rcu; call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn); +unlock: + spin_unlock_irq(&memcg_kmem_wq_lock); } void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) @@ -851,9 +860,9 @@ static int shutdown_memcg_caches(struct kmem_cache *s) static void flush_memcg_workqueue(struct kmem_cache *s) { - mutex_lock(&slab_mutex); + spin_lock_irq(&memcg_kmem_wq_lock); s->memcg_params.dying = true; - mutex_unlock(&slab_mutex); + spin_unlock_irq(&memcg_kmem_wq_lock); /* * SLAB and SLUB deactivate the kmem_caches through call_rcu. Make From f0a3a24b532d9a7e56a33c5112b2a212ed6ec580 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:27 -0700 Subject: [PATCH 079/147] mm: memcg/slab: rework non-root kmem_cache lifecycle management Currently each charged slab page holds a reference to the cgroup to which it's charged. Kmem_caches are held by the memcg and are released all together with the memory cgroup. It means that none of kmem_caches are released unless at least one reference to the memcg exists, which is very far from optimal. Let's rework it in a way that allows releasing individual kmem_caches as soon as the cgroup is offline, the kmem_cache is empty and there are no pending allocations. To make it possible, let's introduce a new percpu refcounter for non-root kmem caches. 
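In outline, the lifecycle of the new counter looks like this (a condensed sketch drawn from the diffs below; locking, statistics and error handling omitted):

    /* non-root cache creation: the ref starts out in percpu mode */
    percpu_ref_init(&s->memcg_params.refcnt, kmemcg_cache_shutdown,
                    0, GFP_KERNEL);

    /* charge/uncharge: every charged slab page holds references */
    percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
    percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);

    /* deactivation: switch to atomic mode and drop the initial ref */
    percpu_ref_kill(&s->memcg_params.refcnt);

    /* kmemcg_cache_shutdown() runs once the count hits zero and queues
     * the work that finally calls shutdown_cache() */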
The counter is initialized to the percpu mode, and is switched to the atomic mode during kmem_cache deactivation. The counter is bumped for every charged page and also for every running allocation. So the kmem_cache can't be released unless all allocations complete. To shutdown non-active empty kmem_caches, let's reuse the work queue, previously used for the kmem_cache deactivation. Once the reference counter reaches 0, let's schedule an asynchronous kmem_cache release. * I used the following simple approach to test the performance (stolen from another patchset by T. Harding): time find / -name fname-no-exist echo 2 > /proc/sys/vm/drop_caches repeat 10 times Results: orig patched real 0m1.455s real 0m1.355s user 0m0.206s user 0m0.219s sys 0m0.855s sys 0m0.807s real 0m1.487s real 0m1.699s user 0m0.221s user 0m0.256s sys 0m0.806s sys 0m0.948s real 0m1.515s real 0m1.505s user 0m0.183s user 0m0.215s sys 0m0.876s sys 0m0.858s real 0m1.291s real 0m1.380s user 0m0.193s user 0m0.198s sys 0m0.843s sys 0m0.786s real 0m1.364s real 0m1.374s user 0m0.180s user 0m0.182s sys 0m0.868s sys 0m0.806s real 0m1.352s real 0m1.312s user 0m0.201s user 0m0.212s sys 0m0.820s sys 0m0.761s real 0m1.302s real 0m1.349s user 0m0.205s user 0m0.203s sys 0m0.803s sys 0m0.792s real 0m1.334s real 0m1.301s user 0m0.194s user 0m0.201s sys 0m0.806s sys 0m0.779s real 0m1.426s real 0m1.434s user 0m0.216s user 0m0.181s sys 0m0.824s sys 0m0.864s real 0m1.350s real 0m1.295s user 0m0.200s user 0m0.190s sys 0m0.842s sys 0m0.811s So it looks like the difference is not noticeable in this test. [cai@lca.pw: fix an use-after-free in kmemcg_workfn()] Link: http://lkml.kernel.org/r/1560977573-10715-1-git-send-email-cai@lca.pw Link: http://lkml.kernel.org/r/20190611231813.3148843-9-guro@fb.com Signed-off-by: Roman Gushchin Signed-off-by: Qian Cai Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 3 +- mm/memcontrol.c | 50 +++++++++++++++++++++------- mm/slab.h | 44 +++++++------------------ mm/slab_common.c | 78 +++++++++++++++++++++++++------------------- 4 files changed, 96 insertions(+), 79 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 6008d884e621..bc189a43e680 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -16,6 +16,7 @@ #include #include #include +#include /* @@ -152,7 +153,6 @@ int kmem_cache_shrink(struct kmem_cache *); void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); void memcg_deactivate_kmem_caches(struct mem_cgroup *); -void memcg_destroy_kmem_caches(struct mem_cgroup *); /* * Please use this macro to create slab caches. 
Simply specify the @@ -642,6 +642,7 @@ struct memcg_cache_params { struct mem_cgroup *memcg; struct list_head children_node; struct list_head kmem_caches_node; + struct percpu_ref refcnt; void (*work_fn)(struct kmem_cache *); union { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 25e35a8b8ba2..ce4ce5e7937b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2667,12 +2667,13 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, { struct memcg_kmem_cache_create_work *cw; + if (!css_tryget_online(&memcg->css)) + return; + cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN); if (!cw) return; - css_get(&memcg->css); - cw->memcg = memcg; cw->cachep = cachep; INIT_WORK(&cw->work, memcg_kmem_cache_create_func); @@ -2707,6 +2708,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; + struct memcg_cache_array *arr; int kmemcg_id; VM_BUG_ON(!is_root_cache(cachep)); @@ -2714,14 +2716,28 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) if (memcg_kmem_bypass()) return cachep; - memcg = get_mem_cgroup_from_current(); + rcu_read_lock(); + + if (unlikely(current->active_memcg)) + memcg = current->active_memcg; + else + memcg = mem_cgroup_from_task(current); + + if (!memcg || memcg == root_mem_cgroup) + goto out_unlock; + kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) - goto out; + goto out_unlock; - memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); - if (likely(memcg_cachep)) - return memcg_cachep; + arr = rcu_dereference(cachep->memcg_params.memcg_caches); + + /* + * Make sure we will access the up-to-date value. The code updating + * memcg_caches issues a write barrier to match the data dependency + * barrier inside READ_ONCE() (see memcg_create_kmem_cache()). + */ + memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]); /* * If we are in a safe context (can wait, and not in interrupt @@ -2734,10 +2750,20 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) * memcg_create_kmem_cache, this means no further allocation * could happen with the slab_mutex held. So it's better to * defer everything. + * + * If the memcg is dying or memcg_cache is about to be released, + * don't bother creating new kmem_caches. Because memcg_cachep + * is ZEROed as the fist step of kmem offlining, we don't need + * percpu_ref_tryget_live() here. css_tryget_online() check in + * memcg_schedule_kmem_cache_create() will prevent us from + * creation of a new kmem_cache. 
*/ - memcg_schedule_kmem_cache_create(memcg, cachep); -out: - css_put(&memcg->css); + if (unlikely(!memcg_cachep)) + memcg_schedule_kmem_cache_create(memcg, cachep); + else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt)) + cachep = memcg_cachep; +out_unlock: + rcu_read_unlock(); return cachep; } @@ -2748,7 +2774,7 @@ out: void memcg_kmem_put_cache(struct kmem_cache *cachep) { if (!is_root_cache(cachep)) - css_put(&cachep->memcg_params.memcg->css); + percpu_ref_put(&cachep->memcg_params.refcnt); } /** @@ -3295,7 +3321,7 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) memcg_offline_kmem(memcg); if (memcg->kmem_state == KMEM_ALLOCATED) { - memcg_destroy_kmem_caches(memcg); + WARN_ON(!list_empty(&memcg->kmem_caches)); static_branch_dec(&memcg_kmem_enabled_key); WARN_ON(page_counter_read(&memcg->kmem)); } diff --git a/mm/slab.h b/mm/slab.h index 46623a576a3c..5d2b8511e6fb 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -248,31 +248,6 @@ static inline const char *cache_name(struct kmem_cache *s) return s->name; } -/* - * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. - * That said the caller must assure the memcg's cache won't go away by either - * taking a css reference to the owner cgroup, or holding the slab_mutex. - */ -static inline struct kmem_cache * -cache_from_memcg_idx(struct kmem_cache *s, int idx) -{ - struct kmem_cache *cachep; - struct memcg_cache_array *arr; - - rcu_read_lock(); - arr = rcu_dereference(s->memcg_params.memcg_caches); - - /* - * Make sure we will access the up-to-date value. The code updating - * memcg_caches issues a write barrier to match this (see - * memcg_create_kmem_cache()). - */ - cachep = READ_ONCE(arr->entries[idx]); - rcu_read_unlock(); - - return cachep; -} - static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) { if (is_root_cache(s)) @@ -284,14 +259,25 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { + int ret; + if (is_root_cache(s)) return 0; - return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); + + ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); + if (ret) + return ret; + + percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order); + + return 0; } static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct kmem_cache *s) { + if (!is_root_cache(s)) + percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order); memcg_kmem_uncharge(page, order); } @@ -323,12 +309,6 @@ static inline const char *cache_name(struct kmem_cache *s) return s->name; } -static inline struct kmem_cache * -cache_from_memcg_idx(struct kmem_cache *s, int idx) -{ - return NULL; -} - static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) { return s; diff --git a/mm/slab_common.c b/mm/slab_common.c index a15557776d7d..ee3971f7fabc 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -132,6 +132,8 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, LIST_HEAD(slab_root_caches); static DEFINE_SPINLOCK(memcg_kmem_wq_lock); +static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref); + void slab_init_memcg_params(struct kmem_cache *s) { s->memcg_params.root_cache = NULL; @@ -146,6 +148,12 @@ static int init_memcg_params(struct kmem_cache *s, struct memcg_cache_array *arr; if (root_cache) { + int ret = percpu_ref_init(&s->memcg_params.refcnt, + kmemcg_cache_shutdown, + 0, GFP_KERNEL); + if (ret) + return ret; + s->memcg_params.root_cache = root_cache; 
INIT_LIST_HEAD(&s->memcg_params.children_node); INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node); @@ -171,6 +179,8 @@ static void destroy_memcg_params(struct kmem_cache *s) { if (is_root_cache(s)) kvfree(rcu_access_pointer(s->memcg_params.memcg_caches)); + else + percpu_ref_exit(&s->memcg_params.refcnt); } static void free_memcg_params(struct rcu_head *rcu) @@ -226,6 +236,7 @@ void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg) if (is_root_cache(s)) { list_add(&s->root_caches_node, &slab_root_caches); } else { + css_get(&memcg->css); s->memcg_params.memcg = memcg; list_add(&s->memcg_params.children_node, &s->memcg_params.root_cache->memcg_params.children); @@ -241,6 +252,7 @@ static void memcg_unlink_cache(struct kmem_cache *s) } else { list_del(&s->memcg_params.children_node); list_del(&s->memcg_params.kmem_caches_node); + css_put(&s->memcg_params.memcg->css); } } #else @@ -678,7 +690,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, } /* - * Since readers won't lock (see cache_from_memcg_idx()), we need a + * Since readers won't lock (see memcg_kmem_get_cache()), we need a * barrier here to ensure nobody will see the kmem_cache partially * initialized. */ @@ -701,16 +713,11 @@ static void kmemcg_workfn(struct work_struct *work) get_online_mems(); mutex_lock(&slab_mutex); - s->memcg_params.work_fn(s); - mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); - - /* done, put the ref from kmemcg_cache_deactivate() */ - css_put(&s->memcg_params.memcg->css); } static void kmemcg_rcufn(struct rcu_head *head) @@ -727,10 +734,38 @@ static void kmemcg_rcufn(struct rcu_head *head) queue_work(memcg_kmem_cache_wq, &s->memcg_params.work); } +static void kmemcg_cache_shutdown_fn(struct kmem_cache *s) +{ + WARN_ON(shutdown_cache(s)); +} + +static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref) +{ + struct kmem_cache *s = container_of(percpu_ref, struct kmem_cache, + memcg_params.refcnt); + unsigned long flags; + + spin_lock_irqsave(&memcg_kmem_wq_lock, flags); + if (s->memcg_params.root_cache->memcg_params.dying) + goto unlock; + + s->memcg_params.work_fn = kmemcg_cache_shutdown_fn; + INIT_WORK(&s->memcg_params.work, kmemcg_workfn); + queue_work(memcg_kmem_cache_wq, &s->memcg_params.work); + +unlock: + spin_unlock_irqrestore(&memcg_kmem_wq_lock, flags); +} + +static void kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s) +{ + __kmemcg_cache_deactivate_after_rcu(s); + percpu_ref_kill(&s->memcg_params.refcnt); +} + static void kmemcg_cache_deactivate(struct kmem_cache *s) { - if (WARN_ON_ONCE(is_root_cache(s)) || - WARN_ON_ONCE(s->memcg_params.work_fn)) + if (WARN_ON_ONCE(is_root_cache(s))) return; __kmemcg_cache_deactivate(s); @@ -744,10 +779,7 @@ static void kmemcg_cache_deactivate(struct kmem_cache *s) if (s->memcg_params.root_cache->memcg_params.dying) goto unlock; - /* pin memcg so that @s doesn't get destroyed in the middle */ - css_get(&s->memcg_params.memcg->css); - - s->memcg_params.work_fn = __kmemcg_cache_deactivate_after_rcu; + s->memcg_params.work_fn = kmemcg_cache_deactivate_after_rcu; call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn); unlock: spin_unlock_irq(&memcg_kmem_wq_lock); @@ -781,28 +813,6 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) put_online_cpus(); } -void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) -{ - struct kmem_cache *s, *s2; - - get_online_cpus(); - get_online_mems(); - - mutex_lock(&slab_mutex); - list_for_each_entry_safe(s, s2, &memcg->kmem_caches, - memcg_params.kmem_caches_node) { - /* - * 
The cgroup is about to be freed and therefore has no charges - * left. Hence, all its caches must be empty by now. - */ - BUG_ON(shutdown_cache(s)); - } - mutex_unlock(&slab_mutex); - - put_online_mems(); - put_online_cpus(); -} - static int shutdown_memcg_caches(struct kmem_cache *s) { struct memcg_cache_array *arr; From 4d96ba3530750fae3f3f01150adfecde96157815 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:31 -0700 Subject: [PATCH 080/147] mm: memcg/slab: stop setting page->mem_cgroup pointer for slab pages Every slab page charged to a non-root memory cgroup has a pointer to the memory cgroup and holds a reference to it, which protects a non-empty memory cgroup from being released. At the same time the page has a pointer to the corresponding kmem_cache, and also hold a reference to the kmem_cache. And kmem_cache by itself holds a reference to the cgroup. So there is clearly some redundancy, which allows to stop setting the page->mem_cgroup pointer and rely on getting memcg pointer indirectly via kmem_cache. Further it will allow to change this pointer easier, without a need to go over all charged pages. So let's stop setting page->mem_cgroup pointer for slab pages, and stop using the css refcounter directly for protecting the memory cgroup from going away. Instead rely on kmem_cache as an intermediate object. Make sure that vmstats and shrinker lists are working as previously, as well as /proc/kpagecgroup interface. Link: http://lkml.kernel.org/r/20190611231813.3148843-10-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Vladimir Davydov Reviewed-by: Shakeel Butt Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 3 +- mm/memcontrol.c | 12 ++++---- mm/slab.h | 74 ++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 70 insertions(+), 19 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 927d85be32f6..0f1f6b06b7f3 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -12,6 +12,7 @@ #include #include #include +#include "slab.h" #ifdef CONFIG_MEMCG_KMEM static LIST_HEAD(list_lrus); @@ -63,7 +64,7 @@ static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) if (!memcg_kmem_enabled()) return NULL; page = virt_to_head_page(ptr); - return page->mem_cgroup; + return memcg_from_slab_page(page); } static inline struct list_lru_one * diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ce4ce5e7937b..fa39e51b3d94 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -486,7 +486,10 @@ ino_t page_cgroup_ino(struct page *page) unsigned long ino = 0; rcu_read_lock(); - memcg = READ_ONCE(page->mem_cgroup); + if (PageHead(page) && PageSlab(page)) + memcg = memcg_from_slab_page(page); + else + memcg = READ_ONCE(page->mem_cgroup); while (memcg && !(memcg->css.flags & CSS_ONLINE)) memcg = parent_mem_cgroup(memcg); if (memcg) @@ -2802,9 +2805,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, cancel_charge(memcg, nr_pages); return -ENOMEM; } - - page->mem_cgroup = memcg; - return 0; } @@ -2827,8 +2827,10 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) memcg = get_mem_cgroup_from_current(); if (!mem_cgroup_is_root(memcg)) { ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); - if (!ret) + if (!ret) { + page->mem_cgroup = memcg; __SetPageKmemcg(page); + } } css_put(&memcg->css); return ret; diff --git a/mm/slab.h 
b/mm/slab.h index 5d2b8511e6fb..7ead47cb9338 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -255,30 +255,67 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s->memcg_params.root_cache; } +/* + * Expects a pointer to a slab page. Please note, that PageSlab() check + * isn't sufficient, as it returns true also for tail compound slab pages, + * which do not have slab_cache pointer set. + * So this function assumes that the page can pass PageHead() and PageSlab() + * checks. + */ +static inline struct mem_cgroup *memcg_from_slab_page(struct page *page) +{ + struct kmem_cache *s; + + s = READ_ONCE(page->slab_cache); + if (s && !is_root_cache(s)) + return s->memcg_params.memcg; + + return NULL; +} + +/* + * Charge the slab page belonging to the non-root kmem_cache. + * Can be called for non-root kmem_caches only. + */ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { + struct mem_cgroup *memcg; + struct lruvec *lruvec; int ret; - if (is_root_cache(s)) - return 0; - - ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); + memcg = s->memcg_params.memcg; + ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); if (ret) return ret; + lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order); + + /* transer try_charge() page references to kmem_cache */ percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order); + css_put_many(&memcg->css, 1 << order); return 0; } +/* + * Uncharge a slab page belonging to a non-root kmem_cache. + * Can be called for non-root kmem_caches only. + */ static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct kmem_cache *s) { - if (!is_root_cache(s)) - percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order); - memcg_kmem_uncharge(page, order); + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + memcg = s->memcg_params.memcg; + lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order)); + memcg_kmem_uncharge_memcg(page, order, memcg); + + percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order); } extern void slab_init_memcg_params(struct kmem_cache *); @@ -314,6 +351,11 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s; } +static inline struct mem_cgroup *memcg_from_slab_page(struct page *page) +{ + return NULL; +} + static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { @@ -351,18 +393,24 @@ static __always_inline int charge_slab_page(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { - int ret = memcg_charge_slab(page, gfp, order, s); + if (is_root_cache(s)) { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + 1 << order); + return 0; + } - if (!ret) - mod_lruvec_page_state(page, cache_vmstat_idx(s), 1 << order); - - return ret; + return memcg_charge_slab(page, gfp, order, s); } static __always_inline void uncharge_slab_page(struct page *page, int order, struct kmem_cache *s) { - mod_lruvec_page_state(page, cache_vmstat_idx(s), -(1 << order)); + if (is_root_cache(s)) { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + -(1 << order)); + return; + } + memcg_uncharge_slab(page, order, s); } From fb2f2b0adb98bbbbbb51c5a5327f3f90f5dc417e Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:34 -0700 Subject: [PATCH 081/147] mm: memcg/slab: reparent memcg kmem_caches on cgroup removal Let's 
reparent non-root kmem_caches on memcg offlining. This allows us to release the memory cgroup without waiting for the last outstanding kernel object (e.g. dentry used by another application). Since the parent cgroup is already charged, everything we need to do is to splice the list of kmem_caches to the parent's kmem_caches list, swap the memcg pointer, drop the css refcounter for each kmem_cache and adjust the parent's css refcounter. Please, note that kmem_cache->memcg_params.memcg isn't a stable pointer anymore. It's safe to read it under rcu_read_lock(), cgroup_mutex held, or any other way that protects the memory cgroup from being released. We can race with the slab allocation and deallocation paths. It's not a big problem: parent's charge and slab global stats are always correct, and we don't care anymore about the child usage and global stats. The child cgroup is already offline, so we don't use or show it anywhere. Local slab stats (NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE) aren't used anywhere except count_shadow_nodes(). But even there it won't break anything: after reparenting "nodes" will be 0 on child level (because we're already reparenting shrinker lists), and on parent level page stats always were 0, and this patch won't change anything. [guro@fb.com: properly handle kmem_caches reparented to root_mem_cgroup] Link: http://lkml.kernel.org/r/20190620213427.1691847-1-guro@fb.com Link: http://lkml.kernel.org/r/20190611231813.3148843-11-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Vladimir Davydov Reviewed-by: Shakeel Butt Acked-by: David Rientjes Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 2 +- mm/memcontrol.c | 14 ++++++++------ mm/slab.h | 41 ++++++++++++++++++++++++++++++++--------- mm/slab_common.c | 19 +++++++++++++++++-- 4 files changed, 58 insertions(+), 18 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index bc189a43e680..fd0ef2e16178 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -152,7 +152,7 @@ void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); -void memcg_deactivate_kmem_caches(struct mem_cgroup *); +void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *); /* * Please use this macro to create slab caches. 
Simply specify the diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fa39e51b3d94..2cb7e4e5c51a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3284,15 +3284,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) */ memcg->kmem_state = KMEM_ALLOCATED; - memcg_deactivate_kmem_caches(memcg); - - kmemcg_id = memcg->kmemcg_id; - BUG_ON(kmemcg_id < 0); - parent = parent_mem_cgroup(memcg); if (!parent) parent = root_mem_cgroup; + memcg_deactivate_kmem_caches(memcg, parent); + + kmemcg_id = memcg->kmemcg_id; + BUG_ON(kmemcg_id < 0); + /* * Change kmemcg_id of this cgroup and all its descendants to the * parent's id, and then move all entries from this cgroup's list_lrus @@ -3325,7 +3325,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) if (memcg->kmem_state == KMEM_ALLOCATED) { WARN_ON(!list_empty(&memcg->kmem_caches)); static_branch_dec(&memcg_kmem_enabled_key); - WARN_ON(page_counter_read(&memcg->kmem)); } } #else @@ -4773,6 +4772,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) /* The following stuff does not apply to the root */ if (!parent) { +#ifdef CONFIG_MEMCG_KMEM + INIT_LIST_HEAD(&memcg->kmem_caches); +#endif root_mem_cgroup = memcg; return &memcg->css; } diff --git a/mm/slab.h b/mm/slab.h index 7ead47cb9338..a62372d0f271 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -261,6 +261,9 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) * which do not have slab_cache pointer set. * So this function assumes that the page can pass PageHead() and PageSlab() * checks. + * + * The kmem_cache can be reparented asynchronously. The caller must ensure + * the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex. */ static inline struct mem_cgroup *memcg_from_slab_page(struct page *page) { @@ -268,7 +271,7 @@ static inline struct mem_cgroup *memcg_from_slab_page(struct page *page) s = READ_ONCE(page->slab_cache); if (s && !is_root_cache(s)) - return s->memcg_params.memcg; + return READ_ONCE(s->memcg_params.memcg); return NULL; } @@ -285,10 +288,22 @@ static __always_inline int memcg_charge_slab(struct page *page, struct lruvec *lruvec; int ret; - memcg = s->memcg_params.memcg; + rcu_read_lock(); + memcg = READ_ONCE(s->memcg_params.memcg); + while (memcg && !css_tryget_online(&memcg->css)) + memcg = parent_mem_cgroup(memcg); + rcu_read_unlock(); + + if (unlikely(!memcg || mem_cgroup_is_root(memcg))) { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + (1 << order)); + percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order); + return 0; + } + ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); if (ret) - return ret; + goto out; lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order); @@ -296,8 +311,9 @@ static __always_inline int memcg_charge_slab(struct page *page, /* transer try_charge() page references to kmem_cache */ percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order); css_put_many(&memcg->css, 1 << order); - - return 0; +out: + css_put(&memcg->css); + return ret; } /* @@ -310,10 +326,17 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct mem_cgroup *memcg; struct lruvec *lruvec; - memcg = s->memcg_params.memcg; - lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); - mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order)); - memcg_kmem_uncharge_memcg(page, order, memcg); + rcu_read_lock(); + memcg = READ_ONCE(s->memcg_params.memcg); + if (likely(!mem_cgroup_is_root(memcg))) { + lruvec = 
mem_cgroup_lruvec(page_pgdat(page), memcg); + mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order)); + memcg_kmem_uncharge_memcg(page, order, memcg); + } else { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + -(1 << order)); + } + rcu_read_unlock(); percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order); } diff --git a/mm/slab_common.c b/mm/slab_common.c index ee3971f7fabc..b893eefb6229 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -252,7 +252,8 @@ static void memcg_unlink_cache(struct kmem_cache *s) } else { list_del(&s->memcg_params.children_node); list_del(&s->memcg_params.kmem_caches_node); - css_put(&s->memcg_params.memcg->css); + mem_cgroup_put(s->memcg_params.memcg); + WRITE_ONCE(s->memcg_params.memcg, NULL); } } #else @@ -785,11 +786,13 @@ unlock: spin_unlock_irq(&memcg_kmem_wq_lock); } -void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) +void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg, + struct mem_cgroup *parent) { int idx; struct memcg_cache_array *arr; struct kmem_cache *s, *c; + unsigned int nr_reparented; idx = memcg_cache_id(memcg); @@ -807,6 +810,18 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) kmemcg_cache_deactivate(c); arr->entries[idx] = NULL; } + nr_reparented = 0; + list_for_each_entry(s, &memcg->kmem_caches, + memcg_params.kmem_caches_node) { + WRITE_ONCE(s->memcg_params.memcg, parent); + css_put(&memcg->css); + nr_reparented++; + } + if (nr_reparented) { + list_splice_init(&memcg->kmem_caches, + &parent->kmem_caches); + css_get_many(&parent->css, nr_reparented); + } mutex_unlock(&slab_mutex); put_online_mems(); From fcf8a1e483490cd249df4e02d5425636c3f43c86 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 11 Jul 2019 20:56:38 -0700 Subject: [PATCH 082/147] mm, memcg: add a memcg_slabinfo debugfs file There are concerns about memory leaks from extensive use of memory cgroups as each memory cgroup creates its own set of kmem caches. There is a possiblity that the memcg kmem caches may remain even after the memory cgroups have been offlined. Therefore, it will be useful to show the status of each of memcg kmem caches. This patch introduces a new /memcg_slabinfo file which is somewhat similar to /proc/slabinfo in format, but lists only information about kmem caches that have child memcg kmem caches. Information available in /proc/slabinfo are not repeated in memcg_slabinfo. A portion of a sample output of the file was: # rpc_inode_cache root 13 51 1 1 rpc_inode_cache 48 0 0 0 0 fat_inode_cache root 1 45 1 1 fat_inode_cache 41 2 45 1 1 xfs_inode root 770 816 24 24 xfs_inode 92 22 34 1 1 xfs_inode 88:dead 1 34 1 1 xfs_inode 89:dead 23 34 1 1 xfs_inode 85 4 34 1 1 xfs_inode 84 9 34 1 1 The css id of the memcg is also listed. If a memcg is not online, the tag ":dead" will be attached as shown above. 
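The file is created at the debugfs root (debugfs_create_file() is passed a NULL parent), so with debugfs mounted at the usual /sys/kernel/debug it can be read with, for example:

    # cat /sys/kernel/debug/memcg_slabinfo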
[longman@redhat.com: memcg: add ":deact" tag for reparented kmem caches in memcg_slabinfo] Link: http://lkml.kernel.org/r/20190621173005.31514-1-longman@redhat.com [longman@redhat.com: set the flag in the common code as suggested by Roman] Link: http://lkml.kernel.org/r/20190627184324.5875-1-longman@redhat.com Link: http://lkml.kernel.org/r/20190619171621.26209-1-longman@redhat.com Signed-off-by: Waiman Long Suggested-by: Shakeel Butt Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 4 +++ mm/slab_common.c | 60 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/include/linux/slab.h b/include/linux/slab.h index fd0ef2e16178..56c9c7eed34e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,6 +116,10 @@ /* Objects are reclaimable */ #define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U) #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ + +/* Slab deactivation flag */ +#define SLAB_DEACTIVATED ((slab_flags_t __force)0x10000000U) + /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * diff --git a/mm/slab_common.c b/mm/slab_common.c index b893eefb6229..6c49dbb3769e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -770,6 +771,7 @@ static void kmemcg_cache_deactivate(struct kmem_cache *s) return; __kmemcg_cache_deactivate(s); + s->flags |= SLAB_DEACTIVATED; /* * memcg_kmem_wq_lock is used to synchronize memcg_params.dying @@ -1521,6 +1523,64 @@ static int __init slab_proc_init(void) return 0; } module_init(slab_proc_init); + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_MEMCG_KMEM) +/* + * Display information about kmem caches that have child memcg caches. + */ +static int memcg_slabinfo_show(struct seq_file *m, void *unused) +{ + struct kmem_cache *s, *c; + struct slabinfo sinfo; + + mutex_lock(&slab_mutex); + seq_puts(m, "# "); + seq_puts(m, " \n"); + list_for_each_entry(s, &slab_root_caches, root_caches_node) { + /* + * Skip kmem caches that don't have any memcg children. 
+ */ + if (list_empty(&s->memcg_params.children)) + continue; + + memset(&sinfo, 0, sizeof(sinfo)); + get_slabinfo(s, &sinfo); + seq_printf(m, "%-17s root %6lu %6lu %6lu %6lu\n", + cache_name(s), sinfo.active_objs, sinfo.num_objs, + sinfo.active_slabs, sinfo.num_slabs); + + for_each_memcg_cache(c, s) { + struct cgroup_subsys_state *css; + char *status = ""; + + css = &c->memcg_params.memcg->css; + if (!(css->flags & CSS_ONLINE)) + status = ":dead"; + else if (c->flags & SLAB_DEACTIVATED) + status = ":deact"; + + memset(&sinfo, 0, sizeof(sinfo)); + get_slabinfo(c, &sinfo); + seq_printf(m, "%-17s %4d%-6s %6lu %6lu %6lu %6lu\n", + cache_name(c), css->id, status, + sinfo.active_objs, sinfo.num_objs, + sinfo.active_slabs, sinfo.num_slabs); + } + } + mutex_unlock(&slab_mutex); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(memcg_slabinfo); + +static int __init memcg_slabinfo_init(void) +{ + debugfs_create_file("memcg_slabinfo", S_IFREG | S_IRUGO, + NULL, NULL, &memcg_slabinfo_fops); + return 0; +} + +late_initcall(memcg_slabinfo_init); +#endif /* CONFIG_DEBUG_FS && CONFIG_MEMCG_KMEM */ #endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */ static __always_inline void *__do_krealloc(const void *p, size_t new_size, From f455c854877dcce5d714b00203ea804bf601fb02 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:56:41 -0700 Subject: [PATCH 083/147] mm: use untagged_addr() for get_user_pages_fast addresses Patch series "switch the remaining architectures to use generic GUP", v4. A series to switch mips, sh and sparc64 to use the generic GUP code so that we only have one codebase to touch for further improvements to this code. This patch (of 16): This will allow sparc64, or any future architecture with memory tagging to override its tags for get_user_pages and get_user_pages_fast. Link: http://lkml.kernel.org/r/20190625143715.1689-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Khalid Aziz Reviewed-by: Jason Gunthorpe Cc: Paul Burton Cc: James Hogan Cc: Yoshinori Sato Cc: Rich Felker Cc: David Miller Cc: Nicholas Piggin Cc: Khalid Aziz Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index f1c1daebc425..fc704dc37914 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2146,7 +2146,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, unsigned long flags; int nr = 0; - start &= PAGE_MASK; + start = untagged_addr(start) & PAGE_MASK; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; @@ -2219,7 +2219,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned long addr, len, end; int nr = 0, ret = 0; - start &= PAGE_MASK; + start = untagged_addr(start) & PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; From 26f4c328079d78d6cd0462c53c14ec0b69f4748e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:56:45 -0700 Subject: [PATCH 084/147] mm: simplify gup_fast_permitted Pass in the already calculated end value instead of recomputing it, and leave the end > start check in the callers instead of duplicating them in the arch code. 
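The resulting split of responsibilities, in outline (a condensed sketch based on the diff below; architectures such as s390 and x86 keep only their own range checks in their overrides):

    /* generic fallback in mm/gup.c: no architecture-specific restriction */
    static bool gup_fast_permitted(unsigned long start, unsigned long end)
    {
        return true;
    }

    /* callers compute the range and reject overflow themselves */
    len = (unsigned long)nr_pages << PAGE_SHIFT;
    end = start + len;
    if (end <= start)
        return 0;
    if (gup_fast_permitted(start, end)) {
        /* ... walk the page tables ... */
    }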
Link: http://lkml.kernel.org/r/20190625143715.1689-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 8 +------- arch/x86/include/asm/pgtable_64.h | 8 +------- mm/gup.c | 17 +++++++---------- 3 files changed, 9 insertions(+), 24 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 9f0195d5fa16..9b274fcaacb6 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1270,14 +1270,8 @@ static inline pte_t *pte_offset(pmd_t *pmd, unsigned long address) #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) #define pte_unmap(pte) do { } while (0) -static inline bool gup_fast_permitted(unsigned long start, int nr_pages) +static inline bool gup_fast_permitted(unsigned long start, unsigned long end) { - unsigned long len, end; - - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if (end < start) - return false; return end <= current->mm->context.asce_limit; } #define gup_fast_permitted gup_fast_permitted diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 0bb566315621..4990d26dfc73 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -259,14 +259,8 @@ extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); #define gup_fast_permitted gup_fast_permitted -static inline bool gup_fast_permitted(unsigned long start, int nr_pages) +static inline bool gup_fast_permitted(unsigned long start, unsigned long end) { - unsigned long len, end; - - len = (unsigned long)nr_pages << PAGE_SHIFT; - end = start + len; - if (end < start) - return false; if (end >> __VIRTUAL_MASK_SHIFT) return false; return true; diff --git a/mm/gup.c b/mm/gup.c index fc704dc37914..84891c06d485 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2123,13 +2123,9 @@ static void gup_pgd_range(unsigned long addr, unsigned long end, * Check if it's allowed to use __get_user_pages_fast() for the range, or * we need to fall back to the slow version: */ -bool gup_fast_permitted(unsigned long start, int nr_pages) +static bool gup_fast_permitted(unsigned long start, unsigned long end) { - unsigned long len, end; - - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - return end >= start; + return true; } #endif @@ -2150,6 +2146,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; + if (end <= start) + return 0; if (unlikely(!access_ok((void __user *)start, len))) return 0; @@ -2165,7 +2163,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * block IPIs that come from THPs splitting. */ - if (gup_fast_permitted(start, nr_pages)) { + if (gup_fast_permitted(start, end)) { local_irq_save(flags); gup_pgd_range(start, end, write ? 
FOLL_WRITE : 0, pages, &nr); local_irq_restore(flags); @@ -2224,13 +2222,12 @@ int get_user_pages_fast(unsigned long start, int nr_pages, len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; - if (nr_pages <= 0) + if (end <= start) return 0; - if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; - if (gup_fast_permitted(start, nr_pages)) { + if (gup_fast_permitted(start, end)) { local_irq_disable(); gup_pgd_range(addr, end, gup_flags, pages, &nr); local_irq_enable(); From 39656e83dab918861931ef96e5c41731b0899e56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:56:49 -0700 Subject: [PATCH 085/147] mm: lift the x86_32 PAE version of gup_get_pte to common code The split low/high access is the only non-READ_ONCE version of gup_get_pte that did show up in the various arch implemenations. Lift it to common code and drop the ifdef based arch override. Link: http://lkml.kernel.org/r/20190625143715.1689-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/pgtable-3level.h | 47 ------------------------ arch/x86/kvm/mmu.c | 2 +- mm/Kconfig | 3 ++ mm/gup.c | 53 ++++++++++++++++++++++++--- 5 files changed, 53 insertions(+), 53 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dce10b18f4bc..71c1f7864434 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -123,6 +123,7 @@ config X86 select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY + select GUP_GET_PTE_LOW_HIGH if X86_PAE select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index f8b1ad2c3828..e3633795fb22 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -285,53 +285,6 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) #define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \ __pteval_swp_offset(pte))) -#define gup_get_pte gup_get_pte -/* - * WARNING: only to be used in the get_user_pages_fast() implementation. - * - * With get_user_pages_fast(), we walk down the pagetables without taking - * any locks. For this we would like to load the pointers atomically, - * but that is not possible (without expensive cmpxchg8b) on PAE. What - * we do have is the guarantee that a PTE will only either go from not - * present to present, or present to not present or both -- it will not - * switch to a completely different present page without a TLB flush in - * between; something that we are blocking by holding interrupts off. - * - * Setting ptes from not present to present goes: - * - * ptep->pte_high = h; - * smp_wmb(); - * ptep->pte_low = l; - * - * And present to not present goes: - * - * ptep->pte_low = 0; - * smp_wmb(); - * ptep->pte_high = 0; - * - * We must ensure here that the load of pte_low sees 'l' iff pte_high - * sees 'h'. We load pte_high *after* loading pte_low, which ensures we - * don't see an older value of pte_high. *Then* we recheck pte_low, - * which ensures that we haven't picked up a changed pte high. 
We might - * have gotten rubbish values from pte_low and pte_high, but we are - * guaranteed that pte_low will not have the present bit set *unless* - * it is 'l'. Because get_user_pages_fast() only operates on present ptes - * we're safe. - */ -static inline pte_t gup_get_pte(pte_t *ptep) -{ - pte_t pte; - - do { - pte.pte_low = ptep->pte_low; - smp_rmb(); - pte.pte_high = ptep->pte_high; - smp_rmb(); - } while (unlikely(pte.pte_low != ptep->pte_low)); - - return pte; -} - #include #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 98f6e4f88b04..4a9c63d1c20a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -650,7 +650,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) /* * The idea using the light way get the spte on x86_32 guest is from - * gup_get_pte(arch/x86/mm/gup.c). + * gup_get_pte (mm/gup.c). * * An spte tlb flush may be pending, because kvm_set_pte_rmapp * coalesces them and we are running out of the MMU lock. Therefore diff --git a/mm/Kconfig b/mm/Kconfig index ef6efedc5921..53e2ca54b385 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -762,6 +762,9 @@ config GUP_BENCHMARK See tools/testing/selftests/vm/gup_benchmark.c +config GUP_GET_PTE_LOW_HIGH + bool + config ARCH_HAS_PTE_SPECIAL bool diff --git a/mm/gup.c b/mm/gup.c index 84891c06d485..2093283e4933 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1684,17 +1684,60 @@ struct page *get_dump_page(unsigned long addr) * This code is based heavily on the PowerPC implementation by Nick Piggin. */ #ifdef CONFIG_HAVE_GENERIC_GUP - -#ifndef gup_get_pte +#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH /* - * We assume that the PTE can be read atomically. If this is not the case for - * your architecture, please provide the helper. + * WARNING: only to be used in the get_user_pages_fast() implementation. + * + * With get_user_pages_fast(), we walk down the pagetables without taking any + * locks. For this we would like to load the pointers atomically, but sometimes + * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What + * we do have is the guarantee that a PTE will only either go from not present + * to present, or present to not present or both -- it will not switch to a + * completely different present page without a TLB flush in between; something + * that we are blocking by holding interrupts off. + * + * Setting ptes from not present to present goes: + * + * ptep->pte_high = h; + * smp_wmb(); + * ptep->pte_low = l; + * + * And present to not present goes: + * + * ptep->pte_low = 0; + * smp_wmb(); + * ptep->pte_high = 0; + * + * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. + * We load pte_high *after* loading pte_low, which ensures we don't see an older + * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't + * picked up a changed pte high. We might have gotten rubbish values from + * pte_low and pte_high, but we are guaranteed that pte_low will not have the + * present bit set *unless* it is 'l'. Because get_user_pages_fast() only + * operates on present ptes we're safe. + */ +static inline pte_t gup_get_pte(pte_t *ptep) +{ + pte_t pte; + + do { + pte.pte_low = ptep->pte_low; + smp_rmb(); + pte.pte_high = ptep->pte_high; + smp_rmb(); + } while (unlikely(pte.pte_low != ptep->pte_low)); + + return pte; +} +#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */ +/* + * We require that the PTE can be read atomically. 
*/ static inline pte_t gup_get_pte(pte_t *ptep) { return READ_ONCE(*ptep); } -#endif +#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) { From 446f062bf06c81de85edd279ee179715c83a4270 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:56:52 -0700 Subject: [PATCH 086/147] MIPS: use the generic get_user_pages_fast code The mips code is mostly equivalent to the generic one, minus various bugfixes and an arch override for gup_fast_permitted. Note that this defines ARCH_HAS_PTE_SPECIAL for mips as mips has pte_special and pte_mkspecial implemented and used in the existing gup code. They are no-op stubs, though which makes me a little unsure if this is really right thing to do. Note that this also adds back a missing cpu_has_dc_aliases check for __get_user_pages_fast, which the old code was only doing for get_user_pages_fast. This clearly looks like an oversight, as any condition that makes get_user_pages_fast unsafe also applies to __get_user_pages_fast. [hch@lst.de: MIPS: don't select ARCH_HAS_PTE_SPECIAL] Link: http://lkml.kernel.org/r/20190701151818.32227-3-hch@lst.de Link: http://lkml.kernel.org/r/20190625143715.1689-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Tested-by: Guenter Roeck Cc: Ralf Baechle Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/Kconfig | 2 + arch/mips/include/asm/pgtable.h | 3 + arch/mips/mm/Makefile | 1 - arch/mips/mm/gup.c | 303 -------------------------------- 4 files changed, 5 insertions(+), 304 deletions(-) delete mode 100644 arch/mips/mm/gup.c diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 70d3200476bf..51c05f669d57 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -34,6 +34,7 @@ config MIPS select GENERIC_SCHED_CLOCK if !CAVIUM_OCTEON_SOC select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL + select GUP_GET_PTE_LOW_HIGH if CPU_MIPS32 && PHYS_ADDR_T_64BIT select HANDLE_DOMAIN_IRQ select HAVE_ARCH_COMPILER_H select HAVE_ARCH_JUMP_LABEL @@ -55,6 +56,7 @@ config MIPS select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER + select HAVE_GENERIC_GUP select HAVE_IDE select HAVE_IOREMAP_PROT select HAVE_IRQ_EXIT_ON_IRQ_STACK diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 4ccb465ef3f2..7d27194e3b45 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -20,6 +20,7 @@ #include #include #include +#include struct mm_struct; struct vm_area_struct; @@ -626,6 +627,8 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#define gup_fast_permitted(start, end) (!cpu_has_dc_aliases) + #include /* diff --git a/arch/mips/mm/Makefile b/arch/mips/mm/Makefile index f34d7ff5eb60..1e8d335025d7 100644 --- a/arch/mips/mm/Makefile +++ b/arch/mips/mm/Makefile @@ -7,7 +7,6 @@ obj-y += cache.o obj-y += context.o obj-y += extable.o obj-y += fault.o -obj-y += gup.o obj-y += init.o obj-y += mmap.o obj-y += page.o diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c deleted file mode 100644 index 4c2b4483683c..000000000000 --- a/arch/mips/mm/gup.c +++ /dev/null @@ -1,303 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Lockless get_user_pages_fast for MIPS - * - 
* Copyright (C) 2008 Nick Piggin - * Copyright (C) 2008 Novell Inc. - * Copyright (C) 2011 Ralf Baechle - */ -#include -#include -#include -#include -#include -#include - -#include -#include - -static inline pte_t gup_get_pte(pte_t *ptep) -{ -#if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) - pte_t pte; - -retry: - pte.pte_low = ptep->pte_low; - smp_rmb(); - pte.pte_high = ptep->pte_high; - smp_rmb(); - if (unlikely(pte.pte_low != ptep->pte_low)) - goto retry; - - return pte; -#else - return READ_ONCE(*ptep); -#endif -} - -static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - pte_t *ptep = pte_offset_map(&pmd, addr); - do { - pte_t pte = gup_get_pte(ptep); - struct page *page; - - if (!pte_present(pte) || - pte_special(pte) || (write && !pte_write(pte))) { - pte_unmap(ptep); - return 0; - } - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); - get_page(page); - SetPageReferenced(page); - pages[*nr] = page; - (*nr)++; - - } while (ptep++, addr += PAGE_SIZE, addr != end); - - pte_unmap(ptep - 1); - return 1; -} - -static inline void get_head_page_multiple(struct page *page, int nr) -{ - VM_BUG_ON(page != compound_head(page)); - VM_BUG_ON(page_count(page) == 0); - page_ref_add(page, nr); - SetPageReferenced(page); -} - -static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - pte_t pte = *(pte_t *)&pmd; - struct page *head, *page; - int refs; - - if (write && !pte_write(pte)) - return 0; - /* hugepages are never "special" */ - VM_BUG_ON(pte_special(pte)); - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - refs = 0; - head = pte_page(pte); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - get_head_page_multiple(head, refs); - return 1; -} - -static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pmd_t *pmdp; - - pmdp = pmd_offset(&pud, addr); - do { - pmd_t pmd = *pmdp; - - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - return 0; - if (unlikely(pmd_huge(pmd))) { - if (!gup_huge_pmd(pmd, addr, next, write, pages,nr)) - return 0; - } else { - if (!gup_pte_range(pmd, addr, next, write, pages,nr)) - return 0; - } - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - pte_t pte = *(pte_t *)&pud; - struct page *head, *page; - int refs; - - if (write && !pte_write(pte)) - return 0; - /* hugepages are never "special" */ - VM_BUG_ON(pte_special(pte)); - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - refs = 0; - head = pte_page(pte); - page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - get_head_page_multiple(head, refs); - return 1; -} - -static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp; - - pudp = pud_offset(&pgd, addr); - do { - pud_t pud = *pudp; - - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (unlikely(pud_huge(pud))) { - if (!gup_huge_pud(pud, addr, next, write, pages,nr)) - return 0; - } else { - if 
(!gup_pmd_range(pud, addr, next, write, pages,nr)) - return 0; - } - } while (pudp++, addr = next, addr != end); - - return 1; -} - -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - unsigned long flags; - pgd_t *pgdp; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if (unlikely(!access_ok((void __user *)start, len))) - return 0; - - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch - * size will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. - */ - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables and pages from being freed. - * - * So long as we atomically load page table pointers versus teardown, - * we can follow the address down to the page and take a ref on it. - */ - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); - - return nr; -} - -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying pin behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. 
- */ -int get_user_pages_fast(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - pgd_t *pgdp; - int ret, nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - - end = start + len; - if (end < start || cpu_has_dc_aliases) - goto slow_irqon; - - /* XXX: batch / limit 'nr' */ - local_irq_disable(); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, - pages, &nr)) - goto slow; - } while (pgdp++, addr = next, addr != end); - local_irq_enable(); - - VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); - return nr; -slow: - local_irq_enable(); - -slow_irqon: - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - - ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, - pages, gup_flags); - - /* Have to be a bit careful with return values */ - if (nr > 0) { - if (ret < 0) - ret = nr; - else - ret += nr; - } - return ret; -} From 2f85e7f948a2c9f7e1524397d1818191ff5abb03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:56:56 -0700 Subject: [PATCH 087/147] sh: add the missing pud_page definition sh only had pud_page_vaddr, but not pud_page. [hch@lst.de: sh: stub out pud_page] Link: http://lkml.kernel.org/r/20190701151818.32227-2-hch@lst.de Link: http://lkml.kernel.org/r/20190625143715.1689-6-hch@lst.de Signed-off-by: Christoph Hellwig Tested-by: Guenter Roeck Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/asm/pgtable-3level.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/sh/include/asm/pgtable-3level.h b/arch/sh/include/asm/pgtable-3level.h index 7d8587eb65ff..779260b721ca 100644 --- a/arch/sh/include/asm/pgtable-3level.h +++ b/arch/sh/include/asm/pgtable-3level.h @@ -38,6 +38,9 @@ static inline unsigned long pud_page_vaddr(pud_t pud) return pud_val(pud); } +/* only used by the stubbed out hugetlb gup code, should never be called */ +#define pud_page(pud) NULL + #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { From 3c9b9accad9f25a52438c790f76b08d79051d383 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:00 -0700 Subject: [PATCH 088/147] sh: use the generic get_user_pages_fast code The sh code is mostly equivalent to the generic one, minus various bugfixes and two arch overrides that this patch adds to pgtable.h. 
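Both conversions work by letting the architecture export small hooks from its pgtable.h that the generic fast-GUP core picks up: MIPS supplies gup_fast_permitted() to veto the lockless walk on D-cache-aliasing CPUs, while sh supplies pte_access_permitted() (shown in the diff below). The hook plumbing is a plain weak-default macro. What follows is a minimal userspace sketch of that pattern, not kernel code: cpu_has_dc_aliases is reduced to a hard-coded stand-in and the fallback body is a deliberately trivial placeholder (the in-kernel default may differ).

---8<---
#include <stdbool.h>
#include <stdio.h>

/* stand-in for the MIPS cpu_has_dc_aliases test; purely illustrative */
#define cpu_has_dc_aliases 0

/* arch override, mirroring what the MIPS patch adds to asm/pgtable.h */
#define gup_fast_permitted(start, end) (!cpu_has_dc_aliases)

/*
 * Generic fallback: only compiled when the architecture did not provide
 * its own definition. The body here is a placeholder, not the kernel's.
 */
#ifndef gup_fast_permitted
static bool gup_fast_permitted(unsigned long start, unsigned long end)
{
	return end > start;
}
#endif

int main(void)
{
	unsigned long start = 0x10000, end = 0x30000;

	if (gup_fast_permitted(start, end))
		printf("fast walk allowed for [%#lx, %#lx)\n", start, end);
	else
		printf("falling back to the slow GUP path\n");
	return 0;
}
---8<---

Flipping the stand-in to 1 models an aliasing D-cache, where the override makes every get_user_pages_fast() call take the slow path instead.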
Link: http://lkml.kernel.org/r/20190625143715.1689-7-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/Kconfig | 2 + arch/sh/include/asm/pgtable.h | 37 +++++ arch/sh/mm/Makefile | 2 +- arch/sh/mm/gup.c | 277 ---------------------------------- 4 files changed, 40 insertions(+), 278 deletions(-) delete mode 100644 arch/sh/mm/gup.c diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index c7c99e18d5ff..02ae8132ebfe 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -15,6 +15,7 @@ config SUPERH select HAVE_ARCH_TRACEHOOK select HAVE_PERF_EVENTS select HAVE_DEBUG_BUGVERBOSE + select HAVE_GENERIC_GUP select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) select ARCH_HAS_GCOV_PROFILE_ALL @@ -64,6 +65,7 @@ config SUPERH config SUPERH32 def_bool "$(ARCH)" = "sh" select ARCH_32BIT_OFF_T + select GUP_GET_PTE_LOW_HIGH if X2TLB select HAVE_KPROBES select HAVE_KRETPROBES select HAVE_IOREMAP_PROT if MMU && !X2TLB diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 3587103afe59..9085d1142fa3 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -149,6 +149,43 @@ extern void paging_init(void); extern void page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd); +static inline bool __pte_access_permitted(pte_t pte, u64 prot) +{ + return (pte_val(pte) & (prot | _PAGE_SPECIAL)) == prot; +} + +#ifdef CONFIG_X2TLB +static inline bool pte_access_permitted(pte_t pte, bool write) +{ + u64 prot = _PAGE_PRESENT; + + prot |= _PAGE_EXT(_PAGE_EXT_KERN_READ | _PAGE_EXT_USER_READ); + if (write) + prot |= _PAGE_EXT(_PAGE_EXT_KERN_WRITE | _PAGE_EXT_USER_WRITE); + return __pte_access_permitted(pte, prot); +} +#elif defined(CONFIG_SUPERH64) +static inline bool pte_access_permitted(pte_t pte, bool write) +{ + u64 prot = _PAGE_PRESENT | _PAGE_USER | _PAGE_READ; + + if (write) + prot |= _PAGE_WRITE; + return __pte_access_permitted(pte, prot); +} +#else +static inline bool pte_access_permitted(pte_t pte, bool write) +{ + u64 prot = _PAGE_PRESENT | _PAGE_USER; + + if (write) + prot |= _PAGE_RW; + return __pte_access_permitted(pte, prot); +} +#endif + +#define pte_access_permitted pte_access_permitted + /* arch/sh/mm/mmap.c */ #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN diff --git a/arch/sh/mm/Makefile b/arch/sh/mm/Makefile index fbe5e79751b3..5051b38fd5b6 100644 --- a/arch/sh/mm/Makefile +++ b/arch/sh/mm/Makefile @@ -17,7 +17,7 @@ cacheops-$(CONFIG_CPU_SHX3) += cache-shx3.o obj-y += $(cacheops-y) mmu-y := nommu.o extable_32.o -mmu-$(CONFIG_MMU) := extable_$(BITS).o fault.o gup.o ioremap.o kmap.o \ +mmu-$(CONFIG_MMU) := extable_$(BITS).o fault.o ioremap.o kmap.o \ pgtable.o tlbex_$(BITS).o tlbflush_$(BITS).o obj-y += $(mmu-y) diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c deleted file mode 100644 index 277c882f7489..000000000000 --- a/arch/sh/mm/gup.c +++ /dev/null @@ -1,277 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Lockless get_user_pages_fast for SuperH - * - * Copyright (C) 2009 - 2010 Paul Mundt - * - * Cloned from the x86 and PowerPC versions, by: - * - * Copyright (C) 2008 Nick Piggin - * Copyright (C) 2008 Novell Inc. 
- */ -#include -#include -#include -#include -#include - -static inline pte_t gup_get_pte(pte_t *ptep) -{ -#ifndef CONFIG_X2TLB - return READ_ONCE(*ptep); -#else - /* - * With get_user_pages_fast, we walk down the pagetables without - * taking any locks. For this we would like to load the pointers - * atomically, but that is not possible with 64-bit PTEs. What - * we do have is the guarantee that a pte will only either go - * from not present to present, or present to not present or both - * -- it will not switch to a completely different present page - * without a TLB flush in between; something that we are blocking - * by holding interrupts off. - * - * Setting ptes from not present to present goes: - * ptep->pte_high = h; - * smp_wmb(); - * ptep->pte_low = l; - * - * And present to not present goes: - * ptep->pte_low = 0; - * smp_wmb(); - * ptep->pte_high = 0; - * - * We must ensure here that the load of pte_low sees l iff pte_high - * sees h. We load pte_high *after* loading pte_low, which ensures we - * don't see an older value of pte_high. *Then* we recheck pte_low, - * which ensures that we haven't picked up a changed pte high. We might - * have got rubbish values from pte_low and pte_high, but we are - * guaranteed that pte_low will not have the present bit set *unless* - * it is 'l'. And get_user_pages_fast only operates on present ptes, so - * we're safe. - * - * gup_get_pte should not be used or copied outside gup.c without being - * very careful -- it does not atomically load the pte or anything that - * is likely to be useful for you. - */ - pte_t pte; - -retry: - pte.pte_low = ptep->pte_low; - smp_rmb(); - pte.pte_high = ptep->pte_high; - smp_rmb(); - if (unlikely(pte.pte_low != ptep->pte_low)) - goto retry; - - return pte; -#endif -} - -/* - * The performance critical leaf functions are made noinline otherwise gcc - * inlines everything into a single function which results in too much - * register pressure. 
- */ -static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - u64 mask, result; - pte_t *ptep; - -#ifdef CONFIG_X2TLB - result = _PAGE_PRESENT | _PAGE_EXT(_PAGE_EXT_KERN_READ | _PAGE_EXT_USER_READ); - if (write) - result |= _PAGE_EXT(_PAGE_EXT_KERN_WRITE | _PAGE_EXT_USER_WRITE); -#elif defined(CONFIG_SUPERH64) - result = _PAGE_PRESENT | _PAGE_USER | _PAGE_READ; - if (write) - result |= _PAGE_WRITE; -#else - result = _PAGE_PRESENT | _PAGE_USER; - if (write) - result |= _PAGE_RW; -#endif - - mask = result | _PAGE_SPECIAL; - - ptep = pte_offset_map(&pmd, addr); - do { - pte_t pte = gup_get_pte(ptep); - struct page *page; - - if ((pte_val(pte) & mask) != result) { - pte_unmap(ptep); - return 0; - } - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); - get_page(page); - __flush_anon_page(page, addr); - flush_dcache_page(page); - pages[*nr] = page; - (*nr)++; - - } while (ptep++, addr += PAGE_SIZE, addr != end); - pte_unmap(ptep - 1); - - return 1; -} - -static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pmd_t *pmdp; - - pmdp = pmd_offset(&pud, addr); - do { - pmd_t pmd = *pmdp; - - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - return 0; - if (!gup_pte_range(pmd, addr, next, write, pages, nr)) - return 0; - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp; - - pudp = pud_offset(&pgd, addr); - do { - pud_t pud = *pudp; - - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (!gup_pmd_range(pud, addr, next, write, pages, nr)) - return 0; - } while (pudp++, addr = next, addr != end); - - return 1; -} - -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - unsigned long flags; - pgd_t *pgdp; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if (unlikely(!access_ok((void __user *)start, len))) - return 0; - - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables and pages from being freed. - */ - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); - - return nr; -} - -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying pin behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * - * Returns number of pages pinned. This may be fewer than the number - * requested. 
If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int get_user_pages_fast(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - pgd_t *pgdp; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - - end = start + len; - if (end < start) - goto slow_irqon; - - local_irq_disable(); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, - pages, &nr)) - goto slow; - } while (pgdp++, addr = next, addr != end); - local_irq_enable(); - - VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); - return nr; - - { - int ret; - -slow: - local_irq_enable(); -slow_irqon: - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - - ret = get_user_pages_unlocked(start, - (end - start) >> PAGE_SHIFT, pages, - gup_flags); - - /* Have to be a bit careful with return values */ - if (nr > 0) { - if (ret < 0) - ret = nr; - else - ret += nr; - } - - return ret; - } -} From d85507901f6a0a4fbd04cbdd567f4798a695b6d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:03 -0700 Subject: [PATCH 089/147] sparc64: add the missing pgd_page definition sparc64 only had pgd_page_vaddr, but not pgd_page. [hch@lst.de: fix sparc64 build] Link: http://lkml.kernel.org/r/20190626131318.GA5101@lst.de Link: http://lkml.kernel.org/r/20190625143715.1689-8-hch@lst.de Signed-off-by: Christoph Hellwig Cc: David Miller Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_64.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 22500c3be7a9..439129af9266 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -864,6 +864,9 @@ static inline unsigned long pud_page_vaddr(pud_t pud) #define pgd_present(pgd) (pgd_val(pgd) != 0U) #define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL) +/* only used by the stubbed out hugetlb gup code, should never be called */ +#define pgd_page(pgd) NULL + static inline unsigned long pud_large(pud_t pud) { pte_t pte = __pte(pud_val(pud)); From 5875509d2f30eb8963163f255ded98095989142f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:07 -0700 Subject: [PATCH 090/147] sparc64: define untagged_addr() Add a helper to untag a user pointer. This is needed for ADI support in get_user_pages_fast. 
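The helper strips the ADI version tag by shifting it out the top of the address and shifting back arithmetically, so the bit just below the tag field is sign-extended across it: a tagged user address comes back with the tag cleared, while the canonical form of kernel addresses is preserved. Below is a small userspace model of that arithmetic only; the tag width is fixed at an assumed 4 bits for illustration, whereas the kernel queries it via adi_nbits().

---8<---
#include <stdio.h>

#define TAG_BITS 4	/* assumed tag width, illustration only; the kernel asks adi_nbits() */

static unsigned long untag(unsigned long start)
{
	/*
	 * Shift the tag out the top with unsigned arithmetic, then shift
	 * back as a signed value so the bit below the tag is sign-extended
	 * over the tag field (implementation-defined in ISO C, but this is
	 * what the relevant compilers do).
	 */
	return (unsigned long)((long)(start << TAG_BITS) >> TAG_BITS);
}

int main(void)
{
	/* version tag 0x5 in the top nibble; assumes a 64-bit long */
	unsigned long tagged = 0x5000701234568000UL;

	printf("tagged:   %016lx\n", tagged);
	printf("untagged: %016lx\n", untag(tagged));	/* tag nibble cleared */
	return 0;
}
---8<---

For a negative (kernel) address the same shift pair fills the tag field with ones, which is the sign-extension behaviour the comment in the patch describes.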
Link: http://lkml.kernel.org/r/20190625143715.1689-9-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Khalid Aziz Cc: David Miller Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: James Hogan Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_64.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 439129af9266..8358ba00aa5d 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1078,6 +1078,28 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, } #define io_remap_pfn_range io_remap_pfn_range +static inline unsigned long untagged_addr(unsigned long start) +{ + if (adi_capable()) { + long addr = start; + + /* If userspace has passed a versioned address, kernel + * will not find it in the VMAs since it does not store + * the version tags in the list of VMAs. Storing version + * tags in list of VMAs is impractical since they can be + * changed any time from userspace without dropping into + * kernel. Any address search in VMAs will be done with + * non-versioned addresses. Ensure the ADI version bits + * are dropped here by sign extending the last bit before + * ADI bits. IOMMU does not implement version tags. + */ + return (addr << (long)adi_nbits()) >> (long)adi_nbits(); + } + + return start; +} +#define untagged_addr untagged_addr + #include #include From 7b9afb86b6328f10dc2cad9223d7def12d60e505 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:11 -0700 Subject: [PATCH 091/147] sparc64: use the generic get_user_pages_fast code The sparc64 code is mostly equivalent to the generic one, minus various bugfixes and two arch overrides that this patch adds to pgtable.h. 
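One of those overrides, pte_access_permitted(), reduces the per-PTE check to a single mask-and-compare: the required bits (present, privileged, optionally writable) are folded into prot, _PAGE_SPECIAL is added to the mask so it must be clear, and the PTE passes only if exactly the required bits survive. The same idiom appears in the sh override earlier in the series and in the gup_pte_range() loops being deleted. Here is a sketch of the idea with made-up bit values, not the sparc64 (or any real) PTE encoding:

---8<---
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* illustrative bit layout only */
#define _PAGE_PRESENT	0x1ULL
#define _PAGE_WRITE	0x2ULL
#define _PAGE_SPECIAL	0x4ULL

static bool pte_access_permitted(uint64_t pte, bool write)
{
	uint64_t prot = _PAGE_PRESENT;

	if (write)
		prot |= _PAGE_WRITE;
	/*
	 * Masking with (prot | _PAGE_SPECIAL) and comparing against prot
	 * demands every required bit set and _PAGE_SPECIAL clear, in one
	 * comparison.
	 */
	return (pte & (prot | _PAGE_SPECIAL)) == prot;
}

int main(void)
{
	printf("%d\n", pte_access_permitted(_PAGE_PRESENT | _PAGE_WRITE, true));    /* 1 */
	printf("%d\n", pte_access_permitted(_PAGE_PRESENT, true));                   /* 0: not writable */
	printf("%d\n", pte_access_permitted(_PAGE_PRESENT | _PAGE_SPECIAL, false));  /* 0: special PTE */
	return 0;
}
---8<---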
Link: http://lkml.kernel.org/r/20190625143715.1689-10-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Khalid Aziz Cc: David Miller Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: James Hogan Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/Kconfig | 1 + arch/sparc/include/asm/pgtable_64.h | 18 ++ arch/sparc/mm/Makefile | 2 +- arch/sparc/mm/gup.c | 340 ---------------------------- 4 files changed, 20 insertions(+), 341 deletions(-) delete mode 100644 arch/sparc/mm/gup.c diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 8fc95c7809ef..c59464755764 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -28,6 +28,7 @@ config SPARC select RTC_DRV_M48T59 select RTC_SYSTOHC select HAVE_ARCH_JUMP_LABEL if SPARC64 + select HAVE_GENERIC_GUP if SPARC64 select GENERIC_IRQ_SHOW select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_PCI_IOMAP diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 8358ba00aa5d..1599de730532 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1100,6 +1100,24 @@ static inline unsigned long untagged_addr(unsigned long start) } #define untagged_addr untagged_addr +static inline bool pte_access_permitted(pte_t pte, bool write) +{ + u64 prot; + + if (tlb_type == hypervisor) { + prot = _PAGE_PRESENT_4V | _PAGE_P_4V; + if (write) + prot |= _PAGE_WRITE_4V; + } else { + prot = _PAGE_PRESENT_4U | _PAGE_P_4U; + if (write) + prot |= _PAGE_WRITE_4U; + } + + return (pte_val(pte) & (prot | _PAGE_SPECIAL)) == prot; +} +#define pte_access_permitted pte_access_permitted + #include #include diff --git a/arch/sparc/mm/Makefile b/arch/sparc/mm/Makefile index d39075b1e3b7..b078205b70e0 100644 --- a/arch/sparc/mm/Makefile +++ b/arch/sparc/mm/Makefile @@ -5,7 +5,7 @@ asflags-y := -ansi ccflags-y := -Werror -obj-$(CONFIG_SPARC64) += ultra.o tlb.o tsb.o gup.o +obj-$(CONFIG_SPARC64) += ultra.o tlb.o tsb.o obj-y += fault_$(BITS).o obj-y += init_$(BITS).o obj-$(CONFIG_SPARC32) += extable.o srmmu.o iommu.o io-unit.o diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c deleted file mode 100644 index 1e770a517d4a..000000000000 --- a/arch/sparc/mm/gup.c +++ /dev/null @@ -1,340 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Lockless get_user_pages_fast for sparc, cribbed from powerpc - * - * Copyright (C) 2008 Nick Piggin - * Copyright (C) 2008 Novell Inc. - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * The performance critical leaf functions are made noinline otherwise gcc - * inlines everything into a single function which results in too much - * register pressure. 
- */ -static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long mask, result; - pte_t *ptep; - - if (tlb_type == hypervisor) { - result = _PAGE_PRESENT_4V|_PAGE_P_4V; - if (write) - result |= _PAGE_WRITE_4V; - } else { - result = _PAGE_PRESENT_4U|_PAGE_P_4U; - if (write) - result |= _PAGE_WRITE_4U; - } - mask = result | _PAGE_SPECIAL; - - ptep = pte_offset_kernel(&pmd, addr); - do { - struct page *page, *head; - pte_t pte = *ptep; - - if ((pte_val(pte) & mask) != result) - return 0; - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - /* The hugepage case is simplified on sparc64 because - * we encode the sub-page pfn offsets into the - * hugepage PTEs. We could optimize this in the future - * use page_cache_add_speculative() for the hugepage case. - */ - page = pte_page(pte); - head = compound_head(page); - if (!page_cache_get_speculative(head)) - return 0; - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - put_page(head); - return 0; - } - - pages[*nr] = page; - (*nr)++; - } while (ptep++, addr += PAGE_SIZE, addr != end); - - return 1; -} - -static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, - int *nr) -{ - struct page *head, *page; - int refs; - - if (!(pmd_val(pmd) & _PAGE_VALID)) - return 0; - - if (write && !pmd_write(pmd)) - return 0; - - refs = 0; - page = pmd_page(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - head = compound_head(page); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) { - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - return 1; -} - -static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr, - unsigned long end, int write, struct page **pages, - int *nr) -{ - struct page *head, *page; - int refs; - - if (!(pud_val(pud) & _PAGE_VALID)) - return 0; - - if (write && !pud_write(pud)) - return 0; - - refs = 0; - page = pud_page(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - head = compound_head(page); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pud_val(pud) != pud_val(*pudp))) { - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - return 1; -} - -static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pmd_t *pmdp; - - pmdp = pmd_offset(&pud, addr); - do { - pmd_t pmd = *pmdp; - - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - return 0; - if (unlikely(pmd_large(pmd))) { - if (!gup_huge_pmd(pmdp, pmd, addr, next, - write, pages, nr)) - return 0; - } else if (!gup_pte_range(pmd, addr, next, write, - pages, nr)) - return 0; - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp; - - pudp = pud_offset(&pgd, addr); - do { - pud_t pud = *pudp; - - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (unlikely(pud_large(pud))) { - if (!gup_huge_pud(pudp, pud, addr, next, - write, pages, nr)) 
- return 0; - } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) - return 0; - } while (pudp++, addr = next, addr != end); - - return 1; -} - -/* - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next, flags; - pgd_t *pgdp; - int nr = 0; - -#ifdef CONFIG_SPARC64 - if (adi_capable()) { - long addr = start; - - /* If userspace has passed a versioned address, kernel - * will not find it in the VMAs since it does not store - * the version tags in the list of VMAs. Storing version - * tags in list of VMAs is impractical since they can be - * changed any time from userspace without dropping into - * kernel. Any address search in VMAs will be done with - * non-versioned addresses. Ensure the ADI version bits - * are dropped here by sign extending the last bit before - * ADI bits. IOMMU does not implement version tags. - */ - addr = (addr << (long)adi_nbits()) >> (long)adi_nbits(); - start = addr; - } -#endif - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); - - return nr; -} - -int get_user_pages_fast(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - pgd_t *pgdp; - int nr = 0; - -#ifdef CONFIG_SPARC64 - if (adi_capable()) { - long addr = start; - - /* If userspace has passed a versioned address, kernel - * will not find it in the VMAs since it does not store - * the version tags in the list of VMAs. Storing version - * tags in list of VMAs is impractical since they can be - * changed any time from userspace without dropping into - * kernel. Any address search in VMAs will be done with - * non-versioned addresses. Ensure the ADI version bits - * are dropped here by sign extending the last bit before - * ADI bits. IOMMU does not implements version tags, - */ - addr = (addr << (long)adi_nbits()) >> (long)adi_nbits(); - start = addr; - } -#endif - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch size - * will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. - */ - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables from being freed on sparc. - * - * So long as we atomically load page table pointers versus teardown, - * we can follow the address down to the the page and take a ref on it. 
- */ - local_irq_disable(); - - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, - pages, &nr)) - goto slow; - } while (pgdp++, addr = next, addr != end); - - local_irq_enable(); - - VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); - return nr; - - { - int ret; - -slow: - local_irq_enable(); - - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - - ret = get_user_pages_unlocked(start, - (end - start) >> PAGE_SHIFT, pages, - gup_flags); - - /* Have to be a bit careful with return values */ - if (nr > 0) { - if (ret < 0) - ret = nr; - else - ret += nr; - } - - return ret; - } -} From 67a929e097b774c69253c8b61ef9eb8a42b463a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:14 -0700 Subject: [PATCH 092/147] mm: rename CONFIG_HAVE_GENERIC_GUP to CONFIG_HAVE_FAST_GUP We only support the generic GUP now, so rename the config option to be more clear, and always use the mm/Kconfig definition of the symbol and select it from the arch Kconfigs. Link: http://lkml.kernel.org/r/20190625143715.1689-11-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Khalid Aziz Reviewed-by: Jason Gunthorpe Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 5 +---- arch/arm64/Kconfig | 4 +--- arch/mips/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/s390/Kconfig | 2 +- arch/sh/Kconfig | 2 +- arch/sparc/Kconfig | 2 +- arch/x86/Kconfig | 4 +--- mm/Kconfig | 2 +- mm/gup.c | 4 ++-- 10 files changed, 11 insertions(+), 18 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 0c55aa199c67..2bf1ce39a96d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -75,6 +75,7 @@ config ARM select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU select HAVE_EXIT_THREAD + select HAVE_FAST_GUP if ARM_LPAE select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL && !CC_IS_CLANG select HAVE_FUNCTION_TRACER if !XIP_KERNEL @@ -1625,10 +1626,6 @@ config ARCH_SPARSEMEM_DEFAULT config HAVE_ARCH_PFN_VALID def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM -config HAVE_GENERIC_GUP - def_bool y - depends on ARM_LPAE - config HIGHMEM bool "High Memory Support" depends on MMU diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c085aec9459b..a36ff61321ce 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -143,6 +143,7 @@ config ARM64 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER @@ -267,9 +268,6 @@ config ZONE_DMA32 bool "Support DMA32 zone" if EXPERT default y -config HAVE_GENERIC_GUP - def_bool y - config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 51c05f669d57..7957d3457156 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -53,10 +53,10 @@ config MIPS select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_EXIT_THREAD + select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER 
select HAVE_FUNCTION_TRACER - select HAVE_GENERIC_GUP select HAVE_IDE select HAVE_IOREMAP_PROT select HAVE_IRQ_EXIT_ON_IRQ_STACK diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3b795a0cab62..959866c156de 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -185,12 +185,12 @@ config PPC select HAVE_DYNAMIC_FTRACE_WITH_REGS if MPROFILE_KERNEL select HAVE_EBPF_JIT if PPC64 select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU) + select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS if GCC_VERSION >= 50200 # plugin support on gcc <= 5.1 is buggy on PPC - select HAVE_GENERIC_GUP select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) select HAVE_IDE select HAVE_IOREMAP_PROT diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f089ae375f6b..5d8570ed6cab 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -139,6 +139,7 @@ config S390 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_FAST_GUP select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_FENTRY select HAVE_FTRACE_MCOUNT_RECORD @@ -146,7 +147,6 @@ config S390 select HAVE_FUNCTION_TRACER select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_GCC_PLUGINS - select HAVE_GENERIC_GUP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZ4 diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 02ae8132ebfe..31a7d12db705 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -15,7 +15,7 @@ config SUPERH select HAVE_ARCH_TRACEHOOK select HAVE_PERF_EVENTS select HAVE_DEBUG_BUGVERBOSE - select HAVE_GENERIC_GUP + select HAVE_FAST_GUP if MMU select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index c59464755764..e9f5d62e9817 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -28,7 +28,7 @@ config SPARC select RTC_DRV_M48T59 select RTC_SYSTOHC select HAVE_ARCH_JUMP_LABEL if SPARC64 - select HAVE_GENERIC_GUP if SPARC64 + select HAVE_FAST_GUP if SPARC64 select GENERIC_IRQ_SHOW select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_PCI_IOMAP diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 71c1f7864434..9df2d1cb7a9e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -159,6 +159,7 @@ config X86 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EISA select HAVE_EXIT_THREAD + select HAVE_FAST_GUP select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER @@ -2907,9 +2908,6 @@ config HAVE_ATOMIC_IOMAP config X86_DEV_DMA_OPS bool -config HAVE_GENERIC_GUP - def_bool y - source "drivers/firmware/Kconfig" source "arch/x86/kvm/Kconfig" diff --git a/mm/Kconfig b/mm/Kconfig index 53e2ca54b385..b5a258d62465 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -132,7 +132,7 @@ config HAVE_MEMBLOCK_NODE_MAP config HAVE_MEMBLOCK_PHYS_MAP bool -config HAVE_GENERIC_GUP +config HAVE_FAST_GUP bool config ARCH_KEEP_MEMBLOCK diff --git a/mm/gup.c b/mm/gup.c index 2093283e4933..2f69b30a11d3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1651,7 +1651,7 @@ struct page *get_dump_page(unsigned long addr) #endif /* CONFIG_ELF_CORE */ /* - * Generic Fast GUP + * Fast GUP * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. 
Thus the walker needs to be @@ -1683,7 +1683,7 @@ struct page *get_dump_page(unsigned long addr) * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ -#ifdef CONFIG_HAVE_GENERIC_GUP +#ifdef CONFIG_HAVE_FAST_GUP #ifdef CONFIG_GUP_GET_PTE_LOW_HIGH /* * WARNING: only to be used in the get_user_pages_fast() implementation. From d3649f68b4336e7ef7aa264cf05ba1265feb0968 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:18 -0700 Subject: [PATCH 093/147] mm: reorder code blocks in gup.c This moves the actually exported functions towards the end of the file, and reorders some functions to be in more logical blocks as a preparation for moving various stubs inline into the main functionality using IS_ENABLED(). Link: http://lkml.kernel.org/r/20190625143715.1689-12-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 410 +++++++++++++++++++++++++++---------------------------- 1 file changed, 205 insertions(+), 205 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 2f69b30a11d3..d36b82c05e79 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1100,86 +1100,6 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, return pages_done; } -/* - * We can leverage the VM_FAULT_RETRY functionality in the page fault - * paths better by using either get_user_pages_locked() or - * get_user_pages_unlocked(). - * - * get_user_pages_locked() is suitable to replace the form: - * - * down_read(&mm->mmap_sem); - * do_something() - * get_user_pages(tsk, mm, ..., pages, NULL); - * up_read(&mm->mmap_sem); - * - * to: - * - * int locked = 1; - * down_read(&mm->mmap_sem); - * do_something() - * get_user_pages_locked(tsk, mm, ..., pages, &locked); - * if (locked) - * up_read(&mm->mmap_sem); - */ -long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - int *locked) -{ - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. As there are no users of this flag in this call we simply - * disallow this option for now. - */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - - return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, NULL, locked, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(get_user_pages_locked); - -/* - * get_user_pages_unlocked() is suitable to replace the form: - * - * down_read(&mm->mmap_sem); - * get_user_pages(tsk, mm, ..., pages, NULL); - * up_read(&mm->mmap_sem); - * - * with: - * - * get_user_pages_unlocked(tsk, mm, ..., pages); - * - * It is functionally equivalent to get_user_pages_fast so - * get_user_pages_fast should be used instead if specific gup_flags - * (e.g. FOLL_FORCE) are not required. - */ -long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) -{ - struct mm_struct *mm = current->mm; - int locked = 1; - long ret; - - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. As there are no users of this flag in this call we simply - * disallow this option for now. 
- */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - - down_read(&mm->mmap_sem); - ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, - &locked, gup_flags | FOLL_TOUCH); - if (locked) - up_read(&mm->mmap_sem); - return ret; -} -EXPORT_SYMBOL(get_user_pages_unlocked); - /* * get_user_pages_remote() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or @@ -1256,6 +1176,153 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages_remote); +/** + * populate_vma_page_range() - populate a range of pages in the vma. + * @vma: target vma + * @start: start address + * @end: end address + * @nonblocking: + * + * This takes care of mlocking the pages too if VM_LOCKED is set. + * + * return 0 on success, negative error code on error. + * + * vma->vm_mm->mmap_sem must be held. + * + * If @nonblocking is NULL, it may be held for read or write and will + * be unperturbed. + * + * If @nonblocking is non-NULL, it must held for read only and may be + * released. If it's released, *@nonblocking will be set to 0. + */ +long populate_vma_page_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *nonblocking) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long nr_pages = (end - start) / PAGE_SIZE; + int gup_flags; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(end & ~PAGE_MASK); + VM_BUG_ON_VMA(start < vma->vm_start, vma); + VM_BUG_ON_VMA(end > vma->vm_end, vma); + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); + + gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; + if (vma->vm_flags & VM_LOCKONFAULT) + gup_flags &= ~FOLL_POPULATE; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) + gup_flags |= FOLL_WRITE; + + /* + * We want mlock to succeed for regions that have any permissions + * other than PROT_NONE. + */ + if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) + gup_flags |= FOLL_FORCE; + + /* + * We made sure addr is within a VMA, so the following will + * not result in a stack expansion that recurses back here. + */ + return __get_user_pages(current, mm, start, nr_pages, gup_flags, + NULL, NULL, nonblocking); +} + +/* + * __mm_populate - populate and/or mlock pages within a range of address space. + * + * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap + * flags. VMAs must be already marked with the desired vm_flags, and + * mmap_sem must not be held. + */ +int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +{ + struct mm_struct *mm = current->mm; + unsigned long end, nstart, nend; + struct vm_area_struct *vma = NULL; + int locked = 0; + long ret = 0; + + end = start + len; + + for (nstart = start; nstart < end; nstart = nend) { + /* + * We want to fault in pages for [nstart; end) address range. + * Find first corresponding VMA. + */ + if (!locked) { + locked = 1; + down_read(&mm->mmap_sem); + vma = find_vma(mm, nstart); + } else if (nstart >= vma->vm_end) + vma = vma->vm_next; + if (!vma || vma->vm_start >= end) + break; + /* + * Set [nstart; nend) to intersection of desired address + * range with the first VMA. Also, skip undesirable VMA types. 
+ */ + nend = min(end, vma->vm_end); + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + continue; + if (nstart < vma->vm_start) + nstart = vma->vm_start; + /* + * Now fault in a range of pages. populate_vma_page_range() + * double checks the vma flags, so that it won't mlock pages + * if the vma was already munlocked. + */ + ret = populate_vma_page_range(vma, nstart, nend, &locked); + if (ret < 0) { + if (ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } + break; + } + nend = nstart + ret * PAGE_SIZE; + ret = 0; + } + if (locked) + up_read(&mm->mmap_sem); + return ret; /* 0 or negative error code */ +} + +/** + * get_dump_page() - pin user page in memory while writing it to core dump + * @addr: user address + * + * Returns struct page pointer of user page pinned for dump, + * to be freed afterwards by put_page(). + * + * Returns NULL on any kind of failure - a hole must then be inserted into + * the corefile, to preserve alignment with its headers; and also returns + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - + * allowing a hole to be left in the corefile to save diskspace. + * + * Called without mmap_sem, but after all other threads have been killed. + */ +#ifdef CONFIG_ELF_CORE +struct page *get_dump_page(unsigned long addr) +{ + struct vm_area_struct *vma; + struct page *page; + + if (__get_user_pages(current, current->mm, addr, 1, + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, + NULL) < 1) + return NULL; + flush_cache_page(vma, addr, page_to_pfn(page)); + return page; +} +#endif /* CONFIG_ELF_CORE */ + #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) { @@ -1503,152 +1570,85 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); -/** - * populate_vma_page_range() - populate a range of pages in the vma. - * @vma: target vma - * @start: start address - * @end: end address - * @nonblocking: +/* + * We can leverage the VM_FAULT_RETRY functionality in the page fault + * paths better by using either get_user_pages_locked() or + * get_user_pages_unlocked(). * - * This takes care of mlocking the pages too if VM_LOCKED is set. + * get_user_pages_locked() is suitable to replace the form: * - * return 0 on success, negative error code on error. + * down_read(&mm->mmap_sem); + * do_something() + * get_user_pages(tsk, mm, ..., pages, NULL); + * up_read(&mm->mmap_sem); * - * vma->vm_mm->mmap_sem must be held. + * to: * - * If @nonblocking is NULL, it may be held for read or write and will - * be unperturbed. - * - * If @nonblocking is non-NULL, it must held for read only and may be - * released. If it's released, *@nonblocking will be set to 0. 
+ * int locked = 1; + * down_read(&mm->mmap_sem); + * do_something() + * get_user_pages_locked(tsk, mm, ..., pages, &locked); + * if (locked) + * up_read(&mm->mmap_sem); */ -long populate_vma_page_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, int *nonblocking) +long get_user_pages_locked(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + int *locked) { - struct mm_struct *mm = vma->vm_mm; - unsigned long nr_pages = (end - start) / PAGE_SIZE; - int gup_flags; - - VM_BUG_ON(start & ~PAGE_MASK); - VM_BUG_ON(end & ~PAGE_MASK); - VM_BUG_ON_VMA(start < vma->vm_start, vma); - VM_BUG_ON_VMA(end > vma->vm_end, vma); - VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); - - gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; - if (vma->vm_flags & VM_LOCKONFAULT) - gup_flags &= ~FOLL_POPULATE; /* - * We want to touch writable mappings with a write fault in order - * to break COW, except for shared mappings because these don't COW - * and we would not want to dirty them for nothing. + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. */ - if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) - gup_flags |= FOLL_WRITE; + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; - /* - * We want mlock to succeed for regions that have any permissions - * other than PROT_NONE. - */ - if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) - gup_flags |= FOLL_FORCE; - - /* - * We made sure addr is within a VMA, so the following will - * not result in a stack expansion that recurses back here. - */ - return __get_user_pages(current, mm, start, nr_pages, gup_flags, - NULL, NULL, nonblocking); + return __get_user_pages_locked(current, current->mm, start, nr_pages, + pages, NULL, locked, + gup_flags | FOLL_TOUCH); } +EXPORT_SYMBOL(get_user_pages_locked); /* - * __mm_populate - populate and/or mlock pages within a range of address space. + * get_user_pages_unlocked() is suitable to replace the form: * - * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap - * flags. VMAs must be already marked with the desired vm_flags, and - * mmap_sem must not be held. + * down_read(&mm->mmap_sem); + * get_user_pages(tsk, mm, ..., pages, NULL); + * up_read(&mm->mmap_sem); + * + * with: + * + * get_user_pages_unlocked(tsk, mm, ..., pages); + * + * It is functionally equivalent to get_user_pages_fast so + * get_user_pages_fast should be used instead if specific gup_flags + * (e.g. FOLL_FORCE) are not required. */ -int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags) { struct mm_struct *mm = current->mm; - unsigned long end, nstart, nend; - struct vm_area_struct *vma = NULL; - int locked = 0; - long ret = 0; + int locked = 1; + long ret; - end = start + len; + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; - for (nstart = start; nstart < end; nstart = nend) { - /* - * We want to fault in pages for [nstart; end) address range. 
- * Find first corresponding VMA. - */ - if (!locked) { - locked = 1; - down_read(&mm->mmap_sem); - vma = find_vma(mm, nstart); - } else if (nstart >= vma->vm_end) - vma = vma->vm_next; - if (!vma || vma->vm_start >= end) - break; - /* - * Set [nstart; nend) to intersection of desired address - * range with the first VMA. Also, skip undesirable VMA types. - */ - nend = min(end, vma->vm_end); - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - continue; - if (nstart < vma->vm_start) - nstart = vma->vm_start; - /* - * Now fault in a range of pages. populate_vma_page_range() - * double checks the vma flags, so that it won't mlock pages - * if the vma was already munlocked. - */ - ret = populate_vma_page_range(vma, nstart, nend, &locked); - if (ret < 0) { - if (ignore_errors) { - ret = 0; - continue; /* continue at next VMA */ - } - break; - } - nend = nstart + ret * PAGE_SIZE; - ret = 0; - } + down_read(&mm->mmap_sem); + ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, + &locked, gup_flags | FOLL_TOUCH); if (locked) up_read(&mm->mmap_sem); - return ret; /* 0 or negative error code */ + return ret; } - -/** - * get_dump_page() - pin user page in memory while writing it to core dump - * @addr: user address - * - * Returns struct page pointer of user page pinned for dump, - * to be freed afterwards by put_page(). - * - * Returns NULL on any kind of failure - a hole must then be inserted into - * the corefile, to preserve alignment with its headers; and also returns - * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - - * allowing a hole to be left in the corefile to save diskspace. - * - * Called without mmap_sem, but after all other threads have been killed. - */ -#ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) -{ - struct vm_area_struct *vma; - struct page *page; - - if (__get_user_pages(current, current->mm, addr, 1, - FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, - NULL) < 1) - return NULL; - flush_cache_page(vma, addr, page_to_pfn(page)); - return page; -} -#endif /* CONFIG_ELF_CORE */ +EXPORT_SYMBOL(get_user_pages_unlocked); /* * Fast GUP From 050a9adc64383aed3429a31432b4f5a7b0cdc8ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:21 -0700 Subject: [PATCH 094/147] mm: consolidate the get_user_pages* implementations Always build mm/gup.c so that we don't have to provide separate nommu stubs. Also merge the get_user_pages_fast and __get_user_pages_fast stubs when HAVE_FAST_GUP into the main implementations, which will never call the fast path if HAVE_FAST_GUP is not set. This also ensures the new put_user_pages* helpers are available for nommu, as those are currently missing, which would create a problem as soon as we actually grew users for it. 
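The merge works because the fast-path walker is now gated with IS_ENABLED(CONFIG_HAVE_FAST_GUP) inside the one implementation instead of being hidden behind #ifdef stubs: when the option is off the condition is a compile-time zero, the walker call is dead code the compiler drops, and the function simply falls through to the slow path. The following stripped-down model shows only the pattern; CONFIG_HAVE_FAST_GUP_DEMO, fast_walk() and pin_pages() are made-up stand-ins, and IS_ENABLED() is simplified relative to the kernel macro.

---8<---
#include <stdio.h>

#define CONFIG_HAVE_FAST_GUP_DEMO 1	/* set to 0 to model a !HAVE_FAST_GUP build */

/* the kernel's IS_ENABLED() also handles modules/undefined; this is enough here */
#define IS_ENABLED(option) (option)

#define PAGE_SHIFT 12

/* stand-in for gup_pgd_range(): pretend every page in the range was pinned */
static int fast_walk(unsigned long start, unsigned long end)
{
	return (int)((end - start) >> PAGE_SHIFT);
}

static int pin_pages(unsigned long start, int nr_pages)
{
	unsigned long end = start + ((unsigned long)nr_pages << PAGE_SHIFT);
	int nr = 0;

	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP_DEMO))
		nr = fast_walk(start, end);	/* eliminated entirely when the option is 0 */

	if (nr < nr_pages)
		printf("slow path would handle the remaining %d pages\n", nr_pages - nr);
	return nr;
}

int main(void)
{
	printf("pinned %d of 4 pages via the fast path\n", pin_pages(0x10000, 4));
	return 0;
}
---8<---

Keeping one always-built function this way is what lets the nommu stubs in mm/nommu.c and mm/util.c go away in the diff below.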
Link: http://lkml.kernel.org/r/20190625143715.1689-13-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 + mm/Makefile | 4 +-- mm/gup.c | 67 +++++++++++++++++++++++++++++++++++++--- mm/nommu.c | 88 ----------------------------------------------------- mm/util.c | 47 ---------------------------- 5 files changed, 65 insertions(+), 142 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index b5a258d62465..48840b28482b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -133,6 +133,7 @@ config HAVE_MEMBLOCK_PHYS_MAP bool config HAVE_FAST_GUP + depends on MMU bool config ARCH_KEEP_MEMBLOCK diff --git a/mm/Makefile b/mm/Makefile index ac5e5ba78874..dc0746ca1109 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -22,7 +22,7 @@ KCOV_INSTRUMENT_mmzone.o := n KCOV_INSTRUMENT_vmstat.o := n mmu-y := nommu.o -mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ pgtable-generic.o rmap.o vmalloc.o @@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ - debug.o $(mmu-y) + debug.o gup.o $(mmu-y) # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o diff --git a/mm/gup.c b/mm/gup.c index d36b82c05e79..cc5ddd8869b7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -134,6 +134,7 @@ void put_user_pages(struct page **pages, unsigned long npages) } EXPORT_SYMBOL(put_user_pages); +#ifdef CONFIG_MMU static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags) { @@ -1322,6 +1323,51 @@ struct page *get_dump_page(unsigned long addr) return page; } #endif /* CONFIG_ELF_CORE */ +#else /* CONFIG_MMU */ +static long __get_user_pages_locked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + struct vm_area_struct **vmas, int *locked, + unsigned int foll_flags) +{ + struct vm_area_struct *vma; + unsigned long vm_flags; + int i; + + /* calculate required read or write permissions. + * If FOLL_FORCE is set, we only require the "MAY" flags. + */ + vm_flags = (foll_flags & FOLL_WRITE) ? + (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= (foll_flags & FOLL_FORCE) ? + (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + + for (i = 0; i < nr_pages; i++) { + vma = find_vma(mm, start); + if (!vma) + goto finish_or_fault; + + /* protect what we can, including chardevs */ + if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || + !(vm_flags & vma->vm_flags)) + goto finish_or_fault; + + if (pages) { + pages[i] = virt_to_page(start); + if (pages[i]) + get_page(pages[i]); + } + if (vmas) + vmas[i] = vma; + start = (start + PAGE_SIZE) & PAGE_MASK; + } + + return i; + +finish_or_fault: + return i ? 
: -EFAULT; +} +#endif /* !CONFIG_MMU */ #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) @@ -1484,7 +1530,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk, { return nr_pages; } -#endif +#endif /* CONFIG_CMA */ /* * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which @@ -2160,6 +2206,12 @@ static void gup_pgd_range(unsigned long addr, unsigned long end, return; } while (pgdp++, addr = next, addr != end); } +#else +static inline void gup_pgd_range(unsigned long addr, unsigned long end, + unsigned int flags, struct page **pages, int *nr) +{ +} +#endif /* CONFIG_HAVE_FAST_GUP */ #ifndef gup_fast_permitted /* @@ -2177,6 +2229,9 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end) * the regular GUP. * Note a difference with get_user_pages_fast: this always returns the * number of pages pinned, 0 if no pages were pinned. + * + * If the architecture does not support this function, simply return with no + * pages pinned. */ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) @@ -2206,7 +2261,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * block IPIs that come from THPs splitting. */ - if (gup_fast_permitted(start, end)) { + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && + gup_fast_permitted(start, end)) { local_irq_save(flags); gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr); local_irq_restore(flags); @@ -2214,6 +2270,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr; } +EXPORT_SYMBOL_GPL(__get_user_pages_fast); static int __gup_longterm_unlocked(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) @@ -2270,7 +2327,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; - if (gup_fast_permitted(start, end)) { + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && + gup_fast_permitted(start, end)) { local_irq_disable(); gup_pgd_range(addr, end, gup_flags, pages, &nr); local_irq_enable(); @@ -2296,5 +2354,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, return ret; } - -#endif /* CONFIG_HAVE_GENERIC_GUP */ +EXPORT_SYMBOL_GPL(get_user_pages_fast); diff --git a/mm/nommu.c b/mm/nommu.c index d8c02fbe03b5..07165ad2e548 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -111,94 +111,6 @@ unsigned int kobjsize(const void *objp) return PAGE_SIZE << compound_order(page); } -static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int foll_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking) -{ - struct vm_area_struct *vma; - unsigned long vm_flags; - int i; - - /* calculate required read or write permissions. - * If FOLL_FORCE is set, we only require the "MAY" flags. - */ - vm_flags = (foll_flags & FOLL_WRITE) ? - (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= (foll_flags & FOLL_FORCE) ? 
- (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - - for (i = 0; i < nr_pages; i++) { - vma = find_vma(mm, start); - if (!vma) - goto finish_or_fault; - - /* protect what we can, including chardevs */ - if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || - !(vm_flags & vma->vm_flags)) - goto finish_or_fault; - - if (pages) { - pages[i] = virt_to_page(start); - if (pages[i]) - get_page(pages[i]); - } - if (vmas) - vmas[i] = vma; - start = (start + PAGE_SIZE) & PAGE_MASK; - } - - return i; - -finish_or_fault: - return i ? : -EFAULT; -} - -/* - * get a list of pages in an address range belonging to the specified process - * and indicate the VMA that covers each page - * - this is potentially dodgy as we may end incrementing the page count of a - * slab page or a secondary page from a compound page - * - don't permit access to VMAs that don't support it, such as I/O mappings - */ -long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) -{ - return __get_user_pages(current, current->mm, start, nr_pages, - gup_flags, pages, vmas, NULL); -} -EXPORT_SYMBOL(get_user_pages); - -long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - int *locked) -{ - return get_user_pages(start, nr_pages, gup_flags, pages, NULL); -} -EXPORT_SYMBOL(get_user_pages_locked); - -static long __get_user_pages_unlocked(struct task_struct *tsk, - struct mm_struct *mm, unsigned long start, - unsigned long nr_pages, struct page **pages, - unsigned int gup_flags) -{ - long ret; - down_read(&mm->mmap_sem); - ret = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages, - NULL, NULL); - up_read(&mm->mmap_sem); - return ret; -} - -long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) -{ - return __get_user_pages_unlocked(current, current->mm, start, nr_pages, - pages, gup_flags); -} -EXPORT_SYMBOL(get_user_pages_unlocked); - /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping diff --git a/mm/util.c b/mm/util.c index 9834c4ab7d8e..68575a315dc5 100644 --- a/mm/util.c +++ b/mm/util.c @@ -300,53 +300,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) } #endif -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. - * If the architecture does not support this function, simply return with no - * pages pinned. - */ -int __weak __get_user_pages_fast(unsigned long start, - int nr_pages, int write, struct page **pages) -{ - return 0; -} -EXPORT_SYMBOL_GPL(__get_user_pages_fast); - -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying pin behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * get_user_pages_fast provides equivalent functionality to get_user_pages, - * operating on current and current->mm, with force=0 and vma=NULL. However - * unlike get_user_pages, it must be called without mmap_sem held. - * - * get_user_pages_fast may take mmap_sem and page table locks, so no - * assumptions can be made about lack of locking. 
get_user_pages_fast is to be - * implemented in a way that is advantageous (vs get_user_pages()) when the - * user memory area is already faulted in and present in ptes. However if the - * pages have to be faulted in, it may turn out to be slightly slower so - * callers need to carefully consider what to use. On many architectures, - * get_user_pages_fast simply falls back to get_user_pages. - * - * Return: number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int __weak get_user_pages_fast(unsigned long start, - int nr_pages, unsigned int gup_flags, - struct page **pages) -{ - return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); -} -EXPORT_SYMBOL_GPL(get_user_pages_fast); - unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) From 817be129e6f254e5bd8c17b1da834c8f612dca28 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:25 -0700 Subject: [PATCH 095/147] mm: validate get_user_pages_fast flags We can only deal with FOLL_WRITE and/or FOLL_LONGTERM in get_user_pages_fast, so reject all other flags. Link: http://lkml.kernel.org/r/20190625143715.1689-14-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/gup.c b/mm/gup.c index cc5ddd8869b7..9d68cef2fa90 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2317,6 +2317,9 @@ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned long addr, len, end; int nr = 0, ret = 0; + if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM))) + return -EINVAL; + start = untagged_addr(start) & PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; From cbd34da7dc9afd521e0bea5e7d12701f4a9da7c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:28 -0700 Subject: [PATCH 096/147] mm: move the powerpc hugepd code to mm/gup.c While only powerpc supports the hugepd case, the code is pretty generic and I'd like to keep all GUP internals in one place. 
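The gup_huge_pd() walker that moves here steps through a huge page directory in sz-sized chunks, clamped to the end of the requested range. A small userspace sketch of that boundary arithmetic (the sizes and addresses below are made up for illustration):

#include <stdio.h>

/* Same rounding as hugepte_addr_end(): next sz-aligned boundary, capped at end. */
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long boundary = (addr + sz) & ~(sz - 1);

	return (boundary - 1 < end - 1) ? boundary : end;
}

int main(void)
{
	unsigned long sz = 16UL << 20;			/* 16 MB huge pages */
	unsigned long addr = (1UL << 30) + (4UL << 20);	/* start mid-page */
	unsigned long end = addr + (40UL << 20);	/* walk 40 MB */

	while (addr != end) {
		unsigned long next = hugepte_addr_end(addr, end, sz);

		printf("chunk %#lx-%#lx\n", addr, next);
		addr = next;
	}
	return 0;
}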
Link: http://lkml.kernel.org/r/20190625143715.1689-15-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig | 1 + arch/powerpc/mm/hugetlbpage.c | 72 ------------------------------ include/linux/hugetlb.h | 18 -------- mm/Kconfig | 10 +++++ mm/gup.c | 82 +++++++++++++++++++++++++++++++++++ 5 files changed, 93 insertions(+), 90 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 959866c156de..24a41f919309 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -125,6 +125,7 @@ config PPC select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV + select ARCH_HAS_HUGEPD if HUGETLB_PAGE select ARCH_HAS_MMIOWB if PPC64 select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API if PPC64 diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index b5d92dc32844..51716c11d0fb 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -511,13 +511,6 @@ retry: return page; } -static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, - unsigned long sz) -{ - unsigned long __boundary = (addr + sz) & ~(sz-1); - return (__boundary - 1 < end - 1) ? __boundary : end; -} - #ifdef CONFIG_PPC_MM_SLICES unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, @@ -665,68 +658,3 @@ void flush_dcache_icache_hugepage(struct page *page) } } } - -static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long pte_end; - struct page *head, *page; - pte_t pte; - int refs; - - pte_end = (addr + sz) & ~(sz-1); - if (pte_end < end) - end = pte_end; - - pte = READ_ONCE(*ptep); - - if (!pte_access_permitted(pte, write)) - return 0; - - /* hugepages are never "special" */ - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - refs = 0; - head = pte_page(pte); - - page = head + ((addr & (sz-1)) >> PAGE_SHIFT); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - /* Could be optimized better */ - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - return 1; -} - -int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift, - unsigned long end, int write, struct page **pages, int *nr) -{ - pte_t *ptep; - unsigned long sz = 1UL << hugepd_shift(hugepd); - unsigned long next; - - ptep = hugepte_offset(hugepd, addr, pdshift); - do { - next = hugepte_addr_end(addr, end, sz); - if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) - return 0; - } while (ptep++, addr = next, addr != end); - - return 1; -} diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f895a79c6f5c..edfca4278319 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -16,29 +16,11 @@ struct user_struct; struct mmu_gather; #ifndef is_hugepd -/* - * Some architectures requires a hugepage directory format that is - * required to support multiple hugepage sizes. 
For example - * a4fe3ce76 "powerpc/mm: Allow more flexible layouts for hugepage pagetables" - * introduced the same on powerpc. This allows for a more flexible hugepage - * pagetable layout. - */ typedef struct { unsigned long pd; } hugepd_t; #define is_hugepd(hugepd) (0) #define __hugepd(x) ((hugepd_t) { (x) }) -static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned pdshift, unsigned long end, - int write, struct page **pages, int *nr) -{ - return 0; -} -#else -extern int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned pdshift, unsigned long end, - int write, struct page **pages, int *nr); #endif - #ifdef CONFIG_HUGETLB_PAGE #include diff --git a/mm/Kconfig b/mm/Kconfig index 48840b28482b..0b4352557dd5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -769,4 +769,14 @@ config GUP_GET_PTE_LOW_HIGH config ARCH_HAS_PTE_SPECIAL bool +# +# Some architectures require a special hugepage directory format that is +# required to support multiple hugepage sizes. For example a4fe3ce76 +# "powerpc/mm: Allow more flexible layouts for hugepage pagetables" +# introduced it on powerpc. This allows for a more flexible hugepage +# pagetable layouts. +# +config ARCH_HAS_HUGEPD + bool + endmenu diff --git a/mm/gup.c b/mm/gup.c index 9d68cef2fa90..2f8bf7a71c74 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1966,6 +1966,88 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, } #endif +#ifdef CONFIG_ARCH_HAS_HUGEPD +static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, + unsigned long sz) +{ + unsigned long __boundary = (addr + sz) & ~(sz-1); + return (__boundary - 1 < end - 1) ? __boundary : end; +} + +static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long pte_end; + struct page *head, *page; + pte_t pte; + int refs; + + pte_end = (addr + sz) & ~(sz-1); + if (pte_end < end) + end = pte_end; + + pte = READ_ONCE(*ptep); + + if (!pte_access_permitted(pte, write)) + return 0; + + /* hugepages are never "special" */ + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + + page = head + ((addr & (sz-1)) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + /* Could be optimized better */ + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + return 1; +} + +static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, + unsigned int pdshift, unsigned long end, int write, + struct page **pages, int *nr) +{ + pte_t *ptep; + unsigned long sz = 1UL << hugepd_shift(hugepd); + unsigned long next; + + ptep = hugepte_offset(hugepd, addr, pdshift); + do { + next = hugepte_addr_end(addr, end, sz); + if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) + return 0; + } while (ptep++, addr = next, addr != end); + + return 1; +} +#else +static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, + unsigned pdshift, unsigned long end, int write, + struct page **pages, int *nr) +{ + return 0; +} +#endif /* CONFIG_ARCH_HAS_HUGEPD */ + static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { From 01a369160bbea43727aa2b99877f86ebddba9acc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 
Jul 2019 20:57:32 -0700 Subject: [PATCH 097/147] mm: switch gup_hugepte to use try_get_compound_head This applies the overflow fixes from 8fde12ca79aff ("mm: prevent get_user_pages() from overflowing page refcount") to the powerpc hugepd code and brings it back in sync with the other GUP cases. Link: http://lkml.kernel.org/r/20190625143715.1689-16-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index 2f8bf7a71c74..7763abd16405 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2006,7 +2006,8 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(head, refs); + if (!head) { *nr -= refs; return 0; } From 520b4a4496f12b117b94f3ac7c493651881c5fe3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:36 -0700 Subject: [PATCH 098/147] mm: mark the page referenced in gup_hugepte All other get_user_page_fast cases mark the page referenced, so do this here as well. Link: http://lkml.kernel.org/r/20190625143715.1689-17-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/gup.c b/mm/gup.c index 7763abd16405..83d480e9b05f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2020,6 +2020,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, return 0; } + SetPageReferenced(head); return 1; } From aa712399c1e8245c375a5c44760de684ec2ebefb Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Thu, 11 Jul 2019 20:57:39 -0700 Subject: [PATCH 099/147] mm/gup: speed up check_and_migrate_cma_pages() on huge page Both hugetlb and thp locate on the same migration type of pageblock, since they are allocated from a free_list[]. Based on this fact, it is enough to check on a single subpage to decide the migration type of the whole huge page. By this way, it saves (2M/4K - 1) times loop for pmd_huge on x86, similar on other archs. Furthermore, when executing isolate_huge_page(), it avoid taking global hugetlb_lock many times, and meanless remove/add to the local link list cma_page_list. [akpm@linux-foundation.org: make `i' and `step' unsigned] Link: http://lkml.kernel.org/r/1561612545-28997-1-git-send-email-kernelfans@gmail.com Signed-off-by: Pingfan Liu Reviewed-by: Andrew Morton Reviewed-by: Ira Weiny Cc: Mike Rapoport Cc: "Kirill A. 
Shutemov" Cc: Thomas Gleixner Cc: John Hubbard Cc: "Aneesh Kumar K.V" Cc: Christoph Hellwig Cc: Keith Busch Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 83d480e9b05f..f411bab037f5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1449,25 +1449,31 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk, struct vm_area_struct **vmas, unsigned int gup_flags) { - long i; + unsigned long i; + unsigned long step; bool drain_allow = true; bool migrate_allow = true; LIST_HEAD(cma_page_list); check_again: - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < nr_pages;) { + + struct page *head = compound_head(pages[i]); + + /* + * gup may start from a tail page. Advance step by the left + * part. + */ + step = (1 << compound_order(head)) - (pages[i] - head); /* * If we get a page from the CMA zone, since we are going to * be pinning these entries, we might as well move them out * of the CMA zone if possible. */ - if (is_migrate_cma_page(pages[i])) { - - struct page *head = compound_head(pages[i]); - - if (PageHuge(head)) { + if (is_migrate_cma_page(head)) { + if (PageHuge(head)) isolate_huge_page(head, &cma_page_list); - } else { + else { if (!PageLRU(head) && drain_allow) { lru_add_drain_all(); drain_allow = false; @@ -1482,6 +1488,8 @@ check_again: } } } + + i += step; } if (!list_empty(&cma_page_list)) { From b5d1c39f34d1c9bca0c4b9ae2e339fbbe264a9c7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 11 Jul 2019 20:57:43 -0700 Subject: [PATCH 100/147] mm/gup.c: remove some BUG_ONs from get_gate_page() If we end up without a PGD or PUD entry backing the gate area, don't BUG -- just fail gracefully. It's not entirely implausible that this could happen some day on x86. It doesn't right now even with an execute-only emulated vsyscall page because the fixmap shares the PUD, but the core mm code shouldn't rely on that particular detail to avoid OOPSing. Link: http://lkml.kernel.org/r/a1d9f4efb75b9d464e59fd6af00104b21c58f6f7.1561610798.git.luto@kernel.org Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Reviewed-by: Andrew Morton Cc: Florian Weimer Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index f411bab037f5..bb4ad57d20e3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -586,11 +586,14 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pgd = pgd_offset_k(address); else pgd = pgd_offset_gate(mm, address); - BUG_ON(pgd_none(*pgd)); + if (pgd_none(*pgd)) + return -EFAULT; p4d = p4d_offset(pgd, address); - BUG_ON(p4d_none(*p4d)); + if (p4d_none(*p4d)) + return -EFAULT; pud = pud_offset(p4d, address); - BUG_ON(pud_none(*pud)); + if (pud_none(*pud)) + return -EFAULT; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return -EFAULT; From 790c73690c2bbecb3f6f8becbdb11ddc9bcff8cc Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 11 Jul 2019 20:57:46 -0700 Subject: [PATCH 101/147] mm/gup.c: mark undo_dev_pagemap as __maybe_unused Several mips builds generate the following build warning. mm/gup.c:1788:13: warning: 'undo_dev_pagemap' defined but not used The function is declared unconditionally but only called from behind various ifdefs. Mark it __maybe_unused. 
Link: http://lkml.kernel.org/r/1562072523-22311-1-git-send-email-linux@roeck-us.net Signed-off-by: Guenter Roeck Reviewed-by: Andrew Morton Cc: Stephen Rothwell Cc: Robin Murphy Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index bb4ad57d20e3..43b7d875de37 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1796,7 +1796,8 @@ static inline pte_t gup_get_pte(pte_t *ptep) } #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ -static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) +static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, + struct page **pages) { while ((*nr) - nr_start) { struct page *page = pages[--(*nr)]; From 5fba4af4456b5d3f982d4ac1c879d16b36aaa0fb Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 11 Jul 2019 20:57:49 -0700 Subject: [PATCH 102/147] asm-generic, x86: introduce generic pte_{alloc,free}_one[_kernel] Most architectures have identical or very similar implementation of pte_alloc_one_kernel(), pte_alloc_one(), pte_free_kernel() and pte_free(). Add a generic implementation that can be reused across architectures and enable its use on x86. The generic implementation uses GFP_KERNEL | __GFP_ZERO for the kernel page tables and GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT for the user page tables. The "base" functions for PTE allocation, namely __pte_alloc_one_kernel() and __pte_alloc_one() are intended for the architectures that require additional actions after actual memory allocation or must use non-default GFP flags. x86 is switched to use generic pte_alloc_one_kernel(), pte_free_kernel() and pte_free(). x86 still implements pte_alloc_one() to allow run-time control of GFP flags required for "userpte" command line option. Link: http://lkml.kernel.org/r/1557296232-15361-2-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Cc: Albert Ou Cc: Anshuman Khandual Cc: Anton Ivanov Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Guan Xuetao Cc: Guo Ren Cc: Guo Ren Cc: Helge Deller Cc: Ley Foon Tan Cc: Matthew Wilcox Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Hocko Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Sam Creasey Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgalloc.h | 19 +----- arch/x86/mm/pgtable.c | 33 +++------- include/asm-generic/pgalloc.h | 107 +++++++++++++++++++++++++++++++-- 3 files changed, 115 insertions(+), 44 deletions(-) diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index a281e61ec60c..29aa7859bdee 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -6,6 +6,9 @@ #include /* for struct page */ #include +#define __HAVE_ARCH_PTE_ALLOC_ONE +#include /* for pte_{alloc,free}_one */ + static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; } #ifdef CONFIG_PARAVIRT_XXL @@ -47,24 +50,8 @@ extern gfp_t __userpte_alloc_gfp; extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); -extern pte_t *pte_alloc_one_kernel(struct mm_struct *); extern pgtable_t pte_alloc_one(struct mm_struct *); -/* Should really implement gc for free page table pages. This could be - done with a reference count in struct page. 
*/ - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, struct page *pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 1f67b1e15bf6..44816ff6411f 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -13,33 +13,17 @@ phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); #endif -#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) - #ifdef CONFIG_HIGHPTE -#define PGALLOC_USER_GFP __GFP_HIGHMEM +#define PGTABLE_HIGHMEM __GFP_HIGHMEM #else -#define PGALLOC_USER_GFP 0 +#define PGTABLE_HIGHMEM 0 #endif -gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; - -pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT); -} +gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; pgtable_t pte_alloc_one(struct mm_struct *mm) { - struct page *pte; - - pte = alloc_pages(__userpte_alloc_gfp, 0); - if (!pte) - return NULL; - if (!pgtable_page_ctor(pte)) { - __free_page(pte); - return NULL; - } - return pte; + return __pte_alloc_one(mm, __userpte_alloc_gfp); } static int __init setup_userpte(char *arg) @@ -235,7 +219,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; bool failed = false; - gfp_t gfp = PGALLOC_GFP; + gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; @@ -399,14 +383,14 @@ static inline pgd_t *_pgd_alloc(void) * We allocate one page for pgd. */ if (!SHARED_KERNEL_PMD) - return (pgd_t *)__get_free_pages(PGALLOC_GFP, + return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, PGD_ALLOCATION_ORDER); /* * Now PAE kernel is not running as a Xen domain. We can allocate * a 32-byte slab for pgd to save memory space. */ - return kmem_cache_alloc(pgd_cache, PGALLOC_GFP); + return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER); } static inline void _pgd_free(pgd_t *pgd) @@ -424,7 +408,8 @@ void __init pgd_cache_init(void) static inline pgd_t *_pgd_alloc(void) { - return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); + return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, + PGD_ALLOCATION_ORDER); } static inline void _pgd_free(pgd_t *pgd) diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 948714c1535a..8476175c07e7 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -1,13 +1,112 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_GENERIC_PGALLOC_H #define __ASM_GENERIC_PGALLOC_H -/* - * an empty file is enough for a nommu architecture - */ + #ifdef CONFIG_MMU -#error need to implement an architecture specific asm/pgalloc.h + +#define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO) +#define GFP_PGTABLE_USER (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT) + +/** + * __pte_alloc_one_kernel - allocate a page for PTE-level kernel page table + * @mm: the mm_struct of the current context + * + * This function is intended for architectures that need + * anything beyond simple page allocation. 
+ * + * Return: pointer to the allocated memory or %NULL on error + */ +static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm) +{ + return (pte_t *)__get_free_page(GFP_PGTABLE_KERNEL); +} + +#ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL +/** + * pte_alloc_one_kernel - allocate a page for PTE-level kernel page table + * @mm: the mm_struct of the current context + * + * Return: pointer to the allocated memory or %NULL on error + */ +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) +{ + return __pte_alloc_one_kernel(mm); +} #endif +/** + * pte_free_kernel - free PTE-level kernel page table page + * @mm: the mm_struct of the current context + * @pte: pointer to the memory containing the page table + */ +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + free_page((unsigned long)pte); +} + +/** + * __pte_alloc_one - allocate a page for PTE-level user page table + * @mm: the mm_struct of the current context + * @gfp: GFP flags to use for the allocation + * + * Allocates a page and runs the pgtable_page_ctor(). + * + * This function is intended for architectures that need + * anything beyond simple page allocation or must have custom GFP flags. + * + * Return: `struct page` initialized as page table or %NULL on error + */ +static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) +{ + struct page *pte; + + pte = alloc_page(gfp); + if (!pte) + return NULL; + if (!pgtable_page_ctor(pte)) { + __free_page(pte); + return NULL; + } + + return pte; +} + +#ifndef __HAVE_ARCH_PTE_ALLOC_ONE +/** + * pte_alloc_one - allocate a page for PTE-level user page table + * @mm: the mm_struct of the current context + * + * Allocates a page and runs the pgtable_page_ctor(). + * + * Return: `struct page` initialized as page table or %NULL on error + */ +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) +{ + return __pte_alloc_one(mm, GFP_PGTABLE_USER); +} +#endif + +/* + * Should really implement gc for free page table pages. This could be + * done with a reference count in struct page. + */ + +/** + * pte_free - free PTE-level user page table page + * @mm: the mm_struct of the current context + * @pte_page: the `struct page` representing the page table + */ +static inline void pte_free(struct mm_struct *mm, struct page *pte_page) +{ + pgtable_page_dtor(pte_page); + __free_page(pte_page); +} + +#else /* CONFIG_MMU */ + +/* This is enough for a nommu architecture */ #define check_pgt_cache() do { } while (0) +#endif /* CONFIG_MMU */ + #endif /* __ASM_GENERIC_PGALLOC_H */ From bc3ace9b520f97d5650d096a5f95cac3fa64e204 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 11 Jul 2019 20:57:53 -0700 Subject: [PATCH 103/147] alpha: switch to generic version of pte allocation alpha allocates PTE pages with __get_free_page() and uses GFP_KERNEL | __GFP_ZERO for the allocations. Switch it to the generic version that does exactly the same thing for the kernel page tables and adds __GFP_ACCOUNT for the user PTEs. The alpha pte_free() and pte_free_kernel() versions are identical to the generic ones and can be simply dropped. 
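This and the following conversions all rely on the same opt-in scheme from the new asm-generic header: define __HAVE_ARCH_PTE_ALLOC_ONE[_KERNEL] only for the hooks the architecture must override, include the header, and wrap __pte_alloc_one()/__pte_alloc_one_kernel() where extra work is needed. A sketch for a hypothetical architecture "foo" that only needs a cache maintenance hook in pte_alloc_one():

/* arch/foo/include/asm/pgalloc.h -- "foo" and foo_sync_pgtable() are made up. */
#define __HAVE_ARCH_PTE_ALLOC_ONE
#include <asm-generic/pgalloc.h>	/* for pte_{alloc,free}_one */

static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
{
	pgtable_t pte = __pte_alloc_one(mm, GFP_PGTABLE_USER);

	if (pte)
		foo_sync_pgtable(page_address(pte));	/* arch-specific cache hook */
	return pte;
}

/* pte_alloc_one_kernel(), pte_free_kernel() and pte_free() come from the
 * generic header unchanged. */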
Link: http://lkml.kernel.org/r/1557296232-15361-3-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Cc: Albert Ou Cc: Anshuman Khandual Cc: Anton Ivanov Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Guan Xuetao Cc: Guo Ren Cc: Guo Ren Cc: Helge Deller Cc: Ley Foon Tan Cc: Matthew Wilcox Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Hocko Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Sam Creasey Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/pgalloc.h | 40 +++----------------------------- 1 file changed, 3 insertions(+), 37 deletions(-) diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h index 02f9f91bb4f0..71ded3b7d82d 100644 --- a/arch/alpha/include/asm/pgalloc.h +++ b/arch/alpha/include/asm/pgalloc.h @@ -5,6 +5,8 @@ #include #include +#include /* for pte_{alloc,free}_one */ + /* * Allocate and free page tables. The xxx_kernel() versions are * used to allocate a kernel page table - this turns on ASN bits @@ -41,7 +43,7 @@ pgd_free(struct mm_struct *mm, pgd_t *pgd) static inline pmd_t * pmd_alloc_one(struct mm_struct *mm, unsigned long address) { - pmd_t *ret = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); + pmd_t *ret = (pmd_t *)__get_free_page(GFP_PGTABLE_USER); return ret; } @@ -51,42 +53,6 @@ pmd_free(struct mm_struct *mm, pmd_t *pmd) free_page((unsigned long)pmd); } -static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm) -{ - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); - return pte; -} - -static inline void -pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline pgtable_t -pte_alloc_one(struct mm_struct *mm) -{ - pte_t *pte = pte_alloc_one_kernel(mm); - struct page *page; - - if (!pte) - return NULL; - page = virt_to_page(pte); - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - return page; -} - -static inline void -pte_free(struct mm_struct *mm, pgtable_t page) -{ - pgtable_page_dtor(page); - __free_page(page); -} - #define check_pgt_cache() do { } while (0) #endif /* _ALPHA_PGALLOC_H */ From 28bcf5937536062d96ee0b581a76a0b1b652eec6 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 11 Jul 2019 20:57:57 -0700 Subject: [PATCH 104/147] arm: switch to generic version of pte allocation Replace __get_free_page() and alloc_pages() calls with the generic __pte_alloc_one_kernel() and __pte_alloc_one(). There is no functional change for the kernel PTE allocation. The difference for the user PTEs, is that the clear_pte_table() is now called after pgtable_page_ctor() and the addition of __GFP_ACCOUNT to the GFP flags. The conversion to the generic version of pte_free_kernel() removes the NULL check for pte. The pte_free() version on arm is identical to the generic one and can be simply dropped. 
Link: http://lkml.kernel.org/r/1557296232-15361-4-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Cc: Albert Ou Cc: Anshuman Khandual Cc: Anton Ivanov Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Guan Xuetao Cc: Guo Ren Cc: Guo Ren Cc: Helge Deller Cc: Ley Foon Tan Cc: Matthew Wilcox Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Hocko Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Sam Creasey Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/pgalloc.h | 41 +++++++++++----------------------- arch/arm/mm/mmu.c | 2 +- 2 files changed, 14 insertions(+), 29 deletions(-) diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index c038cff6fdd3..a2a68b751971 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -54,8 +54,6 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); -#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) - static inline void clean_pte_table(pte_t *pte) { clean_dcache_area(pte + PTE_HWTABLE_PTRS, PTE_HWTABLE_SIZE); @@ -77,54 +75,41 @@ static inline void clean_pte_table(pte_t *pte) * | h/w pt 1 | * +------------+ */ + +#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL +#define __HAVE_ARCH_PTE_ALLOC_ONE +#include + static inline pte_t * pte_alloc_one_kernel(struct mm_struct *mm) { - pte_t *pte; + pte_t *pte = __pte_alloc_one_kernel(mm); - pte = (pte_t *)__get_free_page(PGALLOC_GFP); if (pte) clean_pte_table(pte); return pte; } +#ifdef CONFIG_HIGHPTE +#define PGTABLE_HIGHMEM __GFP_HIGHMEM +#else +#define PGTABLE_HIGHMEM 0 +#endif + static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *pte; -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0); -#else - pte = alloc_pages(PGALLOC_GFP, 0); -#endif + pte = __pte_alloc_one(mm, GFP_PGTABLE_USER | PGTABLE_HIGHMEM); if (!pte) return NULL; if (!PageHighMem(pte)) clean_pte_table(page_address(pte)); - if (!pgtable_page_ctor(pte)) { - __free_page(pte); - return NULL; - } return pte; } -/* - * Free one PTE table. - */ -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - if (pte) - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t pte, pmdval_t prot) { diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 1aa2586fa597..d9a0038774a6 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -729,7 +729,7 @@ static void __init *early_alloc(unsigned long sz) static void *__init late_alloc(unsigned long sz) { - void *ptr = (void *)__get_free_pages(PGALLOC_GFP, get_order(sz)); + void *ptr = (void *)__get_free_pages(GFP_PGTABLE_KERNEL, get_order(sz)); if (!ptr || !pgtable_page_ctor(virt_to_page(ptr))) BUG(); From 50f11a8a4620eee6b6831e69ab5d42456546d7d8 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 11 Jul 2019 20:58:02 -0700 Subject: [PATCH 105/147] arm64: switch to generic version of pte allocation The PTE allocations in arm64 are identical to the generic ones modulo the GFP flags. Using the generic pte_alloc_one() functions ensures that the user page tables are allocated with __GFP_ACCOUNT set. 
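The split between the two GFP sets comes down to whether the table belongs to init_mm; a minimal sketch of the rule, using the GFP_PGTABLE_* definitions from the generic header (the helper name is made up):

/* Kernel page tables (init_mm) must not be charged to a memcg;
 * user page tables get __GFP_ACCOUNT via GFP_PGTABLE_USER. */
static inline gfp_t pgtable_gfp_for(struct mm_struct *mm)
{
	return mm == &init_mm ? GFP_PGTABLE_KERNEL : GFP_PGTABLE_USER;
}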
The arm64 definition of PGALLOC_GFP is removed and replaced with GFP_PGTABLE_USER for p[gum]d_alloc_one() for the user page tables and GFP_PGTABLE_KERNEL for the kernel page tables. The KVM memory cache is now using GFP_PGTABLE_USER. The mappings created with create_pgd_mapping() are now using GFP_PGTABLE_KERNEL. The conversion to the generic version of pte_free_kernel() removes the NULL check for pte. The pte_free() version on arm64 is identical to the generic one and can be simply dropped. [cai@lca.pw: fix a bogus GFP flag in pgd_alloc()] Link: https://lore.kernel.org/r/1559656836-24940-1-git-send-email-cai@lca.pw/ [and fix it more] Link: https://lore.kernel.org/linux-mm/20190617151252.GF16810@rapoport-lnx/ Link: http://lkml.kernel.org/r/1557296232-15361-5-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Cc: Albert Ou Cc: Anshuman Khandual Cc: Anton Ivanov Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Guan Xuetao Cc: Guo Ren Cc: Guo Ren Cc: Helge Deller Cc: Ley Foon Tan Cc: Matthew Wilcox Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Hocko Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Sam Creasey Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/pgalloc.h | 47 ++++++-------------------------- arch/arm64/mm/mmu.c | 2 +- arch/arm64/mm/pgd.c | 6 ++-- virt/kvm/arm/mmu.c | 2 +- 4 files changed, 14 insertions(+), 43 deletions(-) diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index cdced518378d..14d0bc44d451 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -13,18 +13,23 @@ #include #include +#include /* for pte_{alloc,free}_one */ + #define check_pgt_cache() do { } while (0) -#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) #if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { + gfp_t gfp = GFP_PGTABLE_USER; struct page *page; - page = alloc_page(PGALLOC_GFP); + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + + page = alloc_page(gfp); if (!page) return NULL; if (!pgtable_pmd_page_ctor(page)) { @@ -61,7 +66,7 @@ static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pud_t *)__get_free_page(PGALLOC_GFP); + return (pud_t *)__get_free_page(GFP_PGTABLE_USER); } static inline void pud_free(struct mm_struct *mm, pud_t *pudp) @@ -89,42 +94,6 @@ static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot) extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t *pgdp); -static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)__get_free_page(PGALLOC_GFP); -} - -static inline pgtable_t -pte_alloc_one(struct mm_struct *mm) -{ - struct page *pte; - - pte = alloc_pages(PGALLOC_GFP, 0); - if (!pte) - return NULL; - if (!pgtable_page_ctor(pte)) { - __free_page(pte); - return NULL; - } - return pte; -} - -/* - * Free a PTE table. 
- */ -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *ptep) -{ - if (ptep) - free_page((unsigned long)ptep); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t ptep, pmdval_t prot) { diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 3645f29bd814..1b49c08dfa2b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -362,7 +362,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, static phys_addr_t __pgd_pgtable_alloc(int shift) { - void *ptr = (void *)__get_free_page(PGALLOC_GFP); + void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL); BUG_ON(!ptr); /* Ensure the zeroed page is visible to the page table walker */ diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 9a0c7d5090d6..7548f9ca1f11 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -19,10 +19,12 @@ static struct kmem_cache *pgd_cache __ro_after_init; pgd_t *pgd_alloc(struct mm_struct *mm) { + gfp_t gfp = GFP_PGTABLE_USER; + if (PGD_SIZE == PAGE_SIZE) - return (pgd_t *)__get_free_page(PGALLOC_GFP); + return (pgd_t *)__get_free_page(gfp); else - return kmem_cache_alloc(pgd_cache, PGALLOC_GFP); + return kmem_cache_alloc(pgd_cache, gfp); } void pgd_free(struct mm_struct *mm, pgd_t *pgd) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 198e5171e1f7..38b4c910b6c3 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -129,7 +129,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, if (cache->nobjs >= min) return 0; while (cache->nobjs < max) { - page = (void *)__get_free_page(PGALLOC_GFP); + page = (void *)__get_free_page(GFP_PGTABLE_USER); if (!page) return -ENOMEM; cache->objects[cache->nobjs++] = page; From bd5ff066514c2dcffd443cfaa55580db0f19caf8 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 11 Jul 2019 20:58:06 -0700 Subject: [PATCH 106/147] csky: switch to generic version of pte allocation The csky implementation pte_alloc_one(), pte_free_kernel() and pte_free() is identical to the generic except of lack of __GFP_ACCOUNT for the user PTEs allocation. Switch csky to use generic version of these functions. The csky implementation of pte_alloc_one_kernel() is not replaced because it does not clear the allocated page but rather sets each PTE in it to a non-zero value. The pte_free_kernel() and pte_free() versions on csky are identical to the generic ones and can be simply dropped. 
Link: http://lkml.kernel.org/r/1557296232-15361-6-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Guo Ren Cc: Albert Ou Cc: Anshuman Khandual Cc: Anton Ivanov Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Guan Xuetao Cc: Guo Ren Cc: Helge Deller Cc: Ley Foon Tan Cc: Matthew Wilcox Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Hocko Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Sam Creasey Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/csky/include/asm/pgalloc.h | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h index d213bb47b717..98c5716708d6 100644 --- a/arch/csky/include/asm/pgalloc.h +++ b/arch/csky/include/asm/pgalloc.h @@ -8,6 +8,9 @@ #include #include +#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL +#include /* for pte_{alloc,free}_one */ + static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { @@ -39,33 +42,6 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) return pte; } -static inline struct page *pte_alloc_one(struct mm_struct *mm) -{ - struct page *pte; - - pte = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0); - if (!pte) - return NULL; - - if (!pgtable_page_ctor(pte)) { - __free_page(pte); - return NULL; - } - - return pte; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_pages((unsigned long)pte, PTE_ORDER); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_pages(pte, PTE_ORDER); -} - static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { free_pages((unsigned long)pgd, PGD_ORDER); From 14c0a39c9af9a25e0f94f9be89431c2debb34f2c Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 11 Jul 2019 20:58:10 -0700 Subject: [PATCH 107/147] m68k: sun3: switch to generic version of pte allocation The sun3 MMU variant of m68k uses GFP_KERNEL to allocate a PTE page and then memset(0) or clear_highpage() to clear it. This is equivalent to allocating the page with GFP_KERNEL | __GFP_ZERO, which allows replacing sun3 implementation of pte_alloc_one() and pte_alloc_one_kernel() with the generic ones. The pte_free() and pte_free_kernel() versions are identical to the generic ones and can be simply dropped. 
Link: http://lkml.kernel.org/r/1557296232-15361-8-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Cc: Albert Ou Cc: Anshuman Khandual Cc: Anton Ivanov Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Guan Xuetao Cc: Guo Ren Cc: Guo Ren Cc: Helge Deller Cc: Ley Foon Tan Cc: Matthew Wilcox Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Hocko Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Sam Creasey Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m68k/include/asm/sun3_pgalloc.h | 41 ++-------------------------- 1 file changed, 2 insertions(+), 39 deletions(-) diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index 1456c5eecbd9..1a8ddbd0d23c 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -13,55 +13,18 @@ #include +#include /* for pte_{alloc,free}_one */ + extern const char bad_pmd_string[]; #define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long) pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t page) -{ - pgtable_page_dtor(page); - __free_page(page); -} - #define __pte_free_tlb(tlb,pte,addr) \ do { \ pgtable_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ } while (0) -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - unsigned long page = __get_free_page(GFP_KERNEL); - - if (!page) - return NULL; - - memset((void *)page, 0, PAGE_SIZE); - return (pte_t *) (page); -} - -static inline pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - struct page *page = alloc_pages(GFP_KERNEL, 0); - - if (page == NULL) - return NULL; - - clear_highpage(page); - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - return page; - -} - static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { pmd_val(*pmd) = __pa((unsigned long)pte); From b7902ce175476b767e6c614fded293faf906deee Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 11 Jul 2019 20:58:14 -0700 Subject: [PATCH 108/147] mips: switch to generic version of pte allocation MIPS allocates kernel PTE pages with __get_free_pages(GFP_KERNEL | __GFP_ZERO, PTE_ORDER) and user PTE pages with pte = alloc_pages(GFP_KERNEL, PTE_ORDER) and then uses clear_highpage(pte) to zero out the allocated page for the user page tables. The PTE_ORDER is hardwired to zero, which makes MIPS implementation almost identical to the generic one. Switch MIPS to the generic version that does exactly the same thing for the kernel page tables and adds __GFP_ACCOUNT for the user PTEs. The pte_free_kernel() and pte_free() versions on mips are identical to the generic ones and can be simply dropped. 
Link: http://lkml.kernel.org/r/1557296232-15361-9-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Paul Burton Cc: Albert Ou Cc: Anshuman Khandual Cc: Anton Ivanov Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Guan Xuetao Cc: Guo Ren Cc: Guo Ren Cc: Helge Deller Cc: Ley Foon Tan Cc: Matthew Wilcox Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Hocko Cc: Palmer Dabbelt Cc: Ralf Baechle Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Sam Creasey Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/include/asm/pgalloc.h | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index 27808d9461f4..aa16b85ddffc 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -13,6 +13,8 @@ #include #include +#include /* for pte_{alloc,free}_one */ + static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { @@ -50,37 +52,6 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_pages((unsigned long)pgd, PGD_ORDER); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PTE_ORDER); -} - -static inline struct page *pte_alloc_one(struct mm_struct *mm) -{ - struct page *pte; - - pte = alloc_pages(GFP_KERNEL, PTE_ORDER); - if (!pte) - return NULL; - clear_highpage(pte); - if (!pgtable_page_ctor(pte)) { - __free_page(pte); - return NULL; - } - return pte; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_pages((unsigned long)pte, PTE_ORDER); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_pages(pte, PTE_ORDER); -} - #define __pte_free_tlb(tlb,pte,address) \ do { \ pgtable_page_dtor(pte); \ From f52a8e1a67cde67c33d5c2eabd6494dcab956677 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 11 Jul 2019 20:58:18 -0700 Subject: [PATCH 109/147] nds32: switch to generic version of pte allocation The nds32 implementation of pte_alloc_one_kernel() differs from the generic in the use of __GFP_RETRY_MAYFAIL flag, which is removed after the conversion. The nds32 version of pte_alloc_one() missed the call to pgtable_page_ctor() and also used __GFP_RETRY_MAYFAIL. Switching it to use generic __pte_alloc_one() for the PTE page allocation ensures that page table constructor is run and the user page tables are allocated with __GFP_ACCOUNT. The conversion to the generic version of pte_free_kernel() removes the NULL check for pte. The pte_free() version on nds32 is identical to the generic one and can be simply dropped. 
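The missed constructor is not cosmetic: pgtable_page_ctor() initializes the split page-table lock and page-table accounting, and has to be paired with pgtable_page_dtor() on free. A sketch of the pairing that __pte_alloc_one() and the generic pte_free() now provide (the example_* names are illustrative):

static pgtable_t example_pte_alloc(void)
{
	struct page *pte = alloc_page(GFP_PGTABLE_USER);

	if (!pte)
		return NULL;
	if (!pgtable_page_ctor(pte)) {	/* split ptlock + accounting */
		__free_page(pte);
		return NULL;
	}
	return pte;
}

static void example_pte_free(pgtable_t pte)
{
	pgtable_page_dtor(pte);
	__free_page(pte);
}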
Link: http://lkml.kernel.org/r/1557296232-15361-10-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Cc: Albert Ou Cc: Anshuman Khandual Cc: Anton Ivanov Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Guan Xuetao Cc: Guo Ren Cc: Guo Ren Cc: Helge Deller Cc: Ley Foon Tan Cc: Matthew Wilcox Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Hocko Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Sam Creasey Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/nds32/include/asm/pgalloc.h | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) diff --git a/arch/nds32/include/asm/pgalloc.h b/arch/nds32/include/asm/pgalloc.h index 3cbc749c79aa..e78b43d8389f 100644 --- a/arch/nds32/include/asm/pgalloc.h +++ b/arch/nds32/include/asm/pgalloc.h @@ -9,6 +9,9 @@ #include #include +#define __HAVE_ARCH_PTE_ALLOC_ONE +#include /* for pte_{alloc,free}_one */ + /* * Since we have only two-level page tables, these are trivial */ @@ -22,43 +25,17 @@ extern void pgd_free(struct mm_struct *mm, pgd_t * pgd); #define check_pgt_cache() do { } while (0) -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - pte_t *pte; - - pte = - (pte_t *) __get_free_page(GFP_KERNEL | __GFP_RETRY_MAYFAIL | - __GFP_ZERO); - - return pte; -} - static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { pgtable_t pte; - pte = alloc_pages(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO, 0); + pte = __pte_alloc_one(mm, GFP_PGTABLE_USER); if (pte) cpu_dcache_wb_page((unsigned long)page_address(pte)); return pte; } -/* - * Free one PTE table. - */ -static inline void pte_free_kernel(struct mm_struct *mm, pte_t * pte) -{ - if (pte) { - free_page((unsigned long)pte); - } -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - __free_page(pte); -} - /* * Populate the pmdp entry with a pointer to the pte. This pmd is part * of the mm address space. From fc7835c2f8ea800ded22f68bd782cd17a6dd83cd Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 11 Jul 2019 20:58:22 -0700 Subject: [PATCH 110/147] nios2: switch to generic version of pte allocation nios2 allocates kernel PTE pages with __get_free_pages(GFP_KERNEL | __GFP_ZERO, PTE_ORDER); and user page tables with pte = alloc_pages(GFP_KERNEL, PTE_ORDER); if (pte) clear_highpage(); The PTE_ORDER is hardwired to zero, which makes nios2 implementation almost identical to the generic one. Switch nios2 to the generic version that does exactly the same thing for the kernel page tables and adds __GFP_ACCOUNT for the user PTEs. The pte_free_kernel() and pte_free() versions on nios2 are identical to the generic ones and can be simply dropped. 
Link: http://lkml.kernel.org/r/1557296232-15361-11-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Cc: Albert Ou Cc: Anshuman Khandual Cc: Anton Ivanov Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Guan Xuetao Cc: Guo Ren Cc: Guo Ren Cc: Helge Deller Cc: Ley Foon Tan Cc: Matthew Wilcox Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Hocko Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Sam Creasey Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/nios2/include/asm/pgalloc.h | 37 ++------------------------------ 1 file changed, 2 insertions(+), 35 deletions(-) diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index 3a149ead1207..4bc8cf72067e 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -12,6 +12,8 @@ #include +#include /* for pte_{alloc,free}_one */ + static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { @@ -37,41 +39,6 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_pages((unsigned long)pgd, PGD_ORDER); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - pte_t *pte; - - pte = (pte_t *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, PTE_ORDER); - - return pte; -} - -static inline pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - struct page *pte; - - pte = alloc_pages(GFP_KERNEL, PTE_ORDER); - if (pte) { - if (!pgtable_page_ctor(pte)) { - __free_page(pte); - return NULL; - } - clear_highpage(pte); - } - return pte; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_pages((unsigned long)pte, PTE_ORDER); -} - -static inline void pte_free(struct mm_struct *mm, struct page *pte) -{ - pgtable_page_dtor(pte); - __free_pages(pte, PTE_ORDER); -} - #define __pte_free_tlb(tlb, pte, addr) \ do { \ pgtable_page_dtor(pte); \ From 3f4a13085dd88cb806a2c64fb1286e9cf3a98cd0 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 11 Jul 2019 20:58:27 -0700 Subject: [PATCH 111/147] parisc: switch to generic version of pte allocation parisc allocates PTE pages with __get_free_page() and uses GFP_KERNEL | __GFP_ZERO for the allocations. Switch it to the generic version that does exactly the same thing for the kernel page tables and adds __GFP_ACCOUNT for the user PTEs. The pte_free_kernel() and pte_free() versions on are identical to the generic ones and can be simply dropped. 
Link: http://lkml.kernel.org/r/1557296232-15361-12-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Cc: Albert Ou Cc: Anshuman Khandual Cc: Anton Ivanov Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Guan Xuetao Cc: Guo Ren Cc: Guo Ren Cc: Helge Deller Cc: Ley Foon Tan Cc: Matthew Wilcox Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Hocko Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Sam Creasey Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/parisc/include/asm/pgalloc.h | 33 ++----------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index ea75cc966dae..4f2059a50fae 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -10,6 +10,8 @@ #include +#include /* for pte_{alloc,free}_one */ + /* Allocate the top level pgd (page directory) * * Here (for 64 bit kernels) we implement a Hybrid L2/L3 scheme: we @@ -122,37 +124,6 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) pmd_populate_kernel(mm, pmd, page_address(pte_page)) #define pmd_pgtable(pmd) pmd_page(pmd) -static inline pgtable_t -pte_alloc_one(struct mm_struct *mm) -{ - struct page *page = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (!page) - return NULL; - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - return page; -} - -static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm) -{ - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); - return pte; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, struct page *pte) -{ - pgtable_page_dtor(pte); - pte_free_kernel(mm, page_address(pte)); -} - #define check_pgt_cache() do { } while (0) #endif From d1b46fe50c8b0e0b4035c48ccd5f655aa7ceea16 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 11 Jul 2019 20:58:31 -0700 Subject: [PATCH 112/147] riscv: switch to generic version of pte allocation The only difference between the generic and RISC-V implementation of PTE allocation is the usage of __GFP_RETRY_MAYFAIL for both kernel and user PTEs and the absence of __GFP_ACCOUNT for the user PTEs. The conversion to the generic version removes the __GFP_RETRY_MAYFAIL and ensures that GFP_ACCOUNT is used for the user PTE allocations. The pte_free() and pte_free_kernel() versions are identical to the generic ones and can be simply dropped. 
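Spelled out, the effective allocation flags for RISC-V change as follows (illustrative summary only, expanded from the macros above):

	/* user PTE pages, before: */
	alloc_page(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO);
	/* user PTE pages, after, via GFP_PGTABLE_USER: */
	alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT);
	/* kernel PTE pages lose __GFP_RETRY_MAYFAIL and keep GFP_KERNEL | __GFP_ZERO */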
Link: http://lkml.kernel.org/r/1557296232-15361-13-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Palmer Dabbelt Cc: Albert Ou Cc: Anshuman Khandual Cc: Anton Ivanov Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Guan Xuetao Cc: Guo Ren Cc: Guo Ren Cc: Helge Deller Cc: Ley Foon Tan Cc: Matthew Wilcox Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Hocko Cc: Paul Burton Cc: Ralf Baechle Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Sam Creasey Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/riscv/include/asm/pgalloc.h | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index eb8b0195f27f..56a67d66f72f 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -10,6 +10,8 @@ #include #include +#include /* for pte_{alloc,free}_one */ + static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { @@ -74,33 +76,6 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #endif /* __PAGETABLE_PMD_FOLDED */ -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)__get_free_page( - GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO); -} - -static inline struct page *pte_alloc_one(struct mm_struct *mm) -{ - struct page *pte; - - pte = alloc_page(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO); - if (likely(pte != NULL)) - pgtable_page_ctor(pte); - return pte; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - #define __pte_free_tlb(tlb, pte, buf) \ do { \ pgtable_page_dtor(pte); \ From f32848e16939e1407ad3413f7faa3e0a8ad802eb Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 11 Jul 2019 20:58:35 -0700 Subject: [PATCH 113/147] um: switch to generic version of pte allocation um allocates PTE pages with __get_free_page() and uses GFP_KERNEL | __GFP_ZERO for the allocations. Switch it to the generic version that does exactly the same thing for the kernel page tables and adds __GFP_ACCOUNT for the user PTEs. The pte_free() and pte_free_kernel() versions are identical to the generic ones and can be simply dropped. 
Link: http://lkml.kernel.org/r/1557296232-15361-14-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Anton Ivanov Acked-by: Anton Ivanov Cc: Albert Ou Cc: Anshuman Khandual Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Guan Xuetao Cc: Guo Ren Cc: Guo Ren Cc: Helge Deller Cc: Ley Foon Tan Cc: Matthew Wilcox Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Hocko Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Sam Creasey Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/include/asm/pgalloc.h | 16 ++-------------- arch/um/kernel/mem.c | 22 ---------------------- 2 files changed, 2 insertions(+), 36 deletions(-) diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 99eb5682792a..d7b282e9c4d5 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -10,6 +10,8 @@ #include +#include /* for pte_{alloc,free}_one */ + #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE + (unsigned long) __pa(pte))) @@ -25,20 +27,6 @@ extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); -extern pte_t *pte_alloc_one_kernel(struct mm_struct *); -extern pgtable_t pte_alloc_one(struct mm_struct *); - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long) pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - #define __pte_free_tlb(tlb,pte, address) \ do { \ pgtable_page_dtor(pte); \ diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index a9c9a94c096f..de58e976b9bc 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -208,28 +208,6 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long) pgd); } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - pte_t *pte; - - pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); - return pte; -} - -pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - struct page *pte; - - pte = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (!pte) - return NULL; - if (!pgtable_page_ctor(pte)) { - __free_page(pte); - return NULL; - } - return pte; -} - #ifdef CONFIG_3_LEVEL_PGTABLES pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) { From c2471e79a7ea0f48e3ae9253e1f3688a44cc944d Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 11 Jul 2019 20:58:39 -0700 Subject: [PATCH 114/147] unicore32: switch to generic version of pte allocation Replace __get_free_page() and alloc_pages() calls with the generic __pte_alloc_one_kernel() and __pte_alloc_one(). There is no functional change for the kernel PTE allocation. The difference for the user PTEs, is that the clear_pte_table() is now called after pgtable_page_ctor() and the addition of __GFP_ACCOUNT to the GFP flags. The pte_free() and pte_free_kernel() versions are identical to the generic ones and can be simply dropped. 
Link: http://lkml.kernel.org/r/1557296232-15361-15-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Guan Xuetao Cc: Guo Ren Cc: Helge Deller Cc: Ley Foon Tan Cc: Matthew Wilcox Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Hocko Cc: Palmer Dabbelt Cc: Paul Burton Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Sam Creasey Cc: Ralf Baechle Cc: Vincent Chen Cc: Albert Ou Cc: Anton Ivanov Cc: Guo Ren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/unicore32/include/asm/pgalloc.h | 36 +++++++--------------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h index ec64834b1c6a..3f0903bd98e9 100644 --- a/arch/unicore32/include/asm/pgalloc.h +++ b/arch/unicore32/include/asm/pgalloc.h @@ -14,6 +14,10 @@ #include #include +#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL +#define __HAVE_ARCH_PTE_ALLOC_ONE +#include + #define check_pgt_cache() do { } while (0) #define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) @@ -25,17 +29,14 @@ extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd); #define pgd_alloc(mm) get_pgd_slow(mm) #define pgd_free(mm, pgd) free_pgd_slow(mm, pgd) -#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) - /* * Allocate one PTE table. */ static inline pte_t * pte_alloc_one_kernel(struct mm_struct *mm) { - pte_t *pte; + pte_t *pte = __pte_alloc_one_kernel(mm); - pte = (pte_t *)__get_free_page(PGALLOC_GFP); if (pte) clean_dcache_area(pte, PTRS_PER_PTE * sizeof(pte_t)); @@ -47,35 +48,14 @@ pte_alloc_one(struct mm_struct *mm) { struct page *pte; - pte = alloc_pages(PGALLOC_GFP, 0); + pte = __pte_alloc_one(mm, GFP_PGTABLE_USER); if (!pte) return NULL; - if (!PageHighMem(pte)) { - void *page = page_address(pte); - clean_dcache_area(page, PTRS_PER_PTE * sizeof(pte_t)); - } - if (!pgtable_page_ctor(pte)) { - __free_page(pte); - } - + if (!PageHighMem(pte)) + clean_pte_table(page_address(pte)); return pte; } -/* - * Free one PTE table. - */ -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - if (pte) - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - static inline void __pmd_populate(pmd_t *pmdp, unsigned long pmdval) { set_pmd(pmdp, __pmd(pmdval)); From 8b1e0f81fb6fcf3109465a168b2e2da3f711fa86 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 11 Jul 2019 20:58:43 -0700 Subject: [PATCH 115/147] mm/pgtable: drop pgtable_t variable from pte_fn_t functions Drop the pgtable_t variable from all implementation for pte_fn_t as none of them use it. apply_to_pte_range() should stop computing it as well. Should help us save some cycles. Link: http://lkml.kernel.org/r/1556803126-26596-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Matthew Wilcox Cc: Ard Biesheuvel Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Michal Hocko Cc: Logan Gunthorpe Cc: "Kirill A. 
Shutemov" Cc: Dan Williams Cc: Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/efi.c | 3 +-- arch/arm/mm/dma-mapping.c | 3 +-- arch/arm/mm/pageattr.c | 3 +-- arch/arm64/kernel/efi.c | 3 +-- arch/arm64/mm/pageattr.c | 3 +-- arch/x86/xen/mmu_pv.c | 3 +-- drivers/gpu/drm/i915/i915_mm.c | 3 +-- drivers/xen/gntdev.c | 6 ++---- drivers/xen/privcmd.c | 6 ++---- drivers/xen/xlate_mmu.c | 3 +-- include/linux/mm.h | 3 +-- mm/memory.c | 5 +---- mm/vmalloc.c | 2 +- 13 files changed, 15 insertions(+), 31 deletions(-) diff --git a/arch/arm/kernel/efi.c b/arch/arm/kernel/efi.c index ed005870671a..e57dbcc89123 100644 --- a/arch/arm/kernel/efi.c +++ b/arch/arm/kernel/efi.c @@ -8,8 +8,7 @@ #include #include -static int __init set_permissions(pte_t *ptep, pgtable_t token, - unsigned long addr, void *data) +static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { efi_memory_desc_t *md = data; pte_t pte = *ptep; diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 439bb6a59a04..1fb5c0ca1ed8 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -493,8 +493,7 @@ void __init dma_contiguous_remap(void) } } -static int __dma_update_pte(pte_t *pte, pgtable_t token, unsigned long addr, - void *data) +static int __dma_update_pte(pte_t *pte, unsigned long addr, void *data) { struct page *page = virt_to_page(addr); pgprot_t prot = *(pgprot_t *)data; diff --git a/arch/arm/mm/pageattr.c b/arch/arm/mm/pageattr.c index 0f5faf30d9bf..d546efad7e97 100644 --- a/arch/arm/mm/pageattr.c +++ b/arch/arm/mm/pageattr.c @@ -14,8 +14,7 @@ struct page_change_data { pgprot_t clear_mask; }; -static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr, - void *data) +static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; pte_t pte = *ptep; diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 3c33d0dd8e0e..d0cf596db82c 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -82,8 +82,7 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) return 0; } -static int __init set_permissions(pte_t *ptep, pgtable_t token, - unsigned long addr, void *data) +static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { efi_memory_desc_t *md = data; pte_t pte = READ_ONCE(*ptep); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index fcdcf6cd7677..03c53f16ee77 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -19,8 +19,7 @@ struct page_change_data { bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED); -static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr, - void *data) +static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; pte_t pte = READ_ONCE(*ptep); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index beb44e22afdf..f6e5eeecfc69 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2700,8 +2700,7 @@ struct remap_data { struct mmu_update *mmu_update; }; -static int remap_area_pfn_pte_fn(pte_t *ptep, pgtable_t token, - unsigned long addr, void *data) +static int remap_area_pfn_pte_fn(pte_t *ptep, unsigned long addr, void *data) { struct remap_data *rmd = data; pte_t pte = pte_mkspecial(mfn_pte(*rmd->pfn, rmd->prot)); diff --git a/drivers/gpu/drm/i915/i915_mm.c b/drivers/gpu/drm/i915/i915_mm.c index 
e4935dd1fd37..c23bb29e6d3e 100644 --- a/drivers/gpu/drm/i915/i915_mm.c +++ b/drivers/gpu/drm/i915/i915_mm.c @@ -35,8 +35,7 @@ struct remap_pfn { pgprot_t prot; }; -static int remap_pfn(pte_t *pte, pgtable_t token, - unsigned long addr, void *data) +static int remap_pfn(pte_t *pte, unsigned long addr, void *data) { struct remap_pfn *r = data; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 469dfbd6cf90..4c339c7e66e5 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -264,8 +264,7 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map) /* ------------------------------------------------------------------ */ -static int find_grant_ptes(pte_t *pte, pgtable_t token, - unsigned long addr, void *data) +static int find_grant_ptes(pte_t *pte, unsigned long addr, void *data) { struct gntdev_grant_map *map = data; unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT; @@ -292,8 +291,7 @@ static int find_grant_ptes(pte_t *pte, pgtable_t token, } #ifdef CONFIG_X86 -static int set_grant_ptes_as_special(pte_t *pte, pgtable_t token, - unsigned long addr, void *data) +static int set_grant_ptes_as_special(pte_t *pte, unsigned long addr, void *data) { set_pte_at(current->mm, addr, pte, pte_mkspecial(*pte)); return 0; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 1ff38d8036e9..2f5ce7230a43 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -731,8 +731,7 @@ struct remap_pfn { unsigned long i; }; -static int remap_pfn_fn(pte_t *ptep, pgtable_t token, unsigned long addr, - void *data) +static int remap_pfn_fn(pte_t *ptep, unsigned long addr, void *data) { struct remap_pfn *r = data; struct page *page = r->pages[r->i]; @@ -966,8 +965,7 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) * on a per pfn/pte basis. Mapping calls that fail with ENOENT * can be then retried until success. */ -static int is_mapped_fn(pte_t *pte, struct page *pmd_page, - unsigned long addr, void *data) +static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data) { return pte_none(*pte) ? 0 : -EBUSY; } diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c index e7df65d32c91..ba883a80b3c0 100644 --- a/drivers/xen/xlate_mmu.c +++ b/drivers/xen/xlate_mmu.c @@ -93,8 +93,7 @@ static void setup_hparams(unsigned long gfn, void *data) info->fgfn++; } -static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, - void *data) +static int remap_pte_fn(pte_t *ptep, unsigned long addr, void *data) { struct remap_data *info = data; struct page *page = info->pages[info->index++]; diff --git a/include/linux/mm.h b/include/linux/mm.h index cb8d413d635e..bb242ad810eb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2686,8 +2686,7 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) return 0; } -typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, - void *data); +typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); diff --git a/mm/memory.c b/mm/memory.c index b47e4e56448a..0428ff5ee339 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2036,7 +2036,6 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; int err; - pgtable_t token; spinlock_t *uninitialized_var(ptl); pte = (mm == &init_mm) ? 
@@ -2049,10 +2048,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, arch_enter_lazy_mmu_mode(); - token = pmd_pgtable(*pmd); - do { - err = fn(pte++, token, addr, data); + err = fn(pte++, addr, data); if (err) break; } while (addr += PAGE_SIZE, addr != end); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 030a544e6602..a5413a6e51fa 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2996,7 +2996,7 @@ void __weak vmalloc_sync_all(void) } -static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) +static int f(pte_t *pte, unsigned long addr, void *data) { pte_t ***p = data; From 96756fcb831ddec3ad15f3a107b6e2749084aafb Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Thu, 11 Jul 2019 20:58:47 -0700 Subject: [PATCH 116/147] mm/memory.c: fail when offset == num in first check of __vm_map_pages() If the caller asks us for offset == num, we should already fail in the first check, i.e. the one testing for offsets beyond the object. At the moment, we are failing on the second test anyway, since count cannot be 0. Still, to agree with the comment of the first test, we should first test it there. Link: http://lkml.kernel.org/r/20190528193004.GA7744@gmail.com Signed-off-by: Miguel Ojeda Reviewed-by: Andrew Morton Cc: Souptick Joarder Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 0428ff5ee339..ad4bf1a1a0ef 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1545,7 +1545,7 @@ static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages, int ret, i; /* Fail if the user requested offset is beyond the end of the object */ - if (offset > num) + if (offset >= num) return -ENXIO; /* Fail if the user requested size exceeds available object size */ From 543bdb2d825fe2400d6e951f1786d92139a16931 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 11 Jul 2019 20:58:50 -0700 Subject: [PATCH 117/147] mm/mmu_notifier: use hlist_add_head_rcu() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make mmu_notifier_register() safer by issuing a memory barrier before registering a new notifier. This fixes a theoretical bug on weakly ordered CPUs. For example, take this simplified use of notifiers by a driver: my_struct->mn.ops = &my_ops; /* (1) */ mmu_notifier_register(&my_struct->mn, mm) ... hlist_add_head(&mn->hlist, &mm->mmu_notifiers); /* (2) */ ... Once mmu_notifier_register() releases the mm locks, another thread can invalidate a range: mmu_notifier_invalidate_range() ... hlist_for_each_entry_rcu(mn, &mm->mmu_notifiers, hlist) { if (mn->ops->invalidate_range) The read side relies on the data dependency between mn and ops to ensure that the pointer is properly initialized. But the write side doesn't have any dependency between (1) and (2), so they could be reordered and the readers could dereference an invalid mn->ops. mmu_notifier_register() does take all the mm locks before adding to the hlist, but those have acquire semantics which isn't sufficient. By calling hlist_add_head_rcu() instead of hlist_add_head() we update the hlist using a store-release, ensuring that readers see prior initialization of my_struct. This situation is better illustated by litmus test MP+onceassign+derefonce. 
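For completeness, the difference between the two list helpers boils down to the final publishing store; hlist_add_head_rcu() looks roughly like this (paraphrasing include/linux/rculist.h):

	static inline void hlist_add_head_rcu(struct hlist_node *n,
					      struct hlist_head *h)
	{
		struct hlist_node *first = h->first;

		n->next = first;
		n->pprev = &h->first;
		/*
		 * Store-release: orders the earlier writes to *n (and to the
		 * enclosing object, e.g. mn->ops) before the node becomes
		 * reachable from the list head.
		 */
		rcu_assign_pointer(hlist_first_rcu(h), n);
		if (first)
			first->pprev = &n->next;
	}

whereas plain hlist_add_head() publishes the node with an ordinary store, which is what allowed the reordering described above.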
Link: http://lkml.kernel.org/r/20190502133532.24981-1-jean-philippe.brucker@arm.com Fixes: cddb8a5c14aa ("mmu-notifiers: core") Signed-off-by: Jean-Philippe Brucker Cc: Jérôme Glisse Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 513b9607409d..b5670620aea0 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -274,7 +274,7 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, * thanks to mm_take_all_locks(). */ spin_lock(&mm->mmu_notifier_mm->lock); - hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); + hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); spin_unlock(&mm->mmu_notifier_mm->lock); mm_drop_all_locks(mm); From cacca6baf0b0a2dfe8eb3430b5f81916f35284cc Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 11 Jul 2019 20:58:53 -0700 Subject: [PATCH 118/147] mm/vmalloc.c: remove "node" argument Patch series "Some cleanups for the KVA/vmalloc", v5. This patch (of 4): Remove unused argument from the __alloc_vmap_area() function. Link: http://lkml.kernel.org/r/20190606120411.8298-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Andrew Morton Reviewed-by: Roman Gushchin Cc: Hillf Danton Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a5413a6e51fa..b645686ef9b6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -986,7 +986,7 @@ adjust_va_to_fit_type(struct vmap_area *va, */ static __always_inline unsigned long __alloc_vmap_area(unsigned long size, unsigned long align, - unsigned long vstart, unsigned long vend, int node) + unsigned long vstart, unsigned long vend) { unsigned long nva_start_addr; struct vmap_area *va; @@ -1063,7 +1063,7 @@ retry: * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. */ - addr = __alloc_vmap_area(size, align, vstart, vend, node); + addr = __alloc_vmap_area(size, align, vstart, vend); if (unlikely(addr == vend)) goto overflow; From 82dd23e84be3ead53b6d584d836f51852d1096e6 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 11 Jul 2019 20:58:57 -0700 Subject: [PATCH 119/147] mm/vmalloc.c: preload a CPU with one object for split purpose Refactor the NE_FIT_TYPE split case when it comes to an allocation of one extra object. We need it in order to build a remaining space. The preload is done per CPU in non-atomic context with GFP_KERNEL flags. More permissive parameters can be beneficial for systems which are suffer from high memory pressure or low memory condition. For example on my KVM system(4xCPUs, no swap, 256MB RAM) i can simulate the failure of page allocation with GFP_NOWAIT flags. 
Using "stress-ng" tool and starting N workers spinning on fork() and exit(), i can trigger below trace: [ 179.815161] stress-ng-fork: page allocation failure: order:0, mode:0x40800(GFP_NOWAIT|__GFP_COMP), nodemask=(null),cpuset=/,mems_allowed=0 [ 179.815168] CPU: 0 PID: 12612 Comm: stress-ng-fork Not tainted 5.2.0-rc3+ #1003 [ 179.815170] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 179.815171] Call Trace: [ 179.815178] dump_stack+0x5c/0x7b [ 179.815182] warn_alloc+0x108/0x190 [ 179.815187] __alloc_pages_slowpath+0xdc7/0xdf0 [ 179.815191] __alloc_pages_nodemask+0x2de/0x330 [ 179.815194] cache_grow_begin+0x77/0x420 [ 179.815197] fallback_alloc+0x161/0x200 [ 179.815200] kmem_cache_alloc+0x1c9/0x570 [ 179.815202] alloc_vmap_area+0x32c/0x990 [ 179.815206] __get_vm_area_node+0xb0/0x170 [ 179.815208] __vmalloc_node_range+0x6d/0x230 [ 179.815211] ? _do_fork+0xce/0x3d0 [ 179.815213] copy_process.part.46+0x850/0x1b90 [ 179.815215] ? _do_fork+0xce/0x3d0 [ 179.815219] _do_fork+0xce/0x3d0 [ 179.815226] ? __do_page_fault+0x2bf/0x4e0 [ 179.815229] do_syscall_64+0x55/0x130 [ 179.815231] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 179.815234] RIP: 0033:0x7fedec4c738b ... [ 179.815237] RSP: 002b:00007ffda469d730 EFLAGS: 00000246 ORIG_RAX: 0000000000000038 [ 179.815239] RAX: ffffffffffffffda RBX: 00007ffda469d730 RCX: 00007fedec4c738b [ 179.815240] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011 [ 179.815241] RBP: 00007ffda469d780 R08: 00007fededd6e300 R09: 00007ffda47f50a0 [ 179.815242] R10: 00007fededd6e5d0 R11: 0000000000000246 R12: 0000000000000000 [ 179.815243] R13: 0000000000000020 R14: 0000000000000000 R15: 0000000000000000 [ 179.815245] Mem-Info: [ 179.815249] active_anon:12686 inactive_anon:14760 isolated_anon:0 active_file:502 inactive_file:61 isolated_file:70 unevictable:2 dirty:0 writeback:0 unstable:0 slab_reclaimable:2380 slab_unreclaimable:7520 mapped:15069 shmem:14813 pagetables:10833 bounce:0 free:1922 free_pcp:229 free_cma:0 Link: http://lkml.kernel.org/r/20190606120411.8298-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Hillf Danton Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Roman Gushchin Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b645686ef9b6..45e0dc0e09f8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -365,6 +365,13 @@ static LIST_HEAD(free_vmap_area_list); */ static struct rb_root free_vmap_area_root = RB_ROOT; +/* + * Preload a CPU with one object for "no edge" split case. The + * aim is to get rid of allocations from the atomic context, thus + * to use more permissive allocation masks. + */ +static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); + static __always_inline unsigned long va_size(struct vmap_area *va) { @@ -951,9 +958,24 @@ adjust_va_to_fit_type(struct vmap_area *va, * L V NVA V R * |---|-------|---| */ - lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); - if (unlikely(!lva)) - return -1; + lva = __this_cpu_xchg(ne_fit_preload_node, NULL); + if (unlikely(!lva)) { + /* + * For percpu allocator we do not do any pre-allocation + * and leave it as it is. The reason is it most likely + * never ends up with NE_FIT_TYPE splitting. In case of + * percpu allocations offsets and sizes are aligned to + * fixed align request, i.e. 
RE_FIT_TYPE and FL_FIT_TYPE + * are its main fitting cases. + * + * There are a few exceptions though, as an example it is + * a first allocation (early boot up) when we have "one" + * big free space that has to be split. + */ + lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); + if (!lva) + return -1; + } /* * Build the remainder. @@ -1032,7 +1054,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long vstart, unsigned long vend, int node, gfp_t gfp_mask) { - struct vmap_area *va; + struct vmap_area *va, *pva; unsigned long addr; int purged = 0; @@ -1057,7 +1079,32 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK); retry: + /* + * Preload this CPU with one extra vmap_area object to ensure + * that we have it available when fit type of free area is + * NE_FIT_TYPE. + * + * The preload is done in non-atomic context, thus it allows us + * to use more permissive allocation masks to be more stable under + * low memory condition and high memory pressure. + * + * Even if it fails we do not really care about that. Just proceed + * as it is. "overflow" path will refill the cache we allocate from. + */ + preempt_disable(); + if (!__this_cpu_read(ne_fit_preload_node)) { + preempt_enable(); + pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node); + preempt_disable(); + + if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) { + if (pva) + kmem_cache_free(vmap_area_cachep, pva); + } + } + spin_lock(&vmap_area_lock); + preempt_enable(); /* * If an allocation fails, the "vend" address is From 54f63d9d8a39118486eb8a7168cda5845240c3d2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 11 Jul 2019 20:59:00 -0700 Subject: [PATCH 120/147] mm/vmalloc.c: get rid of one single unlink_va() when merge It does not make sense to try to "unlink" the node that is definitely not linked with a list nor tree. On the first merge step VA just points to the previously disconnected busy area. On the second step, check if the node has been merged and do "unlink" if so, because now it points to an object that must be linked. Link: http://lkml.kernel.org/r/20190606120411.8298-4-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Hillf Danton Reviewed-by: Roman Gushchin Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 45e0dc0e09f8..857dd8415a2e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -719,9 +719,6 @@ merge_or_add_vmap_area(struct vmap_area *va, /* Check and update the tree if needed. */ augment_tree_propagate_from(sibling); - /* Remove this VA, it has been merged. */ - unlink_va(va, root); - /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); @@ -746,12 +743,11 @@ merge_or_add_vmap_area(struct vmap_area *va, /* Check and update the tree if needed. */ augment_tree_propagate_from(sibling); - /* Remove this VA, it has been merged. */ - unlink_va(va, root); + if (merged) + unlink_va(va, root); /* Free vmap_area object. 
*/ kmem_cache_free(vmap_area_cachep, va); - return; } } From 460e42d19a13d49455c5b269e8e0a1b1d522a895 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 11 Jul 2019 20:59:03 -0700 Subject: [PATCH 121/147] mm/vmalloc.c: switch to WARN_ON() and move it under unlink_va() Trigger a warning if an object that is about to be freed is detached. We used to have a BUG_ON(), but even though it is considered as faulty behaviour that is not a good reason to break a system. Link: http://lkml.kernel.org/r/20190606120411.8298-5-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Roman Gushchin Cc: Hillf Danton Cc: Michal Hocko Cc: Matthew Wilcox Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 857dd8415a2e..84f50d7e40bc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -534,20 +534,17 @@ link_va(struct vmap_area *va, struct rb_root *root, static __always_inline void unlink_va(struct vmap_area *va, struct rb_root *root) { - /* - * During merging a VA node can be empty, therefore - * not linked with the tree nor list. Just check it. - */ - if (!RB_EMPTY_NODE(&va->rb_node)) { - if (root == &free_vmap_area_root) - rb_erase_augmented(&va->rb_node, - root, &free_vmap_area_rb_augment_cb); - else - rb_erase(&va->rb_node, root); + if (WARN_ON(RB_EMPTY_NODE(&va->rb_node))) + return; - list_del(&va->list); - RB_CLEAR_NODE(&va->rb_node); - } + if (root == &free_vmap_area_root) + rb_erase_augmented(&va->rb_node, + root, &free_vmap_area_rb_augment_cb); + else + rb_erase(&va->rb_node, root); + + list_del(&va->list); + RB_CLEAR_NODE(&va->rb_node); } #if DEBUG_AUGMENT_PROPAGATE_CHECK @@ -1162,8 +1159,6 @@ EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); static void __free_vmap_area(struct vmap_area *va) { - BUG_ON(RB_EMPTY_NODE(&va->rb_node)); - /* * Remove from the busy tree/list. */ From d9009d67f42e59760aae5471ba2f62b3d5d531d1 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 11 Jul 2019 20:59:06 -0700 Subject: [PATCH 122/147] mm/vmalloc.c: spelling> s/informaion/information/ Link: http://lkml.kernel.org/r/20190607113509.15032-1-geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Reviewed-by: Andrew Morton Acked-by: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 84f50d7e40bc..edb212298c8a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2812,7 +2812,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) * Note: In usual ops, vread() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without - * any informaion, as /dev/kmem. + * any information, as /dev/kmem. * * Return: number of bytes for which addr and buf should be increased * (same number as @count) or %0 if [addr...addr+count) doesn't @@ -2891,7 +2891,7 @@ finished: * Note: In usual ops, vwrite() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without - * any informaion, as /dev/kmem. + * any information, as /dev/kmem. 
* * Return: number of bytes for which addr and buf should be * increased (same number as @count) or %0 if [addr...addr+count) From ec11408a1630eed2cb03db55b8b372267f5f1032 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Jul 2019 20:59:09 -0700 Subject: [PATCH 123/147] mm/large system hash: use vmalloc for size > MAX_ORDER when !hashdist The kernel currently clamps large system hashes to MAX_ORDER when hashdist is not set, which is rather arbitrary. vmalloc space is limited on 32-bit machines, but this shouldn't result in much more used because of small physical memory limiting system hash sizes. Include "vmalloc" or "linear" in the kernel log message. Link: http://lkml.kernel.org/r/20190605144814.29319-1-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ae56e8feec0c..05143e0f821f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7981,6 +7981,7 @@ void *__init alloc_large_system_hash(const char *tablename, unsigned long log2qty, size; void *table = NULL; gfp_t gfp_flags; + bool virt; /* allow the kernel cmdline to have a say */ if (!numentries) { @@ -8037,6 +8038,7 @@ void *__init alloc_large_system_hash(const char *tablename, gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; do { + virt = false; size = bucketsize << log2qty; if (flags & HASH_EARLY) { if (flags & HASH_ZERO) @@ -8044,26 +8046,26 @@ void *__init alloc_large_system_hash(const char *tablename, else table = memblock_alloc_raw(size, SMP_CACHE_BYTES); - } else if (hashdist) { + } else if (get_order(size) >= MAX_ORDER || hashdist) { table = __vmalloc(size, gfp_flags, PAGE_KERNEL); + virt = true; } else { /* * If bucketsize is not a power-of-two, we may free * some pages at the end of hash table which * alloc_pages_exact() automatically does */ - if (get_order(size) < MAX_ORDER) { - table = alloc_pages_exact(size, gfp_flags); - kmemleak_alloc(table, size, 1, gfp_flags); - } + table = alloc_pages_exact(size, gfp_flags); + kmemleak_alloc(table, size, 1, gfp_flags); } } while (!table && size > PAGE_SIZE && --log2qty); if (!table) panic("Failed to allocate %s hash table\n", tablename); - pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n", - tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size); + pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", + tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, + virt ? "vmalloc" : "linear"); if (_hash_shift) *_hash_shift = log2qty; From e03a5125ec7bd1d4e8b062816a8813436876dc7c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Jul 2019 20:59:12 -0700 Subject: [PATCH 124/147] mm/large system hash: clear hashdist when only one node with memory is booted CONFIG_NUMA on 64-bit CPUs currently enables hashdist unconditionally even when booting on single node machines. This causes the large system hashes to be allocated with vmalloc, and mapped with small pages. This change clears hashdist if only one node has come up with memory. This results in the important large inode and dentry hashes using memblock allocations. All others are within 4MB size up to about 128GB of RAM, which allows them to be allocated from the linear map on most non-NUMA images. 
Other big hashes like futex and TCP should eventually be moved over to the same style of allocation as those vfs caches that use HASH_EARLY if !hashdist, so they don't exceed MAX_ORDER on very large non-NUMA images. This brings dTLB misses for linux kernel tree `git diff` from ~45,000 to ~8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off (performance is in the noise, under 1% difference, page tables are likely to be well cached for this workload). Link: http://lkml.kernel.org/r/20190605144814.29319-2-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05143e0f821f..3a555ce69006 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7534,10 +7534,28 @@ static int page_alloc_cpu_dead(unsigned int cpu) return 0; } +#ifdef CONFIG_NUMA +int hashdist = HASHDIST_DEFAULT; + +static int __init set_hashdist(char *str) +{ + if (!str) + return 0; + hashdist = simple_strtoul(str, &str, 0); + return 1; +} +__setup("hashdist=", set_hashdist); +#endif + void __init page_alloc_init(void) { int ret; +#ifdef CONFIG_NUMA + if (num_node_state(N_MEMORY) == 1) + hashdist = 0; +#endif + ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD, "mm/page_alloc:dead", NULL, page_alloc_cpu_dead); @@ -7922,19 +7940,6 @@ out: return ret; } -#ifdef CONFIG_NUMA -int hashdist = HASHDIST_DEFAULT; - -static int __init set_hashdist(char *str) -{ - if (!str) - return 0; - hashdist = simple_strtoul(str, &str, 0); - return 1; -} -__setup("hashdist=", set_hashdist); -#endif - #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES /* * Returns the number of pages that arch has reserved but From ba5c5e4a5da443e80a3722e67515de5e37375b18 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 11 Jul 2019 20:59:15 -0700 Subject: [PATCH 125/147] arm64: move jump_label_init() before parse_early_param() While jump_label_init() was moved earlier in the boot process in efd9e03facd0 ("arm64: Use static keys for CPU features"), it wasn't early enough for early params to use it. The old state of things was as described here... init/main.c calls out to arch-specific things before general jump label and early param handling: asmlinkage __visible void __init start_kernel(void) { ... setup_arch(&command_line); ... smp_prepare_boot_cpu(); ... /* parameters may set static keys */ jump_label_init(); parse_early_param(); ... } x86 setup_arch() wants those earlier, so it handles jump label and early param: void __init setup_arch(char **cmdline_p) { ... jump_label_init(); ... parse_early_param(); ... } arm64 setup_arch() only had early param: void __init setup_arch(char **cmdline_p) { ... parse_early_param(); ... } with jump label later in smp_prepare_boot_cpu(): void __init smp_prepare_boot_cpu(void) { ... jump_label_init(); ... 
} This moves arm64 jump_label_init() from smp_prepare_boot_cpu() to setup_arch(), as done already on x86, in preparation from early param usage in the init_on_alloc/free() series: https://lkml.kernel.org/r/1561572949.5154.81.camel@lca.pw Link: http://lkml.kernel.org/r/201906271003.005303B52@keescook Signed-off-by: Kees Cook Acked-by: Ard Biesheuvel Acked-by: Catalin Marinas Cc: Alexander Potapenko Cc: Qian Cai Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/setup.c | 5 +++++ arch/arm64/kernel/smp.c | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 7e541f947b4c..9c4bad7d7131 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -283,6 +283,11 @@ void __init setup_arch(char **cmdline_p) setup_machine_fdt(__fdt_pointer); + /* + * Initialise the static keys early as they may be enabled by the + * cpufeature code and early parameters. + */ + jump_label_init(); parse_early_param(); /* diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 9286ee6749e8..ea90d3bd9253 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -420,11 +420,6 @@ void __init smp_cpus_done(unsigned int max_cpus) void __init smp_prepare_boot_cpu(void) { set_my_cpu_offset(per_cpu_offset(smp_processor_id())); - /* - * Initialise the static keys early as they may be enabled by the - * cpufeature code. - */ - jump_label_init(); cpuinfo_store_boot_cpu(); /* From 6471384af2a6530696fc0203bafe4de41a23c9ef Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 11 Jul 2019 20:59:19 -0700 Subject: [PATCH 126/147] mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options Patch series "add init_on_alloc/init_on_free boot options", v10. Provide init_on_alloc and init_on_free boot options. These are aimed at preventing possible information leaks and making the control-flow bugs that depend on uninitialized values more deterministic. Enabling either of the options guarantees that the memory returned by the page allocator and SL[AU]B is initialized with zeroes. SLOB allocator isn't supported at the moment, as its emulation of kmem caches complicates handling of SLAB_TYPESAFE_BY_RCU caches correctly. Enabling init_on_free also guarantees that pages and heap objects are initialized right after they're freed, so it won't be possible to access stale data by using a dangling pointer. As suggested by Michal Hocko, right now we don't let the heap users to disable initialization for certain allocations. There's not enough evidence that doing so can speed up real-life cases, and introducing ways to opt-out may result in things going out of control. This patch (of 2): The new options are needed to prevent possible information leaks and make control-flow bugs that depend on uninitialized values more deterministic. This is expected to be on-by-default on Android and Chrome OS. And it gives the opportunity for anyone else to use it under distros too via the boot args. (The init_on_free feature is regularly requested by folks where memory forensics is included in their threat models.) init_on_alloc=1 makes the kernel initialize newly allocated pages and heap objects with zeroes. Initialization is done at allocation time at the places where checks for __GFP_ZERO are performed. init_on_free=1 makes the kernel initialize freed pages and heap objects with zeroes upon their deletion. This helps to ensure sensitive data doesn't leak via use-after-free accesses. 
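To make the guarantee concrete (hypothetical example, not part of the patch): with "init_on_alloc=1" on the kernel command line, an allocation that does not pass __GFP_ZERO, e.g.

	u8 *p = kmalloc(64, GFP_KERNEL);	/* no __GFP_ZERO needed */

still returns zero-filled memory, and with "init_on_free=1" the object is wiped again when kfree(p) runs, so a later use-after-free observes zeroes rather than stale data (slab caches with constructors or SLAB_TYPESAFE_BY_RCU are the exceptions, as noted below).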
Both init_on_alloc=1 and init_on_free=1 guarantee that the allocator returns zeroed memory. The two exceptions are slab caches with constructors and SLAB_TYPESAFE_BY_RCU flag. Those are never zero-initialized to preserve their semantics. Both init_on_alloc and init_on_free default to zero, but those defaults can be overridden with CONFIG_INIT_ON_ALLOC_DEFAULT_ON and CONFIG_INIT_ON_FREE_DEFAULT_ON. If either SLUB poisoning or page poisoning is enabled, those options take precedence over init_on_alloc and init_on_free: initialization is only applied to unpoisoned allocations. Slowdown for the new features compared to init_on_free=0, init_on_alloc=0: hackbench, init_on_free=1: +7.62% sys time (st.err 0.74%) hackbench, init_on_alloc=1: +7.75% sys time (st.err 2.14%) Linux build with -j12, init_on_free=1: +8.38% wall time (st.err 0.39%) Linux build with -j12, init_on_free=1: +24.42% sys time (st.err 0.52%) Linux build with -j12, init_on_alloc=1: -0.13% wall time (st.err 0.42%) Linux build with -j12, init_on_alloc=1: +0.57% sys time (st.err 0.40%) The slowdown for init_on_free=0, init_on_alloc=0 compared to the baseline is within the standard error. The new features are also going to pave the way for hardware memory tagging (e.g. arm64's MTE), which will require both on_alloc and on_free hooks to set the tags for heap objects. With MTE, tagging will have the same cost as memory initialization. Although init_on_free is rather costly, there are paranoid use-cases where in-memory data lifetime is desired to be minimized. There are various arguments for/against the realism of the associated threat models, but given that we'll need the infrastructure for MTE anyway, and there are people who want wipe-on-free behavior no matter what the performance cost, it seems reasonable to include it in this series. [glider@google.com: v8] Link: http://lkml.kernel.org/r/20190626121943.131390-2-glider@google.com [glider@google.com: v9] Link: http://lkml.kernel.org/r/20190627130316.254309-2-glider@google.com [glider@google.com: v10] Link: http://lkml.kernel.org/r/20190628093131.199499-2-glider@google.com Link: http://lkml.kernel.org/r/20190617151050.92663-2-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Kees Cook Acked-by: Michal Hocko [page and dmapool parts Acked-by: James Morris ] Cc: Christoph Lameter Cc: Masahiro Yamada Cc: "Serge E. Hallyn" Cc: Nick Desaulniers Cc: Kostya Serebryany Cc: Dmitry Vyukov Cc: Sandeep Patil Cc: Laura Abbott Cc: Randy Dunlap Cc: Jann Horn Cc: Mark Rutland Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/kernel-parameters.txt | 9 +++ drivers/infiniband/core/uverbs_ioctl.c | 2 +- include/linux/mm.h | 24 +++++++ mm/dmapool.c | 4 +- mm/page_alloc.c | 71 +++++++++++++++++-- mm/slab.c | 16 ++++- mm/slab.h | 20 ++++++ mm/slub.c | 40 +++++++++-- net/core/sock.c | 2 +- security/Kconfig.hardening | 29 ++++++++ 10 files changed, 199 insertions(+), 18 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index aa4e7e7b87c2..099c5a4be95b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1668,6 +1668,15 @@ initrd= [BOOT] Specify the location of the initial ramdisk + init_on_alloc= [MM] Fill newly allocated pages and heap objects with + zeroes. + Format: 0 | 1 + Default set by CONFIG_INIT_ON_ALLOC_DEFAULT_ON. + + init_on_free= [MM] Fill freed pages and heap objects with zeroes. 
+ Format: 0 | 1 + Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON. + init_pkru= [x86] Specify the default memory protection keys rights register contents for all processes. 0x55555554 by default (disallow access to all but pkey 0). Can diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 829b0c6944d8..61758201d9b2 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -127,7 +127,7 @@ __malloc void *_uverbs_alloc(struct uverbs_attr_bundle *bundle, size_t size, res = (void *)pbundle->internal_buffer + pbundle->internal_used; pbundle->internal_used = ALIGN(new_used, sizeof(*pbundle->internal_buffer)); - if (flags & __GFP_ZERO) + if (want_init_on_alloc(flags)) memset(res, 0, size); return res; } diff --git a/include/linux/mm.h b/include/linux/mm.h index bb242ad810eb..f88f0eabcc5e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2700,6 +2700,30 @@ static inline void kernel_poison_pages(struct page *page, int numpages, int enable) { } #endif +#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON +DECLARE_STATIC_KEY_TRUE(init_on_alloc); +#else +DECLARE_STATIC_KEY_FALSE(init_on_alloc); +#endif +static inline bool want_init_on_alloc(gfp_t flags) +{ + if (static_branch_unlikely(&init_on_alloc) && + !page_poisoning_enabled()) + return true; + return flags & __GFP_ZERO; +} + +#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON +DECLARE_STATIC_KEY_TRUE(init_on_free); +#else +DECLARE_STATIC_KEY_FALSE(init_on_free); +#endif +static inline bool want_init_on_free(void) +{ + return static_branch_unlikely(&init_on_free) && + !page_poisoning_enabled(); +} + #ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT DECLARE_STATIC_KEY_TRUE(_debug_pagealloc_enabled); #else diff --git a/mm/dmapool.c b/mm/dmapool.c index 8c94c89a6f7e..fe5d33060415 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -378,7 +378,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, #endif spin_unlock_irqrestore(&pool->lock, flags); - if (mem_flags & __GFP_ZERO) + if (want_init_on_alloc(mem_flags)) memset(retval, 0, pool->size); return retval; @@ -428,6 +428,8 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) } offset = vaddr - page->vaddr; + if (want_init_on_free()) + memset(vaddr, 0, pool->size); #ifdef DMAPOOL_DEBUG if ((dma - page->dma) != offset) { spin_unlock_irqrestore(&pool->lock, flags); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3a555ce69006..dbd0d5cbbcbb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -135,6 +135,55 @@ unsigned long totalcma_pages __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; +#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON +DEFINE_STATIC_KEY_TRUE(init_on_alloc); +#else +DEFINE_STATIC_KEY_FALSE(init_on_alloc); +#endif +EXPORT_SYMBOL(init_on_alloc); + +#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON +DEFINE_STATIC_KEY_TRUE(init_on_free); +#else +DEFINE_STATIC_KEY_FALSE(init_on_free); +#endif +EXPORT_SYMBOL(init_on_free); + +static int __init early_init_on_alloc(char *buf) +{ + int ret; + bool bool_result; + + if (!buf) + return -EINVAL; + ret = kstrtobool(buf, &bool_result); + if (bool_result && page_poisoning_enabled()) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n"); + if (bool_result) + static_branch_enable(&init_on_alloc); + else + static_branch_disable(&init_on_alloc); + return ret; +} +early_param("init_on_alloc", early_init_on_alloc); + +static int __init early_init_on_free(char *buf) +{ + int ret; + bool 
bool_result; + + if (!buf) + return -EINVAL; + ret = kstrtobool(buf, &bool_result); + if (bool_result && page_poisoning_enabled()) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n"); + if (bool_result) + static_branch_enable(&init_on_free); + else + static_branch_disable(&init_on_free); + return ret; +} +early_param("init_on_free", early_init_on_free); /* * A cached value of the page's pageblock's migratetype, used when the page is @@ -1067,6 +1116,14 @@ out: return ret; } +static void kernel_init_free_pages(struct page *page, int numpages) +{ + int i; + + for (i = 0; i < numpages; i++) + clear_highpage(page + i); +} + static __always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free) { @@ -1118,6 +1175,9 @@ static __always_inline bool free_pages_prepare(struct page *page, PAGE_SIZE << order); } arch_free_page(page, order); + if (want_init_on_free()) + kernel_init_free_pages(page, 1 << order); + kernel_poison_pages(page, 1 << order, 0); if (debug_pagealloc_enabled()) kernel_map_pages(page, 1 << order, 0); @@ -2019,8 +2079,8 @@ static inline int check_new_page(struct page *page) static inline bool free_pages_prezeroed(void) { - return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && - page_poisoning_enabled(); + return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && + page_poisoning_enabled()) || want_init_on_free(); } #ifdef CONFIG_DEBUG_VM @@ -2090,13 +2150,10 @@ inline void post_alloc_hook(struct page *page, unsigned int order, static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags) { - int i; - post_alloc_hook(page, order, gfp_flags); - if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) - for (i = 0; i < (1 << order); i++) - clear_highpage(page + i); + if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) + kernel_init_free_pages(page, 1 << order); if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); diff --git a/mm/slab.c b/mm/slab.c index e9d90b0da47b..9df370558e5d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1811,6 +1811,14 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, cachep->num = 0; + /* + * If slab auto-initialization on free is enabled, store the freelist + * off-slab, so that its contents don't end up in one of the allocated + * objects. 
+ */ + if (unlikely(slab_want_init_on_free(cachep))) + return false; + if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU) return false; @@ -3248,7 +3256,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - if (unlikely(flags & __GFP_ZERO) && ptr) + if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr) memset(ptr, 0, cachep->object_size); slab_post_alloc_hook(cachep, flags, 1, &ptr); @@ -3305,7 +3313,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); - if (unlikely(flags & __GFP_ZERO) && objp) + if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp) memset(objp, 0, cachep->object_size); slab_post_alloc_hook(cachep, flags, 1, &objp); @@ -3426,6 +3434,8 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); + if (unlikely(slab_want_init_on_free(cachep))) + memset(objp, 0, cachep->object_size); kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); @@ -3513,7 +3523,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); /* Clear memory outside IRQ disabled section */ - if (unlikely(flags & __GFP_ZERO)) + if (unlikely(slab_want_init_on_alloc(flags, s))) for (i = 0; i < size; i++) memset(p[i], 0, s->object_size); diff --git a/mm/slab.h b/mm/slab.h index a62372d0f271..9057b8056b07 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -607,4 +607,24 @@ static inline int cache_random_seq_create(struct kmem_cache *cachep, static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } #endif /* CONFIG_SLAB_FREELIST_RANDOM */ +static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) +{ + if (static_branch_unlikely(&init_on_alloc)) { + if (c->ctor) + return false; + if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) + return flags & __GFP_ZERO; + return true; + } + return flags & __GFP_ZERO; +} + +static inline bool slab_want_init_on_free(struct kmem_cache *c) +{ + if (static_branch_unlikely(&init_on_free)) + return !(c->ctor || + (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); + return false; +} + #endif /* MM_SLAB_H */ diff --git a/mm/slub.c b/mm/slub.c index c9541a480627..e6c030e47364 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1279,6 +1279,10 @@ check_slabs: if (*str == ',') slub_debug_slabs = str + 1; out: + if ((static_branch_unlikely(&init_on_alloc) || + static_branch_unlikely(&init_on_free)) && + (slub_debug & SLAB_POISON)) + pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n"); return 1; } @@ -1422,6 +1426,28 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) static inline bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail) { + + void *object; + void *next = *head; + void *old_tail = *tail ? *tail : *head; + int rsize; + + if (slab_want_init_on_free(s)) + do { + object = next; + next = get_freepointer(s, object); + /* + * Clear the object and the metadata, but don't touch + * the redzone. + */ + memset(object, 0, s->object_size); + rsize = (s->flags & SLAB_RED_ZONE) ? 
s->red_left_pad + : 0; + memset((char *)object + s->inuse, 0, + s->size - s->inuse - rsize); + set_freepointer(s, object, next); + } while (object != old_tail); + /* * Compiler cannot detect this function can be removed if slab_free_hook() * evaluates to nothing. Thus, catch all relevant config debug options here. @@ -1431,9 +1457,7 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, defined(CONFIG_DEBUG_OBJECTS_FREE) || \ defined(CONFIG_KASAN) - void *object; - void *next = *head; - void *old_tail = *tail ? *tail : *head; + next = *head; /* Head and tail of the reconstructed freelist */ *head = NULL; @@ -2729,8 +2753,14 @@ redo: prefetch_freepointer(s, next_object); stat(s, ALLOC_FASTPATH); } + /* + * If the object has been wiped upon free, make sure it's fully + * initialized by zeroing out freelist pointer. + */ + if (unlikely(slab_want_init_on_free(s)) && object) + memset(object + s->offset, 0, sizeof(void *)); - if (unlikely(gfpflags & __GFP_ZERO) && object) + if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) memset(object, 0, s->object_size); slab_post_alloc_hook(s, gfpflags, 1, &object); @@ -3151,7 +3181,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_irq_enable(); /* Clear memory outside IRQ disabled fastpath loop */ - if (unlikely(flags & __GFP_ZERO)) { + if (unlikely(slab_want_init_on_alloc(flags, s))) { int j; for (j = 0; j < i; j++) diff --git a/net/core/sock.c b/net/core/sock.c index 3e073ca6138f..d57b0cc995a0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1597,7 +1597,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); if (!sk) return sk; - if (priority & __GFP_ZERO) + if (want_init_on_alloc(priority)) sk_prot_clear_nulls(sk, prot->obj_size); } else sk = kmalloc(prot->obj_size, priority); diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening index c6cb2d9b2905..a1ffe2eb4d5f 100644 --- a/security/Kconfig.hardening +++ b/security/Kconfig.hardening @@ -160,6 +160,35 @@ config STACKLEAK_RUNTIME_DISABLE runtime to control kernel stack erasing for kernels built with CONFIG_GCC_PLUGIN_STACKLEAK. +config INIT_ON_ALLOC_DEFAULT_ON + bool "Enable heap memory zeroing on allocation by default" + help + This has the effect of setting "init_on_alloc=1" on the kernel + command line. This can be disabled with "init_on_alloc=0". + When "init_on_alloc" is enabled, all page allocator and slab + allocator memory will be zeroed when allocated, eliminating + many kinds of "uninitialized heap memory" flaws, especially + heap content exposures. The performance impact varies by + workload, but most cases see <1% impact. Some synthetic + workloads have measured as high as 7%. + +config INIT_ON_FREE_DEFAULT_ON + bool "Enable heap memory zeroing on free by default" + help + This has the effect of setting "init_on_free=1" on the kernel + command line. This can be disabled with "init_on_free=0". + Similar to "init_on_alloc", when "init_on_free" is enabled, + all page allocator and slab allocator memory will be zeroed + when freed, eliminating many kinds of "uninitialized heap memory" + flaws, especially heap content exposures. The primary difference + with "init_on_free" is that data lifetime in memory is reduced, + as anything freed is wiped immediately, making live forensics or + cold boot memory attacks unable to recover freed memory contents. 
+ The performance impact varies by workload, but is more expensive + than "init_on_alloc" due to the negative cache effects of + touching "cold" memory areas. Most cases see 3-5% impact. Some + synthetic workloads have measured as high as 8%. + endmenu endmenu From 23a5c8cb7a91939cb08bc2dc880a7aa882bc6241 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 11 Jul 2019 20:59:23 -0700 Subject: [PATCH 127/147] mm: init: report memory auto-initialization features at boot time Print the currently enabled stack and heap initialization modes. Stack initialization is enabled by a config flag, while heap initialization is configured at boot time with defaults being set in the config. It's more convenient for the user to have all information about these hardening measures in one place at boot, so the user can reason about the expected behavior of the running system. The possible options for stack are: - "all" for CONFIG_INIT_STACK_ALL; - "byref_all" for CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL; - "byref" for CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF; - "__user" for CONFIG_GCC_PLUGIN_STRUCTLEAK_USER; - "off" otherwise. Depending on the values of init_on_alloc and init_on_free boottime options we also report "heap alloc" and "heap free" as "on"/"off". In the init_on_free mode initializing pages at boot time may take a while, so print a notice about that as well. This depends on how much memory is installed, the memory bandwidth, etc. On a relatively modern x86 system, it takes about 0.75s/GB to wipe all memory: [ 0.418722] mem auto-init: stack:byref_all, heap alloc:off, heap free:on [ 0.419765] mem auto-init: clearing system memory may take some time... [ 12.376605] Memory: 16408564K/16776672K available (14339K kernel code, 1397K rwdata, 3756K rodata, 1636K init, 11460K bss, 368108K reserved, 0K cma-reserved) Link: http://lkml.kernel.org/r/20190617151050.92663-3-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Kees Cook Acked-by: Kees Cook Cc: Christoph Lameter Cc: Dmitry Vyukov Cc: James Morris Cc: Jann Horn Cc: Kostya Serebryany Cc: Laura Abbott Cc: Mark Rutland Cc: Masahiro Yamada Cc: Matthew Wilcox Cc: Nick Desaulniers Cc: Randy Dunlap Cc: Sandeep Patil Cc: "Serge E. Hallyn" Cc: Souptick Joarder Cc: Marco Elver Cc: Kaiwan N Billimoria Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/init/main.c b/init/main.c index 66a196c5e4c3..ff5803b0841c 100644 --- a/init/main.c +++ b/init/main.c @@ -520,6 +520,29 @@ static inline void initcall_debug_enable(void) } #endif +/* Report memory auto-initialization states for this boot. */ +static void __init report_meminit(void) +{ + const char *stack; + + if (IS_ENABLED(CONFIG_INIT_STACK_ALL)) + stack = "all"; + else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL)) + stack = "byref_all"; + else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF)) + stack = "byref"; + else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER)) + stack = "__user"; + else + stack = "off"; + + pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n", + stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off", + want_init_on_free() ? "on" : "off"); + if (want_init_on_free()) + pr_info("mem auto-init: clearing system memory may take some time...\n"); +} + /* * Set up kernel memory allocators */ @@ -530,6 +553,7 @@ static void __init mm_init(void) * bigger than MAX_ORDER unless SPARSEMEM. 
*/ page_ext_init_flatmem(); + report_meminit(); mem_init(); kmem_cache_init(); pgtable_init(); From af5d440365894b5ca51f29866c1a01496dce52c4 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 11 Jul 2019 20:59:27 -0700 Subject: [PATCH 128/147] mm: vmscan: remove double slab pressure by inc'ing sc->nr_scanned Commit 9092c71bb724 ("mm: use sc->priority for slab shrink targets") broke the relationship between sc->nr_scanned and slab pressure: sc->nr_scanned can no longer double the slab pressure. So it makes no sense to keep incrementing sc->nr_scanned. In fact, it can prevent pressure from being added to slab shrinking, since an excessive sc->nr_scanned prevents sc->priority from being raised. The bonnie test doesn't show that this changes the behavior of the slab shrinkers.
                      w/               w/o
                    /sec   %CP       /sec   %CP
Sequential delete: 3960.6  94.6     3997.6  96.2
Random delete:     2518    63.8     2561.6  64.6
The slight increase of "/sec" without the patch is likely caused by the slight increase in CPU usage. Link: http://lkml.kernel.org/r/1559025859-72759-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Johannes Weiner Cc: Josef Bacik Cc: Michal Hocko Cc: Mel Gorman Cc: "Kirill A . Shutemov" Cc: Hugh Dickins Cc: Shakeel Butt Cc: Hillf Danton Cc: "Huang, Ying" Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 96aafbf8ce4e..277a36de11c1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1137,11 +1137,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!sc->may_unmap && page_mapped(page)) goto keep_locked; - /* Double the slab pressure for mapped and swapcache pages */ - if ((page_mapped(page) || PageSwapCache(page)) && - !(PageAnon(page) && !PageSwapBacked(page))) - sc->nr_scanned++; - may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); From 98879b3b9edc1604f2d1a6686576ef4d08ed3310 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 11 Jul 2019 20:59:30 -0700 Subject: [PATCH 129/147] mm: vmscan: correct some vmscan counters for THP swapout Since commit bd4c82c22c36 ("mm, THP, swap: delay splitting THP after swapped out"), a THP can be swapped out as a whole. But nr_reclaimed and some other vm counters still get incremented by only one even though a whole THP (512 pages) gets swapped out. This doesn't make much sense for memory reclaim. For example, direct reclaim may only need to reclaim SWAP_CLUSTER_MAX pages, so reclaiming one THP could fulfill that. But if nr_reclaimed is not increased correctly, direct reclaim may waste time reclaiming more pages, SWAP_CLUSTER_MAX * 512 pages in the worst case. It may also cause pgsteal_{kswapd|direct} to be greater than pgscan_{kswapd|direct}, like below:
pgsteal_kswapd 122933
pgsteal_direct 26600225
pgscan_kswapd 174153
pgscan_direct 14678312
nr_reclaimed and nr_scanned must be fixed in parallel, otherwise some page reclaim logic would break, e.g. vmpressure: this looks at the scanned/reclaimed ratio, so it won't change semantics as long as scanned & reclaimed are fixed in parallel. compaction/reclaim: compaction wants a certain number of physical pages freed up before going back to compacting. kswapd priority raising: kswapd raises priority if we scan fewer pages than the reclaim target (which itself is obviously expressed in order-0 pages). As a result, kswapd can falsely raise its aggressiveness even when it's making great progress.
Other than nr_scanned and nr_reclaimed, some other counters, e.g. pgactivate, nr_skipped, nr_ref_keep and nr_unmap_fail, need to be fixed too since they are user visible via cgroup, /proc/vmstat or trace points; otherwise they would be underreported. When isolating pages from LRUs, nr_taken is already accounted in base pages, but nr_scanned and nr_skipped are still accounted in THP units. That doesn't make much sense either, since it may cause the trace points to underreport the numbers as well. So account those counters in base pages instead of counting each THP as one page. nr_dirty, nr_unqueued_dirty, nr_congested and nr_writeback are used by the file cache, so they are not impacted by THP swap. This change may result in a lower steal/scan ratio in some cases, since a THP may get split during page reclaim and then only a part of the tail pages gets reclaimed instead of the whole 512 pages, while nr_scanned is still accounted as 512, particularly for direct reclaim. But this should not be a significant issue. Link: http://lkml.kernel.org/r/1559025859-72759-2-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Reviewed-by: "Huang, Ying" Cc: Johannes Weiner Cc: Michal Hocko Cc: Mel Gorman Cc: "Kirill A . Shutemov" Cc: Hugh Dickins Cc: Shakeel Butt Cc: Hillf Danton Cc: Josef Bacik Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 63 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 14 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 277a36de11c1..f8e3dcd527b8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1118,6 +1118,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; + unsigned int nr_pages; cond_resched(); @@ -1129,7 +1130,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, VM_BUG_ON_PAGE(PageActive(page), page); - sc->nr_scanned++; + nr_pages = 1 << compound_order(page); + + /* Account the number of base pages even though THP */ + sc->nr_scanned += nr_pages; if (unlikely(!page_evictable(page))) goto activate_locked; @@ -1250,7 +1254,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, case PAGEREF_ACTIVATE: goto activate_locked; case PAGEREF_KEEP: - stat->nr_ref_keep++; + stat->nr_ref_keep += nr_pages; goto keep_locked; case PAGEREF_RECLAIM: case PAGEREF_RECLAIM_CLEAN: @@ -1282,7 +1286,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (!add_to_swap(page)) { if (!PageTransHuge(page)) - goto activate_locked; + goto activate_locked_split; /* Fallback to swap normal pages */ if (split_huge_page_to_list(page, page_list)) @@ -1291,7 +1295,7 @@ count_vm_event(THP_SWPOUT_FALLBACK); #endif if (!add_to_swap(page)) - goto activate_locked; + goto activate_locked_split; } may_enter_fs = 1; @@ -1305,6 +1309,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; } + /* + * THP may get split above, need minus tail pages and update + * nr_pages to avoid accounting tail pages twice. + * + * The tail pages that are added into swap cache successfully + * reach here. + */ + if ((nr_pages > 1) && !PageTransHuge(page)) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } + /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here.
@@ -1315,7 +1331,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (unlikely(PageTransHuge(page))) flags |= TTU_SPLIT_HUGE_PMD; if (!try_to_unmap(page, flags)) { - stat->nr_unmap_fail++; + stat->nr_unmap_fail += nr_pages; goto activate_locked; } } @@ -1442,7 +1458,11 @@ static unsigned long shrink_page_list(struct list_head *page_list, unlock_page(page); free_it: - nr_reclaimed++; + /* + * THP may get swapped out in a whole, need account + * all base pages. + */ + nr_reclaimed += nr_pages; /* * Is there need to periodically free_page_list? It would @@ -1455,6 +1475,15 @@ free_it: list_add(&page->lru, &free_pages); continue; +activate_locked_split: + /* + * The tail pages that are failed to add into swap cache + * reach here. Fixup nr_scanned and nr_pages. + */ + if (nr_pages > 1) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (PageSwapCache(page) && (mem_cgroup_swap_full(page) || @@ -1464,8 +1493,7 @@ activate_locked: if (!PageMlocked(page)) { int type = page_is_file_cache(page); SetPageActive(page); - pgactivate++; - stat->nr_activate[type] += hpage_nr_pages(page); + stat->nr_activate[type] += nr_pages; count_memcg_page_event(page, PGACTIVATE); } keep_locked: @@ -1475,6 +1503,8 @@ keep: VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } + pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; + mem_cgroup_uncharge_list(&free_pages); try_to_unmap_flush(); free_unref_page_list(&free_pages); @@ -1646,10 +1676,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, LIST_HEAD(pages_skipped); isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED); + total_scan = 0; scan = 0; - for (total_scan = 0; - scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src); - total_scan++) { + while (scan < nr_to_scan && !list_empty(src)) { struct page *page; page = lru_to_page(src); @@ -1657,9 +1686,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, VM_BUG_ON_PAGE(!PageLRU(page), page); + nr_pages = 1 << compound_order(page); + total_scan += nr_pages; + if (page_zonenum(page) > sc->reclaim_idx) { list_move(&page->lru, &pages_skipped); - nr_skipped[page_zonenum(page)]++; + nr_skipped[page_zonenum(page)] += nr_pages; continue; } @@ -1668,11 +1700,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * return with no isolated pages if the LRU mostly contains * ineligible pages. This causes the VM to not reclaim any * pages, triggering a premature OOM. + * + * Account all tail pages of THP. This would not cause + * premature OOM since __isolate_lru_page() returns -EBUSY + * only when the page is being freed somewhere else. */ - scan++; + scan += nr_pages; switch (__isolate_lru_page(page, mode)) { case 0: - nr_pages = hpage_nr_pages(page); nr_taken += nr_pages; nr_zone_taken[page_zonenum(page)] += nr_pages; list_move(&page->lru, dst); From d914999689609d45b36db2b0fabb05cf7fc1fa1f Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Thu, 11 Jul 2019 20:59:34 -0700 Subject: [PATCH 130/147] tools/vm/slabinfo: order command line options During recent discussion on LKML over SLAB vs SLUB it was suggested by Jesper that it would be nice to have a tool to view the current fragmentation of the slab allocators. CC list for this set is taken from that thread. 
For SLUB we have all the information for this already exposed by the kernel and also we have a userspace tool for displaying this info: tools/vm/slabinfo.c Extend slabinfo to improve the fragmentation information by enabling sorting of caches by number of partial slabs. Also add cache list sorted in this manner to the output of `slabinfo -X`. This patch (of 4): get_opt() has a spurious character within the option string. Remove it and reorder the options in alphabetic order so that it is easier to keep the options correct. Use the same ordering for command help output and long option handling code. Link: http://lkml.kernel.org/r/20190426022622.4089-2-tobin@kernel.org Signed-off-by: Tobin C. Harding Cc: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Cc: Qian Cai Cc: Mel Gorman Cc: Alexander Duyck Cc: Michal Hocko Cc: Brendan Gregg , Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/slabinfo.c | 70 ++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 73818f1b2ef8..e9b5437b2f28 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -110,7 +110,7 @@ static void fatal(const char *x, ...) static void usage(void) { printf("slabinfo 4/15/2011. (c) 2007 sgi/(c) 2011 Linux Foundation.\n\n" - "slabinfo [-aADefhilnosrStTvz1LXBU] [N=K] [-dafzput] [slab-regexp]\n" + "slabinfo [-aABDefhilLnorsStTUvXz1] [N=K] [-dafzput] [slab-regexp]\n" "-a|--aliases Show aliases\n" "-A|--activity Most active slabs first\n" "-B|--Bytes Show size in bytes\n" @@ -131,9 +131,9 @@ static void usage(void) "-T|--Totals Show summary information\n" "-U|--Unreclaim Show unreclaimable slabs only\n" "-v|--validate Validate slabs\n" + "-X|--Xtotals Show extended summary information\n" "-z|--zero Include empty slabs\n" "-1|--1ref Single reference\n" - "-X|--Xtotals Show extended summary information\n" "\n" "-d | --debug Switch off all debug options\n" @@ -1334,6 +1334,7 @@ static void xtotals(void) struct option opts[] = { { "aliases", no_argument, NULL, 'a' }, { "activity", no_argument, NULL, 'A' }, + { "Bytes", no_argument, NULL, 'B'}, { "debug", optional_argument, NULL, 'd' }, { "display-activity", no_argument, NULL, 'D' }, { "empty", no_argument, NULL, 'e' }, @@ -1341,21 +1342,20 @@ struct option opts[] = { { "help", no_argument, NULL, 'h' }, { "inverted", no_argument, NULL, 'i'}, { "slabs", no_argument, NULL, 'l' }, + { "Loss", no_argument, NULL, 'L'}, { "numa", no_argument, NULL, 'n' }, + { "lines", required_argument, NULL, 'N'}, { "ops", no_argument, NULL, 'o' }, - { "shrink", no_argument, NULL, 's' }, { "report", no_argument, NULL, 'r' }, + { "shrink", no_argument, NULL, 's' }, { "Size", no_argument, NULL, 'S'}, { "tracking", no_argument, NULL, 't'}, { "Totals", no_argument, NULL, 'T'}, + { "Unreclaim", no_argument, NULL, 'U'}, { "validate", no_argument, NULL, 'v' }, + { "Xtotals", no_argument, NULL, 'X'}, { "zero", no_argument, NULL, 'z' }, { "1ref", no_argument, NULL, '1'}, - { "lines", required_argument, NULL, 'N'}, - { "Loss", no_argument, NULL, 'L'}, - { "Xtotals", no_argument, NULL, 'X'}, - { "Bytes", no_argument, NULL, 'B'}, - { "Unreclaim", no_argument, NULL, 'U'}, { NULL, 0, NULL, 0 } }; @@ -1367,18 +1367,18 @@ int main(int argc, char *argv[]) page_size = getpagesize(); - while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:LXBU", + while ((c = getopt_long(argc, argv, 
"aABd::DefhilLnN:orsStTUvXz1", opts, NULL)) != -1) switch (c) { - case '1': - show_single_ref = 1; - break; case 'a': show_alias = 1; break; case 'A': sort_active = 1; break; + case 'B': + show_bytes = 1; + break; case 'd': set_debug = 1; if (!debug_opt_scan(optarg)) @@ -1399,9 +1399,22 @@ int main(int argc, char *argv[]) case 'i': show_inverted = 1; break; + case 'l': + show_slab = 1; + break; + case 'L': + sort_loss = 1; + break; case 'n': show_numa = 1; break; + case 'N': + if (optarg) { + output_lines = atoi(optarg); + if (output_lines < 1) + output_lines = 1; + } + break; case 'o': show_ops = 1; break; @@ -1411,33 +1424,20 @@ int main(int argc, char *argv[]) case 's': shrink = 1; break; - case 'l': - show_slab = 1; + case 'S': + sort_size = 1; break; case 't': show_track = 1; break; - case 'v': - validate = 1; - break; - case 'z': - skip_zero = 0; - break; case 'T': show_totals = 1; break; - case 'S': - sort_size = 1; + case 'U': + unreclaim_only = 1; break; - case 'N': - if (optarg) { - output_lines = atoi(optarg); - if (output_lines < 1) - output_lines = 1; - } - break; - case 'L': - sort_loss = 1; + case 'v': + validate = 1; break; case 'X': if (output_lines == -1) @@ -1445,11 +1445,11 @@ int main(int argc, char *argv[]) extended_totals = 1; show_bytes = 1; break; - case 'B': - show_bytes = 1; + case 'z': + skip_zero = 0; break; - case 'U': - unreclaim_only = 1; + case '1': + show_single_ref = 1; break; default: fatal("%s: Invalid option '%c'\n", argv[0], optopt); From 1106b205a3fe034beafad24045b7c00a7eb89669 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Thu, 11 Jul 2019 20:59:38 -0700 Subject: [PATCH 131/147] tools/vm/slabinfo: add partial slab listing to -X We would like to see how fragmented the SLUB allocator is, one window into fragmentation is the total number of partial slabs. Currently `slabinfo -X` shows slabs sorted by loss and by size. We can use this option to also show slabs sorted by number of partial slabs. Option '-X' can be used in conjunction with '-N' to control the number of slabs shown e.g. list of top 5 slabs: slabinfo -X -N5 Add list of slabs ordered by number of partial slabs to output of `slabinfo -X`. Link: http://lkml.kernel.org/r/20190426022622.4089-3-tobin@kernel.org Signed-off-by: Tobin C. 
Harding Cc: Alexander Duyck Cc: Brendan Gregg , Cc: Christoph Lameter Cc: David Rientjes Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Qian Cai Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/slabinfo.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index e9b5437b2f28..3f3a2db65794 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -79,6 +79,7 @@ int sort_size; int sort_active; int set_debug; int show_ops; +int sort_partial; int show_activity; int output_lines = -1; int sort_loss; @@ -1047,6 +1048,8 @@ static void sort_slabs(void) result = slab_activity(s1) < slab_activity(s2); else if (sort_loss) result = slab_waste(s1) < slab_waste(s2); + else if (sort_partial) + result = s1->partial < s2->partial; else result = strcasecmp(s1->name, s2->name); @@ -1307,27 +1310,39 @@ static void output_slabs(void) } } +static void _xtotals(char *heading, char *underline, + int loss, int size, int partial) +{ + printf("%s%s", heading, underline); + line = 0; + sort_loss = loss; + sort_size = size; + sort_partial = partial; + sort_slabs(); + output_slabs(); +} + static void xtotals(void) { + char *heading, *underline; + totals(); link_slabs(); rename_slabs(); - printf("\nSlabs sorted by size\n"); - printf("--------------------\n"); - sort_loss = 0; - sort_size = 1; - sort_slabs(); - output_slabs(); + heading = "\nSlabs sorted by size\n"; + underline = "--------------------\n"; + _xtotals(heading, underline, 0, 1, 0); + + heading = "\nSlabs sorted by loss\n"; + underline = "--------------------\n"; + _xtotals(heading, underline, 1, 0, 0); + + heading = "\nSlabs sorted by number of partial slabs\n"; + underline = "---------------------------------------\n"; + _xtotals(heading, underline, 0, 0, 1); - printf("\nSlabs sorted by loss\n"); - printf("--------------------\n"); - line = 0; - sort_loss = 1; - sort_size = 0; - sort_slabs(); - output_slabs(); printf("\n"); } From 53a83f9766e33d6bf4e14d5592c1939777fb98db Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Thu, 11 Jul 2019 20:59:42 -0700 Subject: [PATCH 132/147] tools/vm/slabinfo: add option to sort by partial slabs We would like to get a better view of the level of fragmentation within the SLUB allocator. Total number of partial slabs is an indicator of fragmentation. Add a command line option (-P | --partial) to sort the slab list by total number of partial slabs. Link: http://lkml.kernel.org/r/20190426022622.4089-4-tobin@kernel.org Signed-off-by: Tobin C. Harding Cc: Alexander Duyck Cc: Brendan Gregg , Cc: Christoph Lameter Cc: David Rientjes Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Qian Cai Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/slabinfo.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 3f3a2db65794..469ff6157986 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -111,7 +111,7 @@ static void fatal(const char *x, ...) static void usage(void) { printf("slabinfo 4/15/2011. 
(c) 2007 sgi/(c) 2011 Linux Foundation.\n\n" - "slabinfo [-aABDefhilLnorsStTUvXz1] [N=K] [-dafzput] [slab-regexp]\n" + "slabinfo [-aABDefhilLnoPrsStTUvXz1] [N=K] [-dafzput] [slab-regexp]\n" "-a|--aliases Show aliases\n" "-A|--activity Most active slabs first\n" "-B|--Bytes Show size in bytes\n" @@ -125,6 +125,7 @@ static void usage(void) "-n|--numa Show NUMA information\n" "-N|--lines=K Show the first K slabs\n" "-o|--ops Show kmem_cache_ops\n" + "-P|--partial Sort by number of partial slabs\n" "-r|--report Detailed report on single slabs\n" "-s|--shrink Shrink slabs\n" "-S|--Size Sort by size\n" @@ -1361,6 +1362,7 @@ struct option opts[] = { { "numa", no_argument, NULL, 'n' }, { "lines", required_argument, NULL, 'N'}, { "ops", no_argument, NULL, 'o' }, + { "partial", no_argument, NULL, 'p'}, { "report", no_argument, NULL, 'r' }, { "shrink", no_argument, NULL, 's' }, { "Size", no_argument, NULL, 'S'}, @@ -1382,7 +1384,7 @@ int main(int argc, char *argv[]) page_size = getpagesize(); - while ((c = getopt_long(argc, argv, "aABd::DefhilLnN:orsStTUvXz1", + while ((c = getopt_long(argc, argv, "aABd::DefhilLnN:oPrsStTUvXz1", opts, NULL)) != -1) switch (c) { case 'a': @@ -1436,6 +1438,9 @@ int main(int argc, char *argv[]) case 'r': show_report = 1; break; + case 'P': + sort_partial = 1; + break; case 's': shrink = 1; break; From cbf800d9c7fb38d953ba8ae1bd27a6382114c32e Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Thu, 11 Jul 2019 20:59:46 -0700 Subject: [PATCH 133/147] tools/vm/slabinfo: add sorting info to help menu Passing more than one sorting option has undefined behaviour. Add an explicit statement as such to the help menu, this also has the advantage of highlighting all the sorting options. Link: http://lkml.kernel.org/r/20190426022622.4089-5-tobin@kernel.org Signed-off-by: Tobin C. Harding Cc: Alexander Duyck Cc: Brendan Gregg , Cc: Christoph Lameter Cc: David Rientjes Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Qian Cai Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/slabinfo.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 469ff6157986..68092d15e12b 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -148,6 +148,8 @@ static void usage(void) " p | P Poisoning\n" " u | U Tracking\n" " t | T Tracing\n" + + "\nSorting options (--Loss, --Size, --Partial) are mutually exclusive\n" ); } From 8a713e7df3352b8d9392476e9cf29e4e185dac32 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 11 Jul 2019 20:59:50 -0700 Subject: [PATCH 134/147] proc: use down_read_killable mmap_sem for /proc/pid/maps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not remain stuck forever if something goes wrong. Using a killable lock permits cleanup of stuck tasks and simplifies investigation. This function is also used for /proc/pid/smaps. 
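The conversion in this and the following /proc patches follows one pattern; a minimal sketch, assuming a hypothetical kernel-context helper that walks a task's VMAs (not code taken from any of these patches):

#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/errno.h>

/* Back off with -EINTR instead of sleeping uninterruptibly on mmap_sem. */
static int walk_task_vmas_killable(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	if (down_read_killable(&mm->mmap_sem))
		return -EINTR;

	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		/* ... report on this VMA, as m_start()/show_map() would ... */
	}

	up_read(&mm->mmap_sem);
	return 0;
}

The callers then propagate the -EINTR (or an ERR_PTR wrapping it) back to the read, which is what the hunks below do.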
Link: http://lkml.kernel.org/r/156007493160.3335.14447544314127417266.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Roman Gushchin Reviewed-by: Cyrill Gorcunov Reviewed-by: Kirill Tkhai Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Al Viro Cc: Matthew Wilcox Cc: Michal Koutný Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 6 +++++- fs/proc/task_nommu.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 01d4eb0e6bd1..2bf210229daf 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -166,7 +166,11 @@ static void *m_start(struct seq_file *m, loff_t *ppos) if (!mm || !mmget_not_zero(mm)) return NULL; - down_read(&mm->mmap_sem); + if (down_read_killable(&mm->mmap_sem)) { + mmput(mm); + return ERR_PTR(-EINTR); + } + hold_task_mempolicy(priv); priv->tail_vma = get_gate_vma(mm); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 36bf0f2e102e..7907e6419e57 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -211,7 +211,11 @@ static void *m_start(struct seq_file *m, loff_t *pos) if (!mm || !mmget_not_zero(mm)) return NULL; - down_read(&mm->mmap_sem); + if (down_read_killable(&mm->mmap_sem)) { + mmput(mm); + return ERR_PTR(-EINTR); + } + /* start from the Nth VMA */ for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) if (n-- == 0) From a26a97815548574213fd37f29b4b78ccc6d9ed20 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 11 Jul 2019 20:59:53 -0700 Subject: [PATCH 135/147] proc: use down_read_killable mmap_sem for /proc/pid/smaps_rollup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not remain stuck forever if something goes wrong. Using a killable lock permits cleanup of stuck tasks and simplifies investigation. Link: http://lkml.kernel.org/r/156007493429.3335.14666825072272692455.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Roman Gushchin Reviewed-by: Cyrill Gorcunov Reviewed-by: Kirill Tkhai Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Al Viro Cc: Matthew Wilcox Cc: Michal Koutný Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2bf210229daf..781879a91e3b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -832,7 +832,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v) memset(&mss, 0, sizeof(mss)); - down_read(&mm->mmap_sem); + ret = down_read_killable(&mm->mmap_sem); + if (ret) + goto out_put_mm; + hold_task_mempolicy(priv); for (vma = priv->mm->mmap; vma; vma = vma->vm_next) { @@ -849,8 +852,9 @@ static int show_smaps_rollup(struct seq_file *m, void *v) release_task_mempolicy(priv); up_read(&mm->mmap_sem); - mmput(mm); +out_put_mm: + mmput(mm); out_put_task: put_task_struct(priv->task); priv->task = NULL; From ad80b932c57d85fd6377f97f359b025baf179a87 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 11 Jul 2019 20:59:56 -0700 Subject: [PATCH 136/147] proc: use down_read_killable mmap_sem for /proc/pid/pagemap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not remain stuck forever if something goes wrong. Using a killable lock permits cleanup of stuck tasks and simplifies investigation. 
Link: http://lkml.kernel.org/r/156007493638.3335.4872164955523928492.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Roman Gushchin Reviewed-by: Cyrill Gorcunov Reviewed-by: Kirill Tkhai Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Al Viro Cc: Matthew Wilcox Cc: Michal Koutný Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 781879a91e3b..78bed6adc62d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1547,7 +1547,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* overflow ? */ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; - down_read(&mm->mmap_sem); + ret = down_read_killable(&mm->mmap_sem); + if (ret) + goto out_free; ret = walk_page_range(start_vaddr, end, &pagemap_walk); up_read(&mm->mmap_sem); start_vaddr = end; From c46038017fbdcac627b670c9d4176f1d0c2f5fa3 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 11 Jul 2019 21:00:00 -0700 Subject: [PATCH 137/147] proc: use down_read_killable mmap_sem for /proc/pid/clear_refs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not remain stuck forever if something goes wrong. Using a killable lock permits cleanup of stuck tasks and simplifies investigation. Replace the only unkillable mmap_sem lock in clear_refs_write(). Link: http://lkml.kernel.org/r/156007493826.3335.5424884725467456239.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Roman Gushchin Reviewed-by: Cyrill Gorcunov Reviewed-by: Kirill Tkhai Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Al Viro Cc: Matthew Wilcox Cc: Michal Koutný Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 78bed6adc62d..7f84d1477b5b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1140,7 +1140,10 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, goto out_mm; } - down_read(&mm->mmap_sem); + if (down_read_killable(&mm->mmap_sem)) { + count = -EINTR; + goto out_mm; + } tlb_gather_mmu(&tlb, mm, 0, -1); if (type == CLEAR_REFS_SOFT_DIRTY) { for (vma = mm->mmap; vma; vma = vma->vm_next) { From cd9e2bb8271c971d9f37c722be2616c7f8ba0664 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 11 Jul 2019 21:00:03 -0700 Subject: [PATCH 138/147] proc: use down_read_killable mmap_sem for /proc/pid/map_files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not remain stuck forever if something goes wrong. Using a killable lock permits cleanup of stuck tasks and simplifies investigation. It seems ->d_revalidate() could return any error (except ECHILD) to abort validation and pass error as result of lookup sequence. 
[akpm@linux-foundation.org: fix proc_map_files_lookup() return value, per Andrei] Link: http://lkml.kernel.org/r/156007493995.3335.9595044802115356911.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Roman Gushchin Reviewed-by: Cyrill Gorcunov Reviewed-by: Kirill Tkhai Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Al Viro Cc: Matthew Wilcox Cc: Michal Koutný Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index c40fca98f2b7..534fb1ae498a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1962,9 +1962,12 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) goto out; if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { - down_read(&mm->mmap_sem); - exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); - up_read(&mm->mmap_sem); + status = down_read_killable(&mm->mmap_sem); + if (!status) { + exact_vma_exists = !!find_exact_vma(mm, vm_start, + vm_end); + up_read(&mm->mmap_sem); + } } mmput(mm); @@ -2010,8 +2013,11 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) if (rc) goto out_mmput; + rc = down_read_killable(&mm->mmap_sem); + if (rc) + goto out_mmput; + rc = -ENOENT; - down_read(&mm->mmap_sem); vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { *path = vma->vm_file->f_path; @@ -2107,7 +2113,11 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, if (!mm) goto out_put_task; - down_read(&mm->mmap_sem); + result = ERR_PTR(-EINTR); + if (down_read_killable(&mm->mmap_sem)) + goto out_put_mm; + + result = ERR_PTR(-ENOENT); vma = find_exact_vma(mm, vm_start, vm_end); if (!vma) goto out_no_vma; @@ -2118,6 +2128,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, out_no_vma: up_read(&mm->mmap_sem); +out_put_mm: mmput(mm); out_put_task: put_task_struct(task); @@ -2160,7 +2171,12 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) mm = get_task_mm(task); if (!mm) goto out_put_task; - down_read(&mm->mmap_sem); + + ret = down_read_killable(&mm->mmap_sem); + if (ret) { + mmput(mm); + goto out_put_task; + } nr_files = 0; From 1e426fe28261b03f297992e89da3320b42816f4e Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 11 Jul 2019 21:00:07 -0700 Subject: [PATCH 139/147] mm: use down_read_killable for locking mmap_sem in access_remote_vm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function is used by ptrace and proc files like /proc/pid/cmdline and /proc/pid/environ. Access_remote_vm never returns error codes, all errors are ignored and only size of successfully read data is returned. So, if current task was killed we'll simply return 0 (bytes read). Mmap_sem could be locked for a long time or forever if something goes wrong. Using a killable lock permits cleanup of stuck tasks and simplifies investigation. 
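As a user-visible illustration (not part of the patch): the procfs files backed by access_remote_vm() report only the bytes that were successfully copied, so a short or zero-length read is a normal outcome rather than an error. A minimal reader, with the target PID as an assumed command-line argument:

#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	char path[64], buf[4096];
	ssize_t n;
	int fd;

	/* Default to our own cmdline if no PID is given. */
	snprintf(path, sizeof(path), "/proc/%s/cmdline",
		 argc > 1 ? argv[1] : "self");

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	n = read(fd, buf, sizeof(buf));
	printf("read %zd bytes from %s\n", n, path);

	close(fd);
	return 0;
}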
Link: http://lkml.kernel.org/r/156007494202.3335.16782303099589302087.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Michal Koutný Acked-by: Oleg Nesterov Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Matthew Wilcox Cc: Cyrill Gorcunov Cc: Kirill Tkhai Cc: Al Viro Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 +++- mm/nommu.c | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ad4bf1a1a0ef..53bd59579861 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4344,7 +4344,9 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, void *old_buf = buf; int write = gup_flags & FOLL_WRITE; - down_read(&mm->mmap_sem); + if (down_read_killable(&mm->mmap_sem)) + return 0; + /* ignore errors, just check how much was successfully transferred */ while (len) { int bytes, ret, offset; diff --git a/mm/nommu.c b/mm/nommu.c index 07165ad2e548..eb3e2e558da1 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1704,7 +1704,8 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct *vma; int write = gup_flags & FOLL_WRITE; - down_read(&mm->mmap_sem); + if (down_read_killable(&mm->mmap_sem)) + return 0; /* the access must start within one of the target process's mappings */ vma = find_vma(mm, addr); From ee2ad71b0756e995fa4f6d922463e9bccd71b198 Mon Sep 17 00:00:00 2001 From: Luigi Semenzato Date: Thu, 11 Jul 2019 21:00:10 -0700 Subject: [PATCH 140/147] mm: smaps: split PSS into components Report separate components (anon, file, and shmem) for PSS in smaps_rollup. This helps understand and tune the memory manager behavior in consumer devices, particularly mobile devices. Many of them (e.g. chromebooks and Android-based devices) use zram for anon memory, and perform disk reads for discarded file pages. The difference in latency is large (e.g. reading a single page from SSD is 30 times slower than decompressing a zram page on one popular device), thus it is useful to know how much of the PSS is anon vs. file. All the information is already present in /proc/pid/smaps, but much more expensive to obtain because of the large size of that procfs entry. This patch also removes a small code duplication in smaps_account, which would have gotten worse otherwise. Also updated Documentation/filesystems/proc.txt (the smaps section was a bit stale, and I added a smaps_rollup section) and Documentation/ABI/testing/procfs-smaps_rollup. [semenzato@chromium.org: v5] Link: http://lkml.kernel.org/r/20190626234333.44608-1-semenzato@chromium.org Link: http://lkml.kernel.org/r/20190626180429.174569-1-semenzato@chromium.org Signed-off-by: Luigi Semenzato Acked-by: Yu Zhao Cc: Sonny Rao Cc: Yu Zhao Cc: Brian Geffon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/procfs-smaps_rollup | 14 ++- Documentation/filesystems/proc.txt | 41 +++++++-- fs/proc/task_mmu.c | 92 +++++++++++++------ 3 files changed, 105 insertions(+), 42 deletions(-) diff --git a/Documentation/ABI/testing/procfs-smaps_rollup b/Documentation/ABI/testing/procfs-smaps_rollup index 0a54ed0d63c9..274df44d8b1b 100644 --- a/Documentation/ABI/testing/procfs-smaps_rollup +++ b/Documentation/ABI/testing/procfs-smaps_rollup @@ -3,18 +3,28 @@ Date: August 2017 Contact: Daniel Colascione Description: This file provides pre-summed memory information for a - process. The format is identical to /proc/pid/smaps, + process. 
The format is almost identical to /proc/pid/smaps, except instead of an entry for each VMA in a process, smaps_rollup has a single entry (tagged "[rollup]") for which each field is the sum of the corresponding fields from all the maps in /proc/pid/smaps. - For more details, see the procfs man page. + Additionally, the fields Pss_Anon, Pss_File and Pss_Shmem + are not present in /proc/pid/smaps. These fields represent + the sum of the Pss field of each type (anon, file, shmem). + For more details, see Documentation/filesystems/proc.txt + and the procfs man page. Typical output looks like this: 00100000-ff709000 ---p 00000000 00:00 0 [rollup] + Size: 1192 kB + KernelPageSize: 4 kB + MMUPageSize: 4 kB Rss: 884 kB Pss: 385 kB + Pss_Anon: 301 kB + Pss_File: 80 kB + Pss_Shmem: 4 kB Shared_Clean: 696 kB Shared_Dirty: 0 kB Private_Clean: 120 kB diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a226061fa109..d750b6926899 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -154,9 +154,11 @@ Table 1-1: Process specific entries in /proc symbol the task is blocked in - or "0" if not blocked. pagemap Page table stack Report full stack trace, enable via CONFIG_STACKTRACE - smaps an extension based on maps, showing the memory consumption of + smaps An extension based on maps, showing the memory consumption of each mapping and flags associated with it - numa_maps an extension based on maps, showing the memory locality and + smaps_rollup Accumulated smaps stats for all mappings of the process. This + can be derived from smaps, but is faster and more convenient + numa_maps An extension based on maps, showing the memory locality and binding policy as well as mem usage (in pages) of each mapping. .............................................................................. @@ -366,7 +368,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7) exit_code the thread's exit_code in the form reported by the waitpid system call .............................................................................. -The /proc/PID/maps file containing the currently mapped memory regions and +The /proc/PID/maps file contains the currently mapped memory regions and their access permissions. The format is: @@ -417,11 +419,14 @@ is not associated with a file: or if empty, the mapping is anonymous. The /proc/PID/smaps is an extension based on maps, showing the memory -consumption for each of the process's mappings. For each of mappings there -is a series of lines such as the following: +consumption for each of the process's mappings. For each mapping (aka Virtual +Memory Area, or VMA) there is a series of lines such as the following: 08048000-080bc000 r-xp 00000000 03:02 13130 /bin/bash + Size: 1084 kB +KernelPageSize: 4 kB +MMUPageSize: 4 kB Rss: 892 kB Pss: 374 kB Shared_Clean: 892 kB @@ -443,11 +448,14 @@ Locked: 0 kB THPeligible: 0 VmFlags: rd ex mr mw me dw -the first of these lines shows the same information as is displayed for the -mapping in /proc/PID/maps. The remaining lines show the size of the mapping -(size), the amount of the mapping that is currently resident in RAM (RSS), the -process' proportional share of this mapping (PSS), the number of clean and -dirty private pages in the mapping. +The first of these lines shows the same information as is displayed for the +mapping in /proc/PID/maps. 
Following lines show the size of the mapping +(size); the size of each page allocated when backing a VMA (KernelPageSize), +which is usually the same as the size in the page table entries; the page size +used by the MMU when backing a VMA (in most cases, the same as KernelPageSize); +the amount of the mapping that is currently resident in RAM (RSS); the +process' proportional share of this mapping (PSS); and the number of clean and +dirty shared and private pages in the mapping. The "proportional set size" (PSS) of a process is the count of pages it has in memory, where each page is divided by the number of processes sharing it. @@ -532,6 +540,19 @@ guarantees: 2) If there is something at a given vaddr during the entirety of the life of the smaps/maps walk, there will be some output for it. +The /proc/PID/smaps_rollup file includes the same fields as /proc/PID/smaps, +but their values are the sums of the corresponding values for all mappings of +the process. Additionally, it contains these fields: + +Pss_Anon +Pss_File +Pss_Shmem + +They represent the proportional shares of anonymous, file, and shmem pages, as +described for smaps above. These fields are omitted in smaps since each +mapping identifies the type (anon, file, or shmem) of all pages it contains. +Thus all information in smaps_rollup can be derived from smaps, but at a +significantly higher cost. The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG bits on both physical and virtual pages associated with a process, and the diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7f84d1477b5b..dedca3da428a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -421,17 +421,53 @@ struct mem_size_stats { unsigned long shared_hugetlb; unsigned long private_hugetlb; u64 pss; + u64 pss_anon; + u64 pss_file; + u64 pss_shmem; u64 pss_locked; u64 swap_pss; bool check_shmem_swap; }; +static void smaps_page_accumulate(struct mem_size_stats *mss, + struct page *page, unsigned long size, unsigned long pss, + bool dirty, bool locked, bool private) +{ + mss->pss += pss; + + if (PageAnon(page)) + mss->pss_anon += pss; + else if (PageSwapBacked(page)) + mss->pss_shmem += pss; + else + mss->pss_file += pss; + + if (locked) + mss->pss_locked += pss; + + if (dirty || PageDirty(page)) { + if (private) + mss->private_dirty += size; + else + mss->shared_dirty += size; + } else { + if (private) + mss->private_clean += size; + else + mss->shared_clean += size; + } +} + static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty, bool locked) { int i, nr = compound ? 1 << compound_order(page) : 1; unsigned long size = nr * PAGE_SIZE; + /* + * First accumulate quantities that depend only on |size| and the type + * of the compound page. + */ if (PageAnon(page)) { mss->anonymous += size; if (!PageSwapBacked(page) && !dirty && !PageDirty(page)) @@ -444,42 +480,25 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, mss->referenced += size; /* + * Then accumulate quantities that may depend on sharing, or that may + * differ page-by-page. + * * page_count(page) == 1 guarantees the page is mapped exactly once. * If any subpage of the compound page mapped with PTE it would elevate * page_count(). 
*/ if (page_count(page) == 1) { - if (dirty || PageDirty(page)) - mss->private_dirty += size; - else - mss->private_clean += size; - mss->pss += (u64)size << PSS_SHIFT; - if (locked) - mss->pss_locked += (u64)size << PSS_SHIFT; + smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty, + locked, true); return; } - for (i = 0; i < nr; i++, page++) { int mapcount = page_mapcount(page); - unsigned long pss = (PAGE_SIZE << PSS_SHIFT); - - if (mapcount >= 2) { - if (dirty || PageDirty(page)) - mss->shared_dirty += PAGE_SIZE; - else - mss->shared_clean += PAGE_SIZE; - mss->pss += pss / mapcount; - if (locked) - mss->pss_locked += pss / mapcount; - } else { - if (dirty || PageDirty(page)) - mss->private_dirty += PAGE_SIZE; - else - mss->private_clean += PAGE_SIZE; - mss->pss += pss; - if (locked) - mss->pss_locked += pss; - } + unsigned long pss = PAGE_SIZE << PSS_SHIFT; + if (mapcount >= 2) + pss /= mapcount; + smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked, + mapcount < 2); } } @@ -758,10 +777,23 @@ static void smap_gather_stats(struct vm_area_struct *vma, seq_put_decimal_ull_width(m, str, (val) >> 10, 8) /* Show the contents common for smaps and smaps_rollup */ -static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss) +static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, + bool rollup_mode) { SEQ_PUT_DEC("Rss: ", mss->resident); SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); + if (rollup_mode) { + /* + * These are meaningful only for smaps_rollup, otherwise two of + * them are zero, and the other one is the same as Pss. + */ + SEQ_PUT_DEC(" kB\nPss_Anon: ", + mss->pss_anon >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nPss_File: ", + mss->pss_file >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nPss_Shmem: ", + mss->pss_shmem >> PSS_SHIFT); + } SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); @@ -798,7 +830,7 @@ static int show_smap(struct seq_file *m, void *v) SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); seq_puts(m, " kB\n"); - __show_smap(m, &mss); + __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %d\n", transparent_hugepage_enabled(vma)); @@ -848,7 +880,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) seq_pad(m, ' '); seq_puts(m, "[rollup]\n"); - __show_smap(m, &mss); + __show_smap(m, &mss, true); release_task_mempolicy(priv); up_read(&mm->mmap_sem); From 97105f0ab7b877a8ece2005e214894e93793950c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 21:00:13 -0700 Subject: [PATCH 141/147] mm: vmalloc: show number of vmalloc pages in /proc/meminfo Vmalloc() is getting more and more used these days (kernel stacks, bpf and percpu allocator are new top users), and the total % of memory consumed by vmalloc() can be pretty significant and changes dynamically. /proc/meminfo is the best place to display this information: its top goal is to show top consumers of the memory. Since the VmallocUsed field in /proc/meminfo is not in use for quite a long time (it has been defined to 0 by a5ad88ce8c7f ("mm: get rid of 'vmalloc_info' from /proc/meminfo")), let's reuse it for showing the actual physical memory consumption of vmalloc(). 
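A minimal userspace check of the repurposed field (illustration only, not part of the patch):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}

	/* VmallocUsed now reports the physical memory backing vmalloc(), in kB. */
	while (fgets(line, sizeof(line), f))
		if (strncmp(line, "VmallocUsed:", 12) == 0)
			fputs(line, stdout);

	fclose(f);
	return 0;
}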
Link: http://lkml.kernel.org/r/20190417194002.12369-3-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Reviewed-by: Andrew Morton Cc: Matthew Wilcox Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- include/linux/vmalloc.h | 2 ++ mm/vmalloc.c | 10 ++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 568d90e17c17..465ea0153b2a 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Committed_AS: ", committed); seq_printf(m, "VmallocTotal: %8lu kB\n", (unsigned long)VMALLOC_TOTAL >> 10); - show_val_kb(m, "VmallocUsed: ", 0ul); + show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages()); show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 51e131245379..9b21d0047710 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -72,10 +72,12 @@ extern void vm_unmap_aliases(void); #ifdef CONFIG_MMU extern void __init vmalloc_init(void); +extern unsigned long vmalloc_nr_pages(void); #else static inline void vmalloc_init(void) { } +static inline unsigned long vmalloc_nr_pages(void) { return 0; } #endif extern void *vmalloc(unsigned long size); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index edb212298c8a..4fa8d84599b0 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -406,6 +406,13 @@ static void purge_vmap_area_lazy(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); static unsigned long lazy_max_pages(void); +static atomic_long_t nr_vmalloc_pages; + +unsigned long vmalloc_nr_pages(void) +{ + return atomic_long_read(&nr_vmalloc_pages); +} + static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; @@ -2237,6 +2244,7 @@ static void __vunmap(const void *addr, int deallocate_pages) BUG_ON(!page); __free_pages(page, 0); } + atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); kvfree(area->pages); } @@ -2414,12 +2422,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ area->nr_pages = i; + atomic_long_add(area->nr_pages, &nr_vmalloc_pages); goto fail; } area->pages[i] = page; if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) cond_resched(); } + atomic_long_add(area->nr_pages, &nr_vmalloc_pages); if (map_vm_area(area, prot, pages)) goto fail; From 135e53514ef2cb200b616bf3fa4272cfa6c39291 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Thu, 11 Jul 2019 21:00:17 -0700 Subject: [PATCH 142/147] mm/memory-failure.c: clarify error message Some users install a SIGBUS handler that does a longjmp out, thereby keeping the process alive, and are confused by the error message "[188988.765862] Memory failure: 0x1840200: Killing cellsrv:33395 due to hardware memory corruption" Slightly modify the error message to improve clarity.
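For context, a rough sketch of the kind of recovery handler the message is aimed at (illustration only, not from the patch); after siglongjmp() the process is still alive, so "Killing" reads as misleading where "Sending SIGBUS" does not:

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static sigjmp_buf recover;

static void sigbus_handler(int sig)
{
	/* Jump out of the faulting access instead of letting the process die. */
	siglongjmp(recover, 1);
}

int main(void)
{
	struct sigaction sa = { .sa_handler = sigbus_handler };

	sigemptyset(&sa.sa_mask);
	sigaction(SIGBUS, &sa, NULL);

	if (sigsetjmp(recover, 1) == 0) {
		/* ... touch memory that the kernel may have marked poisoned ... */
	} else {
		fprintf(stderr, "recovered from SIGBUS, still running\n");
	}
	return 0;
}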
Link: http://lkml.kernel.org/r/1558403523-22079-1-git-send-email-jane.chu@oracle.com Signed-off-by: Jane Chu Acked-by: Naoya Horiguchi Acked-by: Pankaj Gupta Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f045514d8d20..7e08cbf3ba49 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -213,7 +213,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) short addr_lsb = tk->size_shift; int ret; - pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n", + pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", pfn, t->comm, t->pid); if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { From f168a9a54ec39b3f832c353733898b713b6b5c1f Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 11 Jul 2019 21:00:20 -0700 Subject: [PATCH 143/147] mm: memcontrol: use CSS_TASK_ITER_PROCS at mem_cgroup_scan_tasks() Since commit c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations") corrected how CSS_TASK_ITER_PROCS works, mem_cgroup_scan_tasks() can use CSS_TASK_ITER_PROCS in order to check only one thread from each thread group. [penguin-kernel@I-love.SAKURA.ne.jp: remove thread group leader check in oom_evaluate_task()] Link: http://lkml.kernel.org/r/1560853257-14934-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Link: http://lkml.kernel.org/r/c763afc8-f0ae-756a-56a7-395f625b95fc@i-love.sakura.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- mm/oom_kill.c | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2cb7e4e5c51a..773ae5674e12 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1167,7 +1167,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&iter->css, 0, &it); + css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); while (!ret && (task = css_task_iter_next(&it))) ret = fn(task, arg); css_task_iter_end(&it); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f719b64741d6..606e5e4c6a3e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -346,9 +346,6 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) if (!points || points < oc->chosen_points) goto next; - /* Prefer thread group leaders for display purposes */ - if (points == oc->chosen_points && thread_group_leader(oc->chosen)) - goto next; select: if (oc->chosen) put_task_struct(oc->chosen); From 5eee7e1cdb97123bb55ac14ccd3af8b6edc31537 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 21:00:23 -0700 Subject: [PATCH 144/147] mm, oom: refactor dump_tasks for memcg OOMs dump_tasks() traverses all the existing processes even for the memcg OOM context which is not only unnecessary but also wasteful. This imposes a long RCU critical section even from a contained context which can be quite disruptive. Change dump_tasks() to be aligned with select_bad_process and use mem_cgroup_scan_tasks to selectively traverse only processes of the target memcg hierarchy during memcg OOM. 
Link: http://lkml.kernel.org/r/20190617231207.160865-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Tetsuo Handa Cc: Vladimir Davydov Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Paul Jackson Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 68 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 606e5e4c6a3e..59326614508a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -382,10 +382,38 @@ static void select_bad_process(struct oom_control *oc) oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages; } +static int dump_task(struct task_struct *p, void *arg) +{ + struct oom_control *oc = arg; + struct task_struct *task; + + if (oom_unkillable_task(p, NULL, oc->nodemask)) + return 0; + + task = find_lock_task_mm(p); + if (!task) { + /* + * This is a kthread or all of p's threads have already + * detached their mm's. There's no need to report + * them; they can't be oom killed anyway. + */ + return 0; + } + + pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", + task->pid, from_kuid(&init_user_ns, task_uid(task)), + task->tgid, task->mm->total_vm, get_mm_rss(task->mm), + mm_pgtables_bytes(task->mm), + get_mm_counter(task->mm, MM_SWAPENTS), + task->signal->oom_score_adj, task->comm); + task_unlock(task); + + return 0; +} + /** * dump_tasks - dump current memory state of all system tasks - * @memcg: current's memory controller, if constrained - * @nodemask: nodemask passed to page allocator for mempolicy ooms + * @oc: pointer to struct oom_control * * Dumps the current memory state of all eligible tasks. Tasks not in the same * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes @@ -393,37 +421,21 @@ static void select_bad_process(struct oom_control *oc) * State information includes task's pid, uid, tgid, vm size, rss, * pgtables_bytes, swapents, oom_score_adj value, and name. */ -static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) +static void dump_tasks(struct oom_control *oc) { - struct task_struct *p; - struct task_struct *task; - pr_info("Tasks state (memory values in pages):\n"); pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); - rcu_read_lock(); - for_each_process(p) { - if (oom_unkillable_task(p, memcg, nodemask)) - continue; - task = find_lock_task_mm(p); - if (!task) { - /* - * This is a kthread or all of p's threads have already - * detached their mm's. There's no need to report - * them; they can't be oom killed anyway. 
- */ - continue; - } + if (is_memcg_oom(oc)) + mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); + else { + struct task_struct *p; - pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", - task->pid, from_kuid(&init_user_ns, task_uid(task)), - task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - mm_pgtables_bytes(task->mm), - get_mm_counter(task->mm, MM_SWAPENTS), - task->signal->oom_score_adj, task->comm); - task_unlock(task); + rcu_read_lock(); + for_each_process(p) + dump_task(p, oc); + rcu_read_unlock(); } - rcu_read_unlock(); } static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) @@ -455,7 +467,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) dump_unreclaimable_slab(); } if (sysctl_oom_dump_tasks) - dump_tasks(oc->memcg, oc->nodemask); + dump_tasks(oc); if (p) dump_oom_summary(oc, p); } From 6ba749ee78ef42ffdf4b95c042fc574a37d229d9 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 21:00:26 -0700 Subject: [PATCH 145/147] mm, oom: remove redundant task_in_mem_cgroup() check oom_unkillable_task() can be called from three different contexts i.e. global OOM, memcg OOM and oom_score procfs interface. At the moment oom_unkillable_task() does a task_in_mem_cgroup() check on the given process. Since there is no reason to perform task_in_mem_cgroup() check for global OOM and oom_score procfs interface, those contexts provide NULL memcg and skips the task_in_mem_cgroup() check. However for memcg OOM context, the oom_unkillable_task() is always called from mem_cgroup_scan_tasks() and thus task_in_mem_cgroup() check becomes redundant and effectively dead code. So, just remove the task_in_mem_cgroup() check altogether. Link: http://lkml.kernel.org/r/20190624212631.87212-2-shakeelb@google.com Signed-off-by: Shakeel Butt Signed-off-by: Tetsuo Handa Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: David Rientjes Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Paul Jackson Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- include/linux/memcontrol.h | 7 ------- include/linux/oom.h | 2 +- mm/memcontrol.c | 26 -------------------------- mm/oom_kill.c | 19 +++++++------------ 5 files changed, 9 insertions(+), 47 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 534fb1ae498a..64dadd469786 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -532,7 +532,7 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; - points = oom_badness(task, NULL, NULL, totalpages) * + points = oom_badness(task, NULL, totalpages) * 1000 / totalpages; seq_printf(m, "%lu\n", points); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 68402842c337..44c41462be33 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -394,7 +394,6 @@ out: struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); -bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); @@ -875,12 +874,6 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return true; } -static inline bool task_in_mem_cgroup(struct task_struct *task, - const struct mem_cgroup *memcg) -{ - return true; -} - static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { return NULL; diff 
--git a/include/linux/oom.h b/include/linux/oom.h index d07992009265..b75104690311 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -108,7 +108,7 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) bool __oom_reap_task_mm(struct mm_struct *mm); extern unsigned long oom_badness(struct task_struct *p, - struct mem_cgroup *memcg, const nodemask_t *nodemask, + const nodemask_t *nodemask, unsigned long totalpages); extern bool out_of_memory(struct oom_control *oc); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 773ae5674e12..4f05735b02d3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1259,32 +1259,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, *lru_size += nr_pages; } -bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) -{ - struct mem_cgroup *task_memcg; - struct task_struct *p; - bool ret; - - p = find_lock_task_mm(task); - if (p) { - task_memcg = get_mem_cgroup_from_mm(p->mm); - task_unlock(p); - } else { - /* - * All threads may have already detached their mm's, but the oom - * killer still needs to detect if they have already been oom - * killed to prevent needlessly killing additional tasks. - */ - rcu_read_lock(); - task_memcg = mem_cgroup_from_task(task); - css_get(&task_memcg->css); - rcu_read_unlock(); - } - ret = mem_cgroup_is_descendant(task_memcg, memcg); - css_put(&task_memcg->css); - return ret; -} - /** * mem_cgroup_margin - calculate chargeable space of a memory cgroup * @memcg: the memory cgroup diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 59326614508a..b353f468a36a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -153,17 +153,13 @@ static inline bool is_memcg_oom(struct oom_control *oc) /* return true if the task is not adequate as candidate victim task. */ static bool oom_unkillable_task(struct task_struct *p, - struct mem_cgroup *memcg, const nodemask_t *nodemask) + const nodemask_t *nodemask) { if (is_global_init(p)) return true; if (p->flags & PF_KTHREAD) return true; - /* When mem_cgroup_out_of_memory() and p is not member of the group */ - if (memcg && !task_in_mem_cgroup(p, memcg)) - return true; - /* p may not have freeable memory in nodemask */ if (!has_intersects_mems_allowed(p, nodemask)) return true; @@ -194,20 +190,19 @@ static bool is_dump_unreclaim_slabs(void) * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate * @totalpages: total present RAM allowed for page allocation - * @memcg: task's memory controller, if constrained * @nodemask: nodemask passed to page allocator for mempolicy ooms * * The heuristic for determining which task to kill is made to be as simple and * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. 
 */
-unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
+unsigned long oom_badness(struct task_struct *p,
 			  const nodemask_t *nodemask, unsigned long totalpages)
 {
 	long points;
 	long adj;
 
-	if (oom_unkillable_task(p, memcg, nodemask))
+	if (oom_unkillable_task(p, nodemask))
 		return 0;
 
 	p = find_lock_task_mm(p);
@@ -318,7 +313,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	struct oom_control *oc = arg;
 	unsigned long points;
 
-	if (oom_unkillable_task(task, NULL, oc->nodemask))
+	if (oom_unkillable_task(task, oc->nodemask))
 		goto next;
 
 	/*
@@ -342,7 +337,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 		goto select;
 	}
 
-	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
+	points = oom_badness(task, oc->nodemask, oc->totalpages);
 	if (!points || points < oc->chosen_points)
 		goto next;
 
@@ -387,7 +382,7 @@ static int dump_task(struct task_struct *p, void *arg)
 	struct oom_control *oc = arg;
 	struct task_struct *task;
 
-	if (oom_unkillable_task(p, NULL, oc->nodemask))
+	if (oom_unkillable_task(p, oc->nodemask))
 		return 0;
 
 	task = find_lock_task_mm(p);
@@ -1084,7 +1079,7 @@ bool out_of_memory(struct oom_control *oc)
 	check_panic_on_oom(oc);
 
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
-	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
+	    current->mm && !oom_unkillable_task(current, oc->nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
 		get_task_struct(current);
 		oc->chosen = current;

From ac311a14c682dcd8a120a6244d0542ec654e3d93 Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Thu, 11 Jul 2019 21:00:31 -0700
Subject: [PATCH 146/147] oom: decouple mems_allowed from oom_unkillable_task

Commit ef08e3b4981a ("[PATCH] cpusets: confine oom_killer to
mem_exclusive cpuset") introduced a heuristic whereby a potential
oom-killer victim is skipped if the mems_allowed intersection between
the potential victim and current (the process that triggered the oom)
is empty, the reasoning being that killing such a victim would most
probably not help the current allocating process.

However, commit 7887a3da753e ("[PATCH] oom: cpuset hint") changed the
heuristic to merely decrease the oom_badness score of such a potential
victim, on the grounds that the victim's cpuset may have changed and it
may previously have allocated memory on mems the current allocating
process can allocate from.

Unintentionally, 7887a3da753e ("[PATCH] oom: cpuset hint") also
introduced a side effect: since oom_badness is exposed to user space
through /proc/[pid]/oom_score, readers with different cpusets could
read different oom_score values for the same process.

Later, commit 6cf86ac6f36b ("oom: filter tasks not sharing the same
cpuset") fixed that side effect by moving the cpuset intersection check
back into the oom-killer context only, out of oom_badness. However, the
combination of ab290adbaf8f ("oom: make oom_unkillable_task() helper
function") and 26ebc984913b ("oom: /proc/<pid>/oom_score treat kernel
thread honestly") unintentionally brought the cpuset intersection check
back into the oom_badness calculation.
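
The user-visible surface referred to above is /proc/<pid>/oom_score. For
reference, a minimal reader sketch (illustration only, no kernel internals
assumed):

---8<---
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/oom_score", "r");
	unsigned long score;

	if (!f || fscanf(f, "%lu", &score) != 1) {
		perror("oom_score");
		return 1;
	}
	printf("oom_score: %lu\n", score);
	fclose(f);
	return 0;
}
---8<---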
Other than doing cpuset/mempolicy intersection from oom_badness, the memcg oom context is also doing cpuset/mempolicy intersection which is quite wrong and is caught by syzcaller with the following report: kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 28426 Comm: syz-executor.5 Not tainted 5.2.0-rc3-next-20190607 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__read_once_size include/linux/compiler.h:194 [inline] RIP: 0010:has_intersects_mems_allowed mm/oom_kill.c:84 [inline] RIP: 0010:oom_unkillable_task mm/oom_kill.c:168 [inline] RIP: 0010:oom_unkillable_task+0x180/0x400 mm/oom_kill.c:155 Code: c1 ea 03 80 3c 02 00 0f 85 80 02 00 00 4c 8b a3 10 07 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8d 74 24 10 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 67 02 00 00 49 8b 44 24 10 4c 8d a0 68 fa ff ff RSP: 0018:ffff888000127490 EFLAGS: 00010a03 RAX: dffffc0000000000 RBX: ffff8880a4cd5438 RCX: ffffffff818dae9c RDX: 100000000c3cc602 RSI: ffffffff818dac8d RDI: 0000000000000001 RBP: ffff8880001274d0 R08: ffff888000086180 R09: ffffed1015d26be0 R10: ffffed1015d26bdf R11: ffff8880ae935efb R12: 8000000061e63007 R13: 0000000000000000 R14: 8000000061e63017 R15: 1ffff11000024ea6 FS: 00005555561f5940(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000607304 CR3: 000000009237e000 CR4: 00000000001426f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 Call Trace: oom_evaluate_task+0x49/0x520 mm/oom_kill.c:321 mem_cgroup_scan_tasks+0xcc/0x180 mm/memcontrol.c:1169 select_bad_process mm/oom_kill.c:374 [inline] out_of_memory mm/oom_kill.c:1088 [inline] out_of_memory+0x6b2/0x1280 mm/oom_kill.c:1035 mem_cgroup_out_of_memory+0x1ca/0x230 mm/memcontrol.c:1573 mem_cgroup_oom mm/memcontrol.c:1905 [inline] try_charge+0xfbe/0x1480 mm/memcontrol.c:2468 mem_cgroup_try_charge+0x24d/0x5e0 mm/memcontrol.c:6073 mem_cgroup_try_charge_delay+0x1f/0xa0 mm/memcontrol.c:6088 do_huge_pmd_wp_page_fallback+0x24f/0x1680 mm/huge_memory.c:1201 do_huge_pmd_wp_page+0x7fc/0x2160 mm/huge_memory.c:1359 wp_huge_pmd mm/memory.c:3793 [inline] __handle_mm_fault+0x164c/0x3eb0 mm/memory.c:4006 handle_mm_fault+0x3b7/0xa90 mm/memory.c:4053 do_user_addr_fault arch/x86/mm/fault.c:1455 [inline] __do_page_fault+0x5ef/0xda0 arch/x86/mm/fault.c:1521 do_page_fault+0x71/0x57d arch/x86/mm/fault.c:1552 page_fault+0x1e/0x30 arch/x86/entry/entry_64.S:1156 RIP: 0033:0x400590 Code: 06 e9 49 01 00 00 48 8b 44 24 10 48 0b 44 24 28 75 1f 48 8b 14 24 48 8b 7c 24 20 be 04 00 00 00 e8 f5 56 00 00 48 8b 74 24 08 <89> 06 e9 1e 01 00 00 48 8b 44 24 08 48 8b 14 24 be 04 00 00 00 8b RSP: 002b:00007fff7bc49780 EFLAGS: 00010206 RAX: 0000000000000001 RBX: 0000000000760000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 000000002000cffc RDI: 0000000000000001 RBP: fffffffffffffffe R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000075 R11: 0000000000000246 R12: 0000000000760008 R13: 00000000004c55f2 R14: 0000000000000000 R15: 00007fff7bc499b0 Modules linked in: ---[ end trace a65689219582ffff ]--- RIP: 0010:__read_once_size include/linux/compiler.h:194 [inline] RIP: 0010:has_intersects_mems_allowed mm/oom_kill.c:84 [inline] RIP: 0010:oom_unkillable_task mm/oom_kill.c:168 [inline] RIP: 0010:oom_unkillable_task+0x180/0x400 mm/oom_kill.c:155 Code: c1 ea 03 80 3c 
02 00 0f 85 80 02 00 00 4c 8b a3 10 07 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8d 74 24 10 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 67 02 00 00 49 8b 44 24 10 4c 8d a0 68 fa ff ff RSP: 0018:ffff888000127490 EFLAGS: 00010a03 RAX: dffffc0000000000 RBX: ffff8880a4cd5438 RCX: ffffffff818dae9c RDX: 100000000c3cc602 RSI: ffffffff818dac8d RDI: 0000000000000001 RBP: ffff8880001274d0 R08: ffff888000086180 R09: ffffed1015d26be0 R10: ffffed1015d26bdf R11: ffff8880ae935efb R12: 8000000061e63007 R13: 0000000000000000 R14: 8000000061e63017 R15: 1ffff11000024ea6 FS: 00005555561f5940(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2f823000 CR3: 000000009237e000 CR4: 00000000001426f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 The fix is to decouple the cpuset/mempolicy intersection check from oom_unkillable_task() and make sure cpuset/mempolicy intersection check is only done in the global oom context. [shakeelb@google.com: change function name and update comment] Link: http://lkml.kernel.org/r/20190628152421.198994-3-shakeelb@google.com Link: http://lkml.kernel.org/r/20190624212631.87212-3-shakeelb@google.com Signed-off-by: Shakeel Butt Reported-by: syzbot+d0fc9d3c166bc5e4a94b@syzkaller.appspotmail.com Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: David Rientjes Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Paul Jackson Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 +-- include/linux/oom.h | 1 - mm/oom_kill.c | 57 +++++++++++++++++++++++++-------------------- 3 files changed, 33 insertions(+), 28 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 64dadd469786..77eb628ecc7f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -532,8 +532,7 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; - points = oom_badness(task, NULL, totalpages) * - 1000 / totalpages; + points = oom_badness(task, totalpages) * 1000 / totalpages; seq_printf(m, "%lu\n", points); return 0; diff --git a/include/linux/oom.h b/include/linux/oom.h index b75104690311..c696c265f019 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -108,7 +108,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) bool __oom_reap_task_mm(struct mm_struct *mm); extern unsigned long oom_badness(struct task_struct *p, - const nodemask_t *nodemask, unsigned long totalpages); extern bool out_of_memory(struct oom_control *oc); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b353f468a36a..d1c9c4e66d59 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -64,21 +64,33 @@ int sysctl_oom_dump_tasks = 1; */ DEFINE_MUTEX(oom_lock); +static inline bool is_memcg_oom(struct oom_control *oc) +{ + return oc->memcg != NULL; +} + #ifdef CONFIG_NUMA /** - * has_intersects_mems_allowed() - check task eligiblity for kill + * oom_cpuset_eligible() - check task eligiblity for kill * @start: task struct of which task to consider * @mask: nodemask passed to page allocator for mempolicy ooms * * Task eligibility is determined by whether or not a candidate task, @tsk, * shares the same mempolicy nodes as current if it is bound by such a policy * and whether or not it has the same set of allowed cpuset nodes. 
+ * + * This function is assuming oom-killer context and 'current' has triggered + * the oom-killer. */ -static bool has_intersects_mems_allowed(struct task_struct *start, - const nodemask_t *mask) +static bool oom_cpuset_eligible(struct task_struct *start, + struct oom_control *oc) { struct task_struct *tsk; bool ret = false; + const nodemask_t *mask = oc->nodemask; + + if (is_memcg_oom(oc)) + return true; rcu_read_lock(); for_each_thread(start, tsk) { @@ -105,8 +117,7 @@ static bool has_intersects_mems_allowed(struct task_struct *start, return ret; } #else -static bool has_intersects_mems_allowed(struct task_struct *tsk, - const nodemask_t *mask) +static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc) { return true; } @@ -146,24 +157,13 @@ static inline bool is_sysrq_oom(struct oom_control *oc) return oc->order == -1; } -static inline bool is_memcg_oom(struct oom_control *oc) -{ - return oc->memcg != NULL; -} - /* return true if the task is not adequate as candidate victim task. */ -static bool oom_unkillable_task(struct task_struct *p, - const nodemask_t *nodemask) +static bool oom_unkillable_task(struct task_struct *p) { if (is_global_init(p)) return true; if (p->flags & PF_KTHREAD) return true; - - /* p may not have freeable memory in nodemask */ - if (!has_intersects_mems_allowed(p, nodemask)) - return true; - return false; } @@ -190,19 +190,17 @@ static bool is_dump_unreclaim_slabs(void) * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate * @totalpages: total present RAM allowed for page allocation - * @nodemask: nodemask passed to page allocator for mempolicy ooms * * The heuristic for determining which task to kill is made to be as simple and * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. 
*/ -unsigned long oom_badness(struct task_struct *p, - const nodemask_t *nodemask, unsigned long totalpages) +unsigned long oom_badness(struct task_struct *p, unsigned long totalpages) { long points; long adj; - if (oom_unkillable_task(p, nodemask)) + if (oom_unkillable_task(p)) return 0; p = find_lock_task_mm(p); @@ -313,7 +311,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) struct oom_control *oc = arg; unsigned long points; - if (oom_unkillable_task(task, oc->nodemask)) + if (oom_unkillable_task(task)) + goto next; + + /* p may not have freeable memory in nodemask */ + if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc)) goto next; /* @@ -337,7 +339,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) goto select; } - points = oom_badness(task, oc->nodemask, oc->totalpages); + points = oom_badness(task, oc->totalpages); if (!points || points < oc->chosen_points) goto next; @@ -382,7 +384,11 @@ static int dump_task(struct task_struct *p, void *arg) struct oom_control *oc = arg; struct task_struct *task; - if (oom_unkillable_task(p, oc->nodemask)) + if (oom_unkillable_task(p)) + return 0; + + /* p may not have freeable memory in nodemask */ + if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc)) return 0; task = find_lock_task_mm(p); @@ -1079,7 +1085,8 @@ bool out_of_memory(struct oom_control *oc) check_panic_on_oom(oc); if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && - current->mm && !oom_unkillable_task(current, oc->nodemask) && + current->mm && !oom_unkillable_task(current) && + oom_cpuset_eligible(current, oc) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); oc->chosen = current; From 2c207985f354dfb549e5a543102a3e084eea81f6 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 11 Jul 2019 21:00:34 -0700 Subject: [PATCH 147/147] mm/oom_kill.c: remove redundant OOM score normalization in select_bad_process() Since commit bbbe48029720 ("mm, oom: remove 'prefer children over parent' heuristic") removed the "%s: Kill process %d (%s) score %u or sacrifice child\n" line, oc->chosen_points is no longer used after select_bad_process(). Link: http://lkml.kernel.org/r/1560853435-15575-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: Shakeel Butt Cc: Roman Gushchin Cc: Johannes Weiner Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d1c9c4e66d59..eda2e2a0bdc6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -375,8 +375,6 @@ static void select_bad_process(struct oom_control *oc) break; rcu_read_unlock(); } - - oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages; } static int dump_task(struct task_struct *p, void *arg)
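
For illustration only: the *1000/totalpages scaling dropped here from
select_bad_process() is the same normalization that proc_oom_score() still
applies (see the fs/proc/base.c hunk in the previous patch). A tiny sketch of
the arithmetic, with made-up numbers:

---8<---
#include <stdio.h>

int main(void)
{
	/* Hypothetical values: ~8 GB worth of 4 KiB pages and a raw badness score. */
	unsigned long totalpages = 2UL * 1024 * 1024;
	unsigned long points = 300000;

	/* Scales the raw oom_badness() result into the 0..1000 range. */
	printf("normalized oom_score: %lu\n", points * 1000 / totalpages);
	return 0;
}
---8<---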