From eb59254608bc1d42c4c6afdcdce9c0d3ce02b318 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 10 Apr 2018 16:27:36 -0700 Subject: [PATCH 001/140] mm: introduce NR_INDIRECTLY_RECLAIMABLE_BYTES Patch series "indirectly reclaimable memory", v2. This patchset introduces the concept of indirectly reclaimable memory and applies it to fix the issue where a large number of dentries with external names can significantly affect the MemAvailable value. This patch (of 3): Introduce a concept of indirectly reclaimable memory and add the corresponding memory counter and /proc/vmstat item. Indirectly reclaimable memory is any sort of memory used by the kernel (except for reclaimable slabs) which is actually reclaimable, i.e. will be released under memory pressure. The counter is in bytes, as it's not always possible to count such objects in pages. The name contains BYTES by analogy to NR_KERNEL_STACK_KB. Link: http://lkml.kernel.org/r/20180305133743.12746-2-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Andrew Morton Cc: Alexander Viro Cc: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + mm/vmstat.c | 1 + 2 files changed, 2 insertions(+) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f11ae29005f1..a0c9e45a859a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -180,6 +180,7 @@ enum node_stat_item { NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ + NR_INDIRECTLY_RECLAIMABLE_BYTES, /* measured in bytes */ NR_VM_NODE_STAT_ITEMS }; diff --git a/mm/vmstat.c b/mm/vmstat.c index 33581be705f0..536332e988b8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1161,6 +1161,7 @@ const char * const vmstat_text[] = { "nr_vmscan_immediate_reclaim", "nr_dirtied", "nr_written", + "nr_indirectly_reclaimable", /* enum writeback_stat_item counters 
*/ "nr_dirty_threshold", From 034ebf65c3c21d85b963d39f992258a64a85e3a9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 10 Apr 2018 16:27:40 -0700 Subject: [PATCH 002/140] mm: treat indirectly reclaimable memory as available in MemAvailable Adjust /proc/meminfo MemAvailable calculation by adding the amount of indirectly reclaimable memory (rounded to the PAGE_SIZE). Link: http://lkml.kernel.org/r/20180305133743.12746-4-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Andrew Morton Cc: Alexander Viro Cc: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0b97b8ece4a9..b04667848375 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4734,6 +4734,13 @@ long si_mem_available(void) min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + /* + * Part of the kernel memory, which can be released under memory + * pressure. + */ + available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >> + PAGE_SHIFT; + if (available < 0) available = 0; return available; From f1782c9bc547754f4bd3043fe8cfda53db85f13f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 10 Apr 2018 16:27:44 -0700 Subject: [PATCH 003/140] dcache: account external names as indirectly reclaimable memory I received a report about suspicious growth of unreclaimable slabs on some machines. I've found that it happens on machines with low memory pressure, and these unreclaimable slabs are external names attached to dentries. External names are allocated using generic kmalloc() function, so they are accounted as unreclaimable. But they are held by dentries, which are reclaimable, and they will be reclaimed under the memory pressure. In particular, this breaks MemAvailable calculation, as it doesn't take unreclaimable slabs into account. 
This leads to a silly situation, when a machine is almost idle, has no memory pressure and therefore has a big dentry cache. And the resulting MemAvailable is too low to start a new workload. To address the issue, the NR_INDIRECTLY_RECLAIMABLE_BYTES counter is used to track the amount of memory, consumed by external names. The counter is increased in the dentry allocation path, if an external name structure is allocated; and it's decreased in the dentry freeing path. To reproduce the problem I've used the following Python script: import os for iter in range (0, 10000000): try: name = ("/some_long_name_%d" % iter) + "_" * 220 os.stat(name) except Exception: pass Without this patch: $ cat /proc/meminfo | grep MemAvailable MemAvailable: 7811688 kB $ python indirect.py $ cat /proc/meminfo | grep MemAvailable MemAvailable: 2753052 kB With the patch: $ cat /proc/meminfo | grep MemAvailable MemAvailable: 7809516 kB $ python indirect.py $ cat /proc/meminfo | grep MemAvailable MemAvailable: 7749144 kB [guro@fb.com: fix indirectly reclaimable memory accounting for CONFIG_SLOB] Link: http://lkml.kernel.org/r/20180312194140.19517-1-guro@fb.com [guro@fb.com: fix indirectly reclaimable memory accounting] Link: http://lkml.kernel.org/r/20180313125701.7955-1-guro@fb.com Link: http://lkml.kernel.org/r/20180305133743.12746-5-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Andrew Morton Cc: Alexander Viro Cc: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 593079176123..915816e90049 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -257,11 +257,25 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } +static void __d_free_external_name(struct rcu_head *head) +{ + struct external_name *name = container_of(head, struct external_name, + 
u.head); + + mod_node_page_state(page_pgdat(virt_to_page(name)), + NR_INDIRECTLY_RECLAIMABLE_BYTES, + -ksize(name)); + + kfree(name); +} + static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); - kfree(external_name(dentry)); - kmem_cache_free(dentry_cache, dentry); + + __d_free_external_name(&external_name(dentry)->u.head); + + kmem_cache_free(dentry_cache, dentry); } static inline int dname_external(const struct dentry *dentry) @@ -291,7 +305,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name) struct external_name *p; p = container_of(name->name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->u.count))) - kfree_rcu(p, u.head); + call_rcu(&p->u.head, __d_free_external_name); } } EXPORT_SYMBOL(release_dentry_name_snapshot); @@ -1617,6 +1631,7 @@ EXPORT_SYMBOL(d_invalidate); struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { + struct external_name *ext = NULL; struct dentry *dentry; char *dname; int err; @@ -1637,14 +1652,14 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dname = dentry->d_iname; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); - struct external_name *p = kmalloc(size + name->len, - GFP_KERNEL_ACCOUNT); - if (!p) { + + ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT); + if (!ext) { kmem_cache_free(dentry_cache, dentry); return NULL; } - atomic_set(&p->u.count, 1); - dname = p->name; + atomic_set(&ext->u.count, 1); + dname = ext->name; } else { dname = dentry->d_iname; } @@ -1683,6 +1698,12 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) } } + if (unlikely(ext)) { + pg_data_t *pgdat = page_pgdat(virt_to_page(ext)); + mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES, + ksize(ext)); + } + this_cpu_inc(nr_dentry); return dentry; @@ -2770,7 +2791,7 @@ static void copy_name(struct dentry *dentry, struct 
dentry *target) dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) - kfree_rcu(old_name, u.head); + call_rcu(&old_name->u.head, __d_free_external_name); } /* From d79f7aa496fc94d763f67b833a1f36f4c171176f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 10 Apr 2018 16:27:47 -0700 Subject: [PATCH 004/140] mm: treat indirectly reclaimable memory as free in overcommit logic Indirectly reclaimable memory can consume a significant part of total memory and it's actually reclaimable (it will be released under actual memory pressure). So, the overcommit logic should treat it as free. Otherwise, it's possible to cause random system-wide memory allocation failures by consuming a significant amount of memory by indirectly reclaimable memory, e.g. dentry external names. If overcommit policy GUESS is used, it might be used for denial of service attack under some conditions. The following program illustrates the approach. It causes the kernel to allocate an unreclaimable kmalloc-256 chunk for each stat() call, so that at some point the overcommit logic may start blocking large allocation system-wide. int main() { char buf[256]; unsigned long i; struct stat statbuf; buf[0] = '/'; for (i = 1; i < sizeof(buf); i++) buf[i] = '_'; for (i = 0; 1; i++) { sprintf(&buf[248], "%8lu", i); stat(buf, &statbuf); } return 0; } This patch in combination with related indirectly reclaimable memory patches closes this issue. 
Link: http://lkml.kernel.org/r/20180313130041.8078-1-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Andrew Morton Cc: Alexander Viro Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/util.c b/mm/util.c index 029fc2f3b395..73676f0f1b43 100644 --- a/mm/util.c +++ b/mm/util.c @@ -667,6 +667,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) */ free += global_node_page_state(NR_SLAB_RECLAIMABLE); + /* + * Part of the kernel memory, which can be released + * under memory pressure. + */ + free += global_node_page_state( + NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT; + /* * Leave reserved pages. The pages are not for anonymous pages. */ From 894befec4d70b1c14097e4f009c90114b5a044ba Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 10 Apr 2018 16:27:51 -0700 Subject: [PATCH 005/140] mm/vmscan: update stale comments Update some comments that became stale since the transition from per-zone to per-node reclaim. Link: http://lkml.kernel.org/r/20180315164553.17856-2-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Michal Hocko Cc: Shakeel Butt Cc: Mel Gorman Cc: Tejun Heo Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4390a8d5be41..6d74b12099bd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -926,7 +926,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); /* - * The number of dirty pages determines if a zone is marked + * The number of dirty pages determines if a node is marked * reclaim_congested which affects wait_iff_congested. kswapd * will stall and start writing pages if the tail of the LRU * is all dirty unqueued pages. 
@@ -1764,7 +1764,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * as there is no guarantee the dirtying process is throttled in the * same way balance_dirty_pages() manages. * - * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number + * Once a node is flagged PGDAT_WRITEBACK, kswapd will count the number * of pages under pages flagged for immediate reclaim and stall if any * are encountered in the nr_immediate check below. */ @@ -1791,7 +1791,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, */ if (sane_reclaim(sc)) { /* - * Tag a zone as congested if all the dirty pages scanned were + * Tag a node as congested if all the dirty pages scanned were * backed by a congested BDI and wait_iff_congested will stall. */ if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested) @@ -1812,7 +1812,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, } /* - * Stall direct reclaim for IO completions if underlying BDIs or zone + * Stall direct reclaim for IO completions if underlying BDIs and node * is congested. Allow kswapd to continue until it starts encountering * unqueued dirty pages or cycling through the LRU too quickly. */ @@ -3808,7 +3808,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { /* - * Free memory by calling shrink zone with increasing + * Free memory by calling shrink node with increasing * priorities until we have enough memory freed. */ do { From c4fd4fa58018bab781c17c8d92d70106a37b6a5c Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 10 Apr 2018 16:27:55 -0700 Subject: [PATCH 006/140] mm/vmscan: remove redundant current_may_throttle() check Only kswapd can have non-zero nr_immediate, and current_may_throttle() is always true for kswapd (PF_LESS_THROTTLE bit is never set) thus it's enough to check stat.nr_immediate only. 
Link: http://lkml.kernel.org/r/20180315164553.17856-4-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Michal Hocko Cc: Shakeel Butt Cc: Mel Gorman Cc: Tejun Heo Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 6d74b12099bd..403f59edd53e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1807,7 +1807,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * that pages are cycling through the LRU faster than * they are written so also forcibly stall. */ - if (stat.nr_immediate && current_may_throttle()) + if (stat.nr_immediate) congestion_wait(BLK_RW_ASYNC, HZ/10); } From d108c7721fbd1f867510b2db12ed18ff3d9fa171 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 10 Apr 2018 16:27:59 -0700 Subject: [PATCH 007/140] mm/vmscan: don't change pgdat state on base of a single LRU list state We have a separate LRU list for each memory cgroup. Memory reclaim iterates over cgroups and calls shrink_inactive_list() on every inactive LRU list. Based on the state of a single LRU, shrink_inactive_list() may flag the whole node as dirty, congested or under writeback. This is obviously wrong and hurtful. It's especially hurtful when we have a possibly small congested cgroup in the system. Then *all* direct reclaims waste time by sleeping in wait_iff_congested(). And the more memcgs in the system we have the longer the memory allocation stall is, because wait_iff_congested() is called on each lru-list scan. Sum reclaim stats across all visited LRUs on node and flag node as dirty, congested or under writeback based on that sum. Also call congestion_wait(), wait_iff_congested() once per pgdat scan, instead of once per lru-list scan. This only fixes the problem for the global reclaim case. Per-cgroup reclaim may alter global pgdat flags too, which is wrong. But that is a separate issue and will be addressed in the next patch. 
This change will not have any effect on a systems with all workload concentrated in a single cgroup. [aryabinin@virtuozzo.com: check nr_writeback against all nr_taken, not just file] Link: http://lkml.kernel.org/r/20180406180254.8970-1-aryabinin@virtuozzo.com Link: http://lkml.kernel.org/r/20180323152029.11084-4-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Reviewed-by: Shakeel Butt Cc: Mel Gorman Cc: Tejun Heo Cc: Johannes Weiner Cc: Michal Hocko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 126 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 75 insertions(+), 51 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 403f59edd53e..1ecc648b6191 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -116,6 +116,16 @@ struct scan_control { /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; + + struct { + unsigned int dirty; + unsigned int unqueued_dirty; + unsigned int congested; + unsigned int writeback; + unsigned int immediate; + unsigned int file_taken; + unsigned int taken; + } nr; }; #ifdef ARCH_HAS_PREFETCH @@ -1754,23 +1764,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, mem_cgroup_uncharge_list(&page_list); free_unref_page_list(&page_list); - /* - * If reclaim is isolating dirty pages under writeback, it implies - * that the long-lived page allocation rate is exceeding the page - * laundering rate. Either the global limits are not being effective - * at throttling processes due to the page distribution throughout - * zones or there is heavy usage of a slow backing device. The - * only option is to throttle from reclaim context which is not ideal - * as there is no guarantee the dirtying process is throttled in the - * same way balance_dirty_pages() manages. 
- * - * Once a node is flagged PGDAT_WRITEBACK, kswapd will count the number - * of pages under pages flagged for immediate reclaim and stall if any - * are encountered in the nr_immediate check below. - */ - if (stat.nr_writeback && stat.nr_writeback == nr_taken) - set_bit(PGDAT_WRITEBACK, &pgdat->flags); - /* * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not doing their job. This can @@ -1785,40 +1778,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (stat.nr_unqueued_dirty == nr_taken) wakeup_flusher_threads(WB_REASON_VMSCAN); - /* - * Legacy memcg will stall in page writeback so avoid forcibly - * stalling here. - */ - if (sane_reclaim(sc)) { - /* - * Tag a node as congested if all the dirty pages scanned were - * backed by a congested BDI and wait_iff_congested will stall. - */ - if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested) - set_bit(PGDAT_CONGESTED, &pgdat->flags); - - /* Allow kswapd to start writing pages during reclaim. */ - if (stat.nr_unqueued_dirty == nr_taken) - set_bit(PGDAT_DIRTY, &pgdat->flags); - - /* - * If kswapd scans pages marked marked for immediate - * reclaim and under writeback (nr_immediate), it implies - * that pages are cycling through the LRU faster than - * they are written so also forcibly stall. - */ - if (stat.nr_immediate) - congestion_wait(BLK_RW_ASYNC, HZ/10); - } - - /* - * Stall direct reclaim for IO completions if underlying BDIs and node - * is congested. Allow kswapd to continue until it starts encountering - * unqueued dirty pages or cycling through the LRU too quickly. 
- */ - if (!sc->hibernation_mode && !current_is_kswapd() && - current_may_throttle()) - wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10); + sc->nr.dirty += stat.nr_dirty; + sc->nr.congested += stat.nr_congested; + sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; + sc->nr.writeback += stat.nr_writeback; + sc->nr.immediate += stat.nr_immediate; + sc->nr.taken += nr_taken; + if (file) + sc->nr.file_taken += nr_taken; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, nr_scanned, nr_reclaimed, @@ -2522,6 +2489,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long node_lru_pages = 0; struct mem_cgroup *memcg; + memset(&sc->nr, 0, sizeof(sc->nr)); + nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; @@ -2587,6 +2556,61 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; + /* + * If reclaim is isolating dirty pages under writeback, it + * implies that the long-lived page allocation rate is exceeding + * the page laundering rate. Either the global limits are not + * being effective at throttling processes due to the page + * distribution throughout zones or there is heavy usage of a + * slow backing device. The only option is to throttle from + * reclaim context which is not ideal as there is no guarantee + * the dirtying process is throttled in the same way + * balance_dirty_pages() manages. + * + * Once a node is flagged PGDAT_WRITEBACK, kswapd will count the + * number of pages under pages flagged for immediate reclaim and + * stall if any are encountered in the nr_immediate check below. + */ + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) + set_bit(PGDAT_WRITEBACK, &pgdat->flags); + + /* + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling here. + */ + if (sane_reclaim(sc)) { + /* + * Tag a node as congested if all the dirty pages + * scanned were backed by a congested BDI and + * wait_iff_congested will stall. 
+ */ + if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_bit(PGDAT_CONGESTED, &pgdat->flags); + + /* Allow kswapd to start writing pages during reclaim.*/ + if (sc->nr.unqueued_dirty == sc->nr.file_taken) + set_bit(PGDAT_DIRTY, &pgdat->flags); + + /* + * If kswapd scans pages marked marked for immediate + * reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU + * faster than they are written so also forcibly stall. + */ + if (sc->nr.immediate) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + + /* + * Stall direct reclaim for IO completions if underlying BDIs + * and node is congested. Allow kswapd to continue until it + * starts encountering unqueued dirty pages or cycling through + * the LRU too quickly. + */ + if (!sc->hibernation_mode && !current_is_kswapd() && + current_may_throttle()) + wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10); + } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); From e3c1ac586c9922180146605bfb4816e3b11148c5 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 10 Apr 2018 16:28:03 -0700 Subject: [PATCH 008/140] mm/vmscan: don't mess with pgdat->flags in memcg reclaim memcg reclaim may alter pgdat->flags based on the state of LRU lists in cgroup and its children. PGDAT_WRITEBACK may force kswapd to sleep congested_wait(), PGDAT_DIRTY may force kswapd to writeback filesystem pages. But the worst here is PGDAT_CONGESTED, since it may force all direct reclaims to stall in wait_iff_congested(). Note that only kswapd have powers to clear any of these bits. This might just never happen if cgroup limits configured that way. So all direct reclaims will stall as long as we have some congested bdi in the system. Leave all pgdat->flags manipulations to kswapd. kswapd scans the whole pgdat, only kswapd can clear pgdat->flags once node is balanced, thus it's reasonable to leave all decisions about node state to kswapd. Why only kswapd? 
Why not allow global direct reclaim to change these flags? It is because currently only kswapd can clear these flags. I'm less worried about the case when PGDAT_CONGESTED is falsely not set, and more worried about the case when it is falsely set. If a direct reclaimer sets PGDAT_CONGESTED, do we have a guarantee that after the congestion problem is sorted out, kswapd will be woken up and clear the flag? It seems like there is no such guarantee. E.g. direct reclaimers may eventually balance pgdat and kswapd simply won't wake up (see wakeup_kswapd()). Moving pgdat->flags manipulation to kswapd means that cgroup2 reclaim now loses its congestion throttling mechanism. Add per-cgroup congestion state and throttle cgroup2 reclaimers if memcg is in congestion state. Currently there is no need for per-cgroup PGDAT_WRITEBACK and PGDAT_DIRTY bits since they alter only kswapd behavior. The problem could be easily demonstrated by creating heavy congestion in one cgroup: echo "+memory" > /sys/fs/cgroup/cgroup.subtree_control mkdir -p /sys/fs/cgroup/congester echo 512M > /sys/fs/cgroup/congester/memory.max echo $$ > /sys/fs/cgroup/congester/cgroup.procs /* generate a lot of dirty data on slow HDD */ while true; do dd if=/dev/zero of=/mnt/sdb/zeroes bs=1M count=1024; done & .... while true; do dd if=/dev/zero of=/mnt/sdb/zeroes bs=1M count=1024; done & and some job in another cgroup: mkdir /sys/fs/cgroup/victim echo 128M > /sys/fs/cgroup/victim/memory.max # time cat /dev/sda > /dev/null real 10m15.054s user 0m0.487s sys 1m8.505s According to the tracepoint in wait_iff_congested(), the 'cat' spent 50% of the time sleeping there. 
With the patch, cat doesn't waste time anymore: # time cat /dev/sda > /dev/null real 5m32.911s user 0m0.411s sys 0m56.664s [aryabinin@virtuozzo.com: congestion state should be per-node] Link: http://lkml.kernel.org/r/20180406135215.10057-1-aryabinin@virtuozzo.com [aryabinin@virtuozzo.com: make congestion state per-cgroup-per-node instead of just per-cgroup] Link: http://lkml.kernel.org/r/20180406180254.8970-2-aryabinin@virtuozzo.com Link: http://lkml.kernel.org/r/20180323152029.11084-5-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Tejun Heo Cc: Michal Hocko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 2 +- include/linux/memcontrol.h | 3 ++ mm/backing-dev.c | 19 +++----- mm/vmscan.c | 96 +++++++++++++++++++++++++++---------- 4 files changed, 82 insertions(+), 38 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 3e4ce54d84ab..e6cbb915ee56 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -175,7 +175,7 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) } long congestion_wait(int sync, long timeout); -long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); +long wait_iff_congested(int sync, long timeout); static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi) { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c46016bb25eb..f292efac378d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -120,6 +120,9 @@ struct mem_cgroup_per_node { unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; + bool congested; /* memcg has many dirty pages */ + /* backed by a congested BDI */ + struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ }; diff --git a/mm/backing-dev.c b/mm/backing-dev.c 
index 08b9aab631ab..023190c69dce 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1020,23 +1020,18 @@ EXPORT_SYMBOL(congestion_wait); /** * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes - * @pgdat: A pgdat to check if it is heavily congested * @sync: SYNC or ASYNC IO * @timeout: timeout in jiffies * - * In the event of a congested backing_dev (any backing_dev) and the given - * @pgdat has experienced recent congestion, this waits for up to @timeout - * jiffies for either a BDI to exit congestion of the given @sync queue - * or a write to complete. - * - * In the absence of pgdat congestion, cond_resched() is called to yield - * the processor if necessary but otherwise does not sleep. + * In the event of a congested backing_dev (any backing_dev) this waits + * for up to @timeout jiffies for either a BDI to exit congestion of the + * given @sync queue or a write to complete. * * The return value is 0 if the sleep is for the full timeout. Otherwise, * it is the number of jiffies that were still remaining when the function * returned. return_value == timeout implies the function did not sleep. 
*/ -long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout) +long wait_iff_congested(int sync, long timeout) { long ret; unsigned long start = jiffies; @@ -1044,12 +1039,10 @@ long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout) wait_queue_head_t *wqh = &congestion_wqh[sync]; /* - * If there is no congestion, or heavy congestion is not being - * encountered in the current pgdat, yield if necessary instead + * If there is no congestion, yield if necessary instead * of sleeping on the congestion queue */ - if (atomic_read(&nr_wb_congested[sync]) == 0 || - !test_bit(PGDAT_CONGESTED, &pgdat->flags)) { + if (atomic_read(&nr_wb_congested[sync]) == 0) { cond_resched(); /* In case we scheduled, work out time remaining */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 1ecc648b6191..e411385b304a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -200,6 +200,29 @@ static bool sane_reclaim(struct scan_control *sc) #endif return false; } + +static void set_memcg_congestion(pg_data_t *pgdat, + struct mem_cgroup *memcg, + bool congested) +{ + struct mem_cgroup_per_node *mn; + + if (!memcg) + return; + + mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + WRITE_ONCE(mn->congested, congested); +} + +static bool memcg_congested(pg_data_t *pgdat, + struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *mn; + + mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + return READ_ONCE(mn->congested); + +} #else static bool global_reclaim(struct scan_control *sc) { @@ -210,6 +233,18 @@ static bool sane_reclaim(struct scan_control *sc) { return true; } + +static inline void set_memcg_congestion(struct pglist_data *pgdat, + struct mem_cgroup *memcg, bool congested) +{ +} + +static inline bool memcg_congested(struct pglist_data *pgdat, + struct mem_cgroup *memcg) +{ + return false; + +} #endif /* @@ -2474,6 +2509,12 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return true; } +static bool pgdat_memcg_congested(pg_data_t *pgdat, 
struct mem_cgroup *memcg) +{ + return test_bit(PGDAT_CONGESTED, &pgdat->flags) || + (memcg && memcg_congested(pgdat, memcg)); +} + static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct reclaim_state *reclaim_state = current->reclaim_state; @@ -2556,29 +2597,27 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; - /* - * If reclaim is isolating dirty pages under writeback, it - * implies that the long-lived page allocation rate is exceeding - * the page laundering rate. Either the global limits are not - * being effective at throttling processes due to the page - * distribution throughout zones or there is heavy usage of a - * slow backing device. The only option is to throttle from - * reclaim context which is not ideal as there is no guarantee - * the dirtying process is throttled in the same way - * balance_dirty_pages() manages. - * - * Once a node is flagged PGDAT_WRITEBACK, kswapd will count the - * number of pages under pages flagged for immediate reclaim and - * stall if any are encountered in the nr_immediate check below. - */ - if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) - set_bit(PGDAT_WRITEBACK, &pgdat->flags); + if (current_is_kswapd()) { + /* + * If reclaim is isolating dirty pages under writeback, + * it implies that the long-lived page allocation rate + * is exceeding the page laundering rate. Either the + * global limits are not being effective at throttling + * processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing + * device. The only option is to throttle from reclaim + * context which is not ideal as there is no guarantee + * the dirtying process is throttled in the same way + * balance_dirty_pages() manages. 
+ * + * Once a node is flagged PGDAT_WRITEBACK, kswapd will + * count the number of pages under pages flagged for + * immediate reclaim and stall if any are encountered + * in the nr_immediate check below. + */ + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) + set_bit(PGDAT_WRITEBACK, &pgdat->flags); - /* - * Legacy memcg will stall in page writeback so avoid forcibly - * stalling here. - */ - if (sane_reclaim(sc)) { /* * Tag a node as congested if all the dirty pages * scanned were backed by a congested BDI and @@ -2601,6 +2640,14 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) congestion_wait(BLK_RW_ASYNC, HZ/10); } + /* + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling in wait_iff_congested(). + */ + if (!global_reclaim(sc) && sane_reclaim(sc) && + sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_memcg_congestion(pgdat, root, true); + /* * Stall direct reclaim for IO completions if underlying BDIs * and node is congested. Allow kswapd to continue until it @@ -2608,8 +2655,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) * the LRU too quickly. 
*/ if (!sc->hibernation_mode && !current_is_kswapd() && - current_may_throttle()) - wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10); + current_may_throttle() && pgdat_memcg_congested(pgdat, root)) + wait_iff_congested(BLK_RW_ASYNC, HZ/10); } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); @@ -2826,6 +2873,7 @@ retry: continue; last_pgdat = zone->zone_pgdat; snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); + set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false); } delayacct_freepages_end(); From d51d1e64500fcb48fc6a18c77c965b8f48a175f2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Apr 2018 16:28:07 -0700 Subject: [PATCH 009/140] mm, vmscan, tracing: use pointer to reclaim_stat struct in trace event The trace event trace_mm_vmscan_lru_shrink_inactive() currently has 12 parameters! Seven of them are from the reclaim_stat structure. This structure is currently local to mm/vmscan.c. By moving it to the global vmstat.h header, we can also reference it from the vmscan tracepoints. In moving it, it brings down the overhead of passing so many arguments to the trace event. In the future, we may limit the number of arguments that a trace event may pass (ideally just 6, but more realistically it may be 8). 
Before this patch, the code to call the trace event is this: 0f 83 aa fe ff ff jae ffffffff811e6261 48 8b 45 a0 mov -0x60(%rbp),%rax 45 8b 64 24 20 mov 0x20(%r12),%r12d 44 8b 6d d4 mov -0x2c(%rbp),%r13d 8b 4d d0 mov -0x30(%rbp),%ecx 44 8b 75 cc mov -0x34(%rbp),%r14d 44 8b 7d c8 mov -0x38(%rbp),%r15d 48 89 45 90 mov %rax,-0x70(%rbp) 8b 83 b8 fe ff ff mov -0x148(%rbx),%eax 8b 55 c0 mov -0x40(%rbp),%edx 8b 7d c4 mov -0x3c(%rbp),%edi 8b 75 b8 mov -0x48(%rbp),%esi 89 45 80 mov %eax,-0x80(%rbp) 65 ff 05 e4 f7 e2 7e incl %gs:0x7ee2f7e4(%rip) # 15bd0 <__preempt_count> 48 8b 05 75 5b 13 01 mov 0x1135b75(%rip),%rax # ffffffff8231bf68 <__tracepoint_mm_vmscan_lru_shrink_inactive+0x28> 48 85 c0 test %rax,%rax 74 72 je ffffffff811e646a 48 89 c3 mov %rax,%rbx 4c 8b 10 mov (%rax),%r10 89 f8 mov %edi,%eax 48 89 85 68 ff ff ff mov %rax,-0x98(%rbp) 89 f0 mov %esi,%eax 48 89 85 60 ff ff ff mov %rax,-0xa0(%rbp) 89 c8 mov %ecx,%eax 48 89 85 78 ff ff ff mov %rax,-0x88(%rbp) 89 d0 mov %edx,%eax 48 89 85 70 ff ff ff mov %rax,-0x90(%rbp) 8b 45 8c mov -0x74(%rbp),%eax 48 8b 7b 08 mov 0x8(%rbx),%rdi 48 83 c3 18 add $0x18,%rbx 50 push %rax 41 54 push %r12 41 55 push %r13 ff b5 78 ff ff ff pushq -0x88(%rbp) 41 56 push %r14 41 57 push %r15 ff b5 70 ff ff ff pushq -0x90(%rbp) 4c 8b 8d 68 ff ff ff mov -0x98(%rbp),%r9 4c 8b 85 60 ff ff ff mov -0xa0(%rbp),%r8 48 8b 4d 98 mov -0x68(%rbp),%rcx 48 8b 55 90 mov -0x70(%rbp),%rdx 8b 75 80 mov -0x80(%rbp),%esi 41 ff d2 callq *%r10 After the patch: 0f 83 a8 fe ff ff jae ffffffff811e626d 8b 9b b8 fe ff ff mov -0x148(%rbx),%ebx 45 8b 64 24 20 mov 0x20(%r12),%r12d 4c 8b 6d a0 mov -0x60(%rbp),%r13 65 ff 05 f5 f7 e2 7e incl %gs:0x7ee2f7f5(%rip) # 15bd0 <__preempt_count> 4c 8b 35 86 5b 13 01 mov 0x1135b86(%rip),%r14 # ffffffff8231bf68 <__tracepoint_mm_vmscan_lru_shrink_inactive+0x28> 4d 85 f6 test %r14,%r14 74 2a je ffffffff811e6411 49 8b 06 mov (%r14),%rax 8b 4d 8c mov -0x74(%rbp),%ecx 49 8b 7e 08 mov 0x8(%r14),%rdi 49 83 c6 18 add $0x18,%r14 4c 89 ea mov 
%r13,%rdx 45 89 e1 mov %r12d,%r9d 4c 8d 45 b8 lea -0x48(%rbp),%r8 89 de mov %ebx,%esi 51 push %rcx 48 8b 4d 98 mov -0x68(%rbp),%rcx ff d0 callq *%rax Link: http://lkml.kernel.org/r/2559d7cb-ec60-1200-2362-04fa34fd02bb@fb.com Link: http://lkml.kernel.org/r/20180322121003.4177af15@gandalf.local.home Signed-off-by: Steven Rostedt (VMware) Reported-by: Alexei Starovoitov Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: Andrey Ryabinin Cc: Alexei Starovoitov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 11 +++++++++++ include/trace/events/vmscan.h | 24 +++++++++--------------- mm/vmscan.c | 18 +----------------- 3 files changed, 21 insertions(+), 32 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index a4c2317d8b9f..f25cef84b41d 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -20,6 +20,17 @@ extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif +struct reclaim_stat { + unsigned nr_dirty; + unsigned nr_unqueued_dirty; + unsigned nr_congested; + unsigned nr_writeback; + unsigned nr_immediate; + unsigned nr_activate; + unsigned nr_ref_keep; + unsigned nr_unmap_fail; +}; + #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. 
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 6570c5b45ba1..a1cb91342231 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -346,15 +346,9 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TP_PROTO(int nid, unsigned long nr_scanned, unsigned long nr_reclaimed, - unsigned long nr_dirty, unsigned long nr_writeback, - unsigned long nr_congested, unsigned long nr_immediate, - unsigned long nr_activate, unsigned long nr_ref_keep, - unsigned long nr_unmap_fail, - int priority, int file), + struct reclaim_stat *stat, int priority, int file), - TP_ARGS(nid, nr_scanned, nr_reclaimed, nr_dirty, nr_writeback, - nr_congested, nr_immediate, nr_activate, nr_ref_keep, - nr_unmap_fail, priority, file), + TP_ARGS(nid, nr_scanned, nr_reclaimed, stat, priority, file), TP_STRUCT__entry( __field(int, nid) @@ -375,13 +369,13 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, __entry->nid = nid; __entry->nr_scanned = nr_scanned; __entry->nr_reclaimed = nr_reclaimed; - __entry->nr_dirty = nr_dirty; - __entry->nr_writeback = nr_writeback; - __entry->nr_congested = nr_congested; - __entry->nr_immediate = nr_immediate; - __entry->nr_activate = nr_activate; - __entry->nr_ref_keep = nr_ref_keep; - __entry->nr_unmap_fail = nr_unmap_fail; + __entry->nr_dirty = stat->nr_dirty; + __entry->nr_writeback = stat->nr_writeback; + __entry->nr_congested = stat->nr_congested; + __entry->nr_immediate = stat->nr_immediate; + __entry->nr_activate = stat->nr_activate; + __entry->nr_ref_keep = stat->nr_ref_keep; + __entry->nr_unmap_fail = stat->nr_unmap_fail; __entry->priority = priority; __entry->reclaim_flags = trace_shrink_flags(file); ), diff --git a/mm/vmscan.c b/mm/vmscan.c index e411385b304a..a1d7ba0136fe 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -902,17 +902,6 @@ static void page_check_dirty_writeback(struct page *page, mapping->a_ops->is_dirty_writeback(page, dirty, writeback); } -struct reclaim_stat { - unsigned nr_dirty; - unsigned 
nr_unqueued_dirty; - unsigned nr_congested; - unsigned nr_writeback; - unsigned nr_immediate; - unsigned nr_activate; - unsigned nr_ref_keep; - unsigned nr_unmap_fail; -}; - /* * shrink_page_list() returns the number of reclaimed pages */ @@ -1823,12 +1812,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, sc->nr.file_taken += nr_taken; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, - nr_scanned, nr_reclaimed, - stat.nr_dirty, stat.nr_writeback, - stat.nr_congested, stat.nr_immediate, - stat.nr_activate, stat.nr_ref_keep, - stat.nr_unmap_fail, - sc->priority, file); + nr_scanned, nr_reclaimed, &stat, sc->priority, file); return nr_reclaimed; } From 76ea470ce45cc45edaea93ffa5c46e5c9278df70 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Tue, 10 Apr 2018 16:28:11 -0700 Subject: [PATCH 010/140] mm/hmm: documentation editorial update to HMM documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the documentation for HMM to fix minor typos and phrasing to be a bit more readable. Link: http://lkml.kernel.org/r/20180323005527.758-2-jglisse@redhat.com Signed-off-by: Ralph Campbell Signed-off-by: Jérôme Glisse Cc: Stephen Bates Cc: Jason Gunthorpe Cc: Logan Gunthorpe Cc: Evgeny Baskakov Cc: Mark Hairgrove Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hmm.txt | 346 ++++++++++++++++++++------------------- MAINTAINERS | 1 + 2 files changed, 180 insertions(+), 167 deletions(-) diff --git a/Documentation/vm/hmm.txt b/Documentation/vm/hmm.txt index 4d3aac9f4a5d..e99b97003982 100644 --- a/Documentation/vm/hmm.txt +++ b/Documentation/vm/hmm.txt @@ -1,151 +1,159 @@ Heterogeneous Memory Management (HMM) -Transparently allow any component of a program to use any memory region of said -program with a device without using device specific memory allocator. 
This is -becoming a requirement to simplify the use of advance heterogeneous computing -where GPU, DSP or FPGA are use to perform various computations. +Provide infrastructure and helpers to integrate non conventional memory (device +memory like GPU on board memory) into regular kernel code path. Corner stone of +this being specialized struct page for such memory (see sections 5 to 7 of this +document). -This document is divided as follow, in the first section i expose the problems -related to the use of a device specific allocator. The second section i expose -the hardware limitations that are inherent to many platforms. The third section -gives an overview of HMM designs. The fourth section explains how CPU page- -table mirroring works and what is HMM purpose in this context. Fifth section -deals with how device memory is represented inside the kernel. Finaly the last -section present the new migration helper that allow to leverage the device DMA -engine. +HMM also provides optional helpers for SVM (Share Virtual Memory) ie allowing a +device to transparently access program address coherently with the CPU meaning +that any valid pointer on the CPU is also a valid pointer for the device. This +is becoming mandatory to simplify the use of advanced heterogeneous computing +where GPU, DSP, or FPGA are used to perform various computations on behalf of +a process. + +This document is divided as follows: in the first section I expose the problems +related to using device specific memory allocators. In the second section, I +expose the hardware limitations that are inherent to many platforms. The third +section gives an overview of the HMM design. The fourth section explains how +CPU page-table mirroring works and what is HMM's purpose in this context. The +fifth section deals with how device memory is represented inside the kernel. +Finally, the last section presents a new migration helper that allows lever- +aging the device DMA engine.
-1) Problems of using device specific memory allocator: -2) System bus, device memory characteristics -3) Share address space and migration +1) Problems of using a device specific memory allocator: +2) I/O bus, device memory characteristics +3) Shared address space and migration 4) Address space mirroring implementation and API 5) Represent and manage device memory from core kernel point of view -6) Migrate to and from device memory +6) Migration to and from device memory 7) Memory cgroup (memcg) and rss accounting ------------------------------------------------------------------------------- -1) Problems of using device specific memory allocator: +1) Problems of using a device specific memory allocator: -Device with large amount of on board memory (several giga bytes) like GPU have -historically manage their memory through dedicated driver specific API. This -creates a disconnect between memory allocated and managed by device driver and -regular application memory (private anonymous, share memory or regular file -back memory). From here on i will refer to this aspect as split address space. -I use share address space to refer to the opposite situation ie one in which -any memory region can be use by device transparently. +Devices with a large amount of on board memory (several giga bytes) like GPUs +have historically managed their memory through dedicated driver specific APIs. +This creates a disconnect between memory allocated and managed by a device +driver and regular application memory (private anonymous, shared memory, or +regular file backed memory). From here on I will refer to this aspect as split +address space. I use shared address space to refer to the opposite situation: +i.e., one in which any application memory region can be used by a device +transparently. Split address space because device can only access memory allocated through the -device specific API. 
This imply that all memory object in a program are not -equal from device point of view which complicate large program that rely on a -wide set of libraries. +device specific API. This implies that all memory objects in a program are not +equal from the device point of view which complicates large programs that rely +on a wide set of libraries. -Concretly this means that code that wants to leverage device like GPU need to +Concretely this means that code that wants to leverage devices like GPUs need to copy object between genericly allocated memory (malloc, mmap private/share/) and memory allocated through the device driver API (this still end up with an mmap but of the device file). -For flat dataset (array, grid, image, ...) this isn't too hard to achieve but -complex data-set (list, tree, ...) are hard to get right. Duplicating a complex -data-set need to re-map all the pointer relations between each of its elements. -This is error prone and program gets harder to debug because of the duplicate -data-set. +For flat data-sets (array, grid, image, ...) this isn't too hard to achieve but +complex data-sets (list, tree, ...) are hard to get right. Duplicating a +complex data-set needs to re-map all the pointer relations between each of its +elements. This is error prone and program gets harder to debug because of the +duplicate data-set and addresses. -Split address space also means that library can not transparently use data they -are getting from core program or other library and thus each library might have -to duplicate its input data-set using specific memory allocator. Large project -suffer from this and waste resources because of the various memory copy. +Split address space also means that libraries can not transparently use data +they are getting from the core program or another library and thus each library +might have to duplicate its input data-set using the device specific memory +allocator.
Large projects suffer from this and waste resources because of the +various memory copies. Duplicating each library API to accept as input or output memory allocted by each device specific allocator is not a viable option. It would lead to a -combinatorial explosions in the library entry points. +combinatorial explosion in the library entry points. -Finaly with the advance of high level language constructs (in C++ but in other -language too) it is now possible for compiler to leverage GPU or other devices -without even the programmer knowledge. Some of compiler identified patterns are -only do-able with a share address. It is as well more reasonable to use a share -address space for all the other patterns. +Finally, with the advance of high level language constructs (in C++ but in +other languages too) it is now possible for the compiler to leverage GPUs and +other devices without programmer knowledge. Some compiler identified patterns +are only do-able with a shared address space. It is also more reasonable to use +a shared address space for all other patterns. ------------------------------------------------------------------------------- -2) System bus, device memory characteristics +2) I/O bus, device memory characteristics -System bus cripple share address due to few limitations. Most system bus only +I/O buses cripple shared address due to few limitations. Most I/O buses only allow basic memory access from device to main memory, even cache coherency is -often optional. Access to device memory from CPU is even more limited, most -often than not it is not cache coherent. +often optional. Access to device memory from CPU is even more limited. More +often than not, it is not cache coherent. -If we only consider the PCIE bus than device can access main memory (often -through an IOMMU) and be cache coherent with the CPUs. However it only allows -a limited set of atomic operation from device on main memory. 
This is worse -in the other direction the CPUs can only access a limited range of the device +If we only consider the PCIE bus, then a device can access main memory (often +through an IOMMU) and be cache coherent with the CPUs. However, it only allows +a limited set of atomic operations from device on main memory. This is worse +in the other direction, the CPU can only access a limited range of the device memory and can not perform atomic operations on it. Thus device memory can not -be consider like regular memory from kernel point of view. +be considered the same as regular memory from the kernel point of view. Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0 -and 16 lanes). This is 33 times less that fastest GPU memory (1 TBytes/s). -The final limitation is latency, access to main memory from the device has an -order of magnitude higher latency than when the device access its own memory. +and 16 lanes). This is 33 times less than the fastest GPU memory (1 TBytes/s). +The final limitation is latency. Access to main memory from the device has an +order of magnitude higher latency than when the device accesses its own memory. -Some platform are developing new system bus or additions/modifications to PCIE -to address some of those limitations (OpenCAPI, CCIX). They mainly allow two +Some platforms are developing new I/O buses or additions/modifications to PCIE +to address some of these limitations (OpenCAPI, CCIX). They mainly allow two way cache coherency between CPU and device and allow all atomic operations the -architecture supports. Saddly not all platform are following this trends and -some major architecture are left without hardware solutions to those problems. +architecture supports. Sadly, not all platforms are following this trend and +some major architectures are left without hardware solutions to these problems.
-So for share address space to make sense not only we must allow device to +So for shared address space to make sense, not only must we allow device to access any memory memory but we must also permit any memory to be migrated to device memory while device is using it (blocking CPU access while it happens). ------------------------------------------------------------------------------- -3) Share address space and migration +3) Shared address space and migration HMM intends to provide two main features. First one is to share the address -space by duplication the CPU page table into the device page table so same -address point to same memory and this for any valid main memory address in +space by duplicating the CPU page table in the device page table so the same +address points to the same physical memory for any valid main memory address in the process address space. -To achieve this, HMM offer a set of helpers to populate the device page table +To achieve this, HMM offers a set of helpers to populate the device page table while keeping track of CPU page table updates. Device page table updates are -not as easy as CPU page table updates. To update the device page table you must -allow a buffer (or use a pool of pre-allocated buffer) and write GPU specifics -commands in it to perform the update (unmap, cache invalidations and flush, -...). This can not be done through common code for all device. Hence why HMM -provides helpers to factor out everything that can be while leaving the gory -details to the device driver. +not as easy as CPU page table updates. To update the device page table, you must +allocate a buffer (or use a pool of pre-allocated buffers) and write GPU +specific commands in it to perform the update (unmap, cache invalidations, and +flush, ...). This can not be done through common code for all devices. Hence +why HMM provides helpers to factor out everything that can be while leaving the +hardware specific details to the device driver. 
-The second mechanism HMM provide is a new kind of ZONE_DEVICE memory that does -allow to allocate a struct page for each page of the device memory. Those page -are special because the CPU can not map them. They however allow to migrate -main memory to device memory using exhisting migration mechanism and everything -looks like if page was swap out to disk from CPU point of view. Using a struct -page gives the easiest and cleanest integration with existing mm mechanisms. -Again here HMM only provide helpers, first to hotplug new ZONE_DEVICE memory -for the device memory and second to perform migration. Policy decision of what -and when to migrate things is left to the device driver. +The second mechanism HMM provides, is a new kind of ZONE_DEVICE memory that +allows allocating a struct page for each page of the device memory. Those pages +are special because the CPU can not map them. However, they allow migrating +main memory to device memory using existing migration mechanisms and everything +looks like a page is swapped out to disk from the CPU point of view. Using a +struct page gives the easiest and cleanest integration with existing mm mech- +anisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE +memory for the device memory and second to perform migration. Policy decisions +of what and when to migrate things is left to the device driver. -Note that any CPU access to a device page trigger a page fault and a migration -back to main memory ie when a page backing an given address A is migrated from -a main memory page to a device page then any CPU access to address A trigger a -page fault and initiate a migration back to main memory. +Note that any CPU access to a device page triggers a page fault and a migration +back to main memory. 
For example, when a page backing a given CPU address A is +migrated from a main memory page to a device page, then any CPU access to +address A triggers a page fault and initiates a migration back to main memory. - -With this two features, HMM not only allow a device to mirror a process address -space and keeps both CPU and device page table synchronize, but also allow to -leverage device memory by migrating part of data-set that is actively use by a -device. +With these two features, HMM not only allows a device to mirror process address +space and keeping both CPU and device page table synchronized, but also lever- +ages device memory by migrating the part of the data-set that is actively being +used by the device. ------------------------------------------------------------------------------- 4) Address space mirroring implementation and API -Address space mirroring main objective is to allow to duplicate range of CPU -page table into a device page table and HMM helps keeping both synchronize. A +Address space mirroring's main objective is to allow duplication of a range of +CPU page table into a device page table; HMM helps keep both synchronized. A device driver that want to mirror a process address space must start with the registration of an hmm_mirror struct: @@ -155,8 +163,8 @@ registration of an hmm_mirror struct: struct mm_struct *mm); The locked variant is to be use when the driver is already holding the mmap_sem -of the mm in write mode. The mirror struct has a set of callback that are use -to propagate CPU page table: +of the mm in write mode. The mirror struct has a set of callbacks that are used +to propagate CPU page tables: struct hmm_mirror_ops { /* sync_cpu_device_pagetables() - synchronize page tables @@ -181,13 +189,13 @@ to propagate CPU page table: unsigned long end); }; -Device driver must perform update to the range following action (turn range -read only, or fully unmap, ...). 
Once driver callback returns the device must -be done with the update. +The device driver must perform the update action to the range (mark range +read only, or fully unmap, ...). The device must be done with the update before +the driver callback returns. -When device driver wants to populate a range of virtual address it can use -either: +When the device driver wants to populate a range of virtual addresses, it can +use either: int hmm_vma_get_pfns(struct vm_area_struct *vma, struct hmm_range *range, unsigned long start, @@ -201,17 +209,19 @@ either: bool write, bool block); -First one (hmm_vma_get_pfns()) will only fetch present CPU page table entry and -will not trigger a page fault on missing or non present entry. The second one -do trigger page fault on missing or read only entry if write parameter is true. -Page fault use the generic mm page fault code path just like a CPU page fault. +The first one (hmm_vma_get_pfns()) will only fetch present CPU page table +entries and will not trigger a page fault on missing or non present entries. +The second one does trigger a page fault on missing or read only entry if the +write parameter is true. Page faults use the generic mm page fault code path +just like a CPU page fault. -Both function copy CPU page table into their pfns array argument. Each entry in -that array correspond to an address in the virtual range. HMM provide a set of -flags to help driver identify special CPU page table entries. +Both functions copy CPU page table entries into their pfns array argument. Each +entry in that array corresponds to an address in the virtual range. HMM +provides a set of flags to help the driver identify special CPU page table +entries. Locking with the update() callback is the most important aspect the driver must -respect in order to keep things properly synchronize. The usage pattern is : +respect in order to keep things properly synchronized. The usage pattern is: int driver_populate_range(...) 
{ @@ -233,43 +243,44 @@ respect in order to keep things properly synchronize. The usage pattern is : return 0; } -The driver->update lock is the same lock that driver takes inside its update() -callback. That lock must be call before hmm_vma_range_done() to avoid any race -with a concurrent CPU page table update. +The driver->update lock is the same lock that the driver takes inside its +update() callback. That lock must be held before hmm_vma_range_done() to avoid +any race with a concurrent CPU page table update. -HMM implements all this on top of the mmu_notifier API because we wanted to a -simpler API and also to be able to perform optimization latter own like doing -concurrent device update in multi-devices scenario. +HMM implements all this on top of the mmu_notifier API because we wanted a +simpler API and also to be able to perform optimizations later on like doing +concurrent device updates in multi-devices scenario. -HMM also serve as an impedence missmatch between how CPU page table update are -done (by CPU write to the page table and TLB flushes) from how device update -their own page table. Device update is a multi-step process, first appropriate -commands are write to a buffer, then this buffer is schedule for execution on -the device. It is only once the device has executed commands in the buffer that -the update is done. Creating and scheduling update command buffer can happen -concurrently for multiple devices. Waiting for each device to report commands -as executed is serialize (there is no point in doing this concurrently). +HMM also serves as an impedance mismatch between how CPU page table updates +are done (by CPU write to the page table and TLB flushes) and how devices +update their own page table. Device updates are a multi-step process. First, +appropriate commands are written to a buffer, then this buffer is scheduled for +execution on the device. It is only once the device has executed commands in +the buffer that the update is done.
Creating and scheduling the update command +buffer can happen concurrently for multiple devices. Waiting for each device to +report commands as executed is serialized (there is no point in doing this +concurrently). ------------------------------------------------------------------------------- 5) Represent and manage device memory from core kernel point of view -Several differents design were try to support device memory. First one use -device specific data structure to keep information about migrated memory and -HMM hooked itself in various place of mm code to handle any access to address -that were back by device memory. It turns out that this ended up replicating -most of the fields of struct page and also needed many kernel code path to be -updated to understand this new kind of memory. +Several different designs were tried to support device memory. First one used +a device specific data structure to keep information about migrated memory and +HMM hooked itself in various places of mm code to handle any access to +addresses that were backed by device memory. It turns out that this ended up +replicating most of the fields of struct page and also needed many kernel code +paths to be updated to understand this new kind of memory. -Thing is most kernel code path never try to access the memory behind a page -but only care about struct page contents. Because of this HMM switchted to -directly using struct page for device memory which left most kernel code path -un-aware of the difference. We only need to make sure that no one ever try to -map those page from the CPU side. +Most kernel code paths never try to access the memory behind a page +but only care about struct page contents. Because of this, HMM switched to +directly using struct page for device memory which left most kernel code paths +unaware of the difference. We only need to make sure that no one ever tries to +map those pages from the CPU side. 
-HMM provide a set of helpers to register and hotplug device memory as a new -region needing struct page. This is offer through a very simple API: +HMM provides a set of helpers to register and hotplug device memory as a new +region needing a struct page. This is offered through a very simple API: struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, struct device *device, @@ -289,18 +300,19 @@ The hmm_devmem_ops is where most of the important things are: }; The first callback (free()) happens when the last reference on a device page is -drop. This means the device page is now free and no longer use by anyone. The -second callback happens whenever CPU try to access a device page which it can -not do. This second callback must trigger a migration back to system memory. +dropped. This means the device page is now free and no longer used by anyone. +The second callback happens whenever the CPU tries to access a device page +which it can not do. This second callback must trigger a migration back to +system memory. ------------------------------------------------------------------------------- -6) Migrate to and from device memory +6) Migration to and from device memory -Because CPU can not access device memory, migration must use device DMA engine -to perform copy from and to device memory. For this we need a new migration -helper: +Because the CPU can not access device memory, migration must use the device DMA +engine to perform copy from and to device memory. For this we need a new +migration helper: int migrate_vma(const struct migrate_vma_ops *ops, struct vm_area_struct *vma, @@ -311,15 +323,15 @@ helper: unsigned long *dst, void *private); -Unlike other migration function it works on a range of virtual address, there -is two reasons for that. First device DMA copy has a high setup overhead cost +Unlike other migration functions it works on a range of virtual address, there +are two reasons for that. 
First, device DMA copy has a high setup overhead cost and thus batching multiple pages is needed as otherwise the migration overhead -make the whole excersie pointless. The second reason is because driver trigger -such migration base on range of address the device is actively accessing. +makes the whole exersize pointless. The second reason is because the +migration might be for a range of addresses the device is actively accessing. -The migrate_vma_ops struct define two callbacks. First one (alloc_and_copy()) -control destination memory allocation and copy operation. Second one is there -to allow device driver to perform cleanup operation after migration. +The migrate_vma_ops struct defines two callbacks. First one (alloc_and_copy()) +controls destination memory allocation and copy operation. Second one is there +to allow the device driver to perform cleanup operations after migration. struct migrate_vma_ops { void (*alloc_and_copy)(struct vm_area_struct *vma, @@ -336,19 +348,19 @@ to allow device driver to perform cleanup operation after migration. void *private); }; -It is important to stress that this migration helpers allow for hole in the +It is important to stress that these migration helpers allow for holes in the virtual address range. Some pages in the range might not be migrated for all -the usual reasons (page is pin, page is lock, ...). This helper does not fail -but just skip over those pages. +the usual reasons (page is pinned, page is locked, ...). This helper does not +fail but just skips over those pages. -The alloc_and_copy() might as well decide to not migrate all pages in the -range (for reasons under the callback control). For those the callback just -have to leave the corresponding dst entry empty. +The alloc_and_copy() might decide to not migrate all pages in the +range (for reasons under the callback control). For those, the callback just +has to leave the corresponding dst entry empty. 
-Finaly the migration of the struct page might fails (for file back page) for +Finally, the migration of the struct page might fail (for file backed page) for various reasons (failure to freeze reference, or update page cache, ...). If -that happens then the finalize_and_map() can catch any pages that was not -migrated. Note those page were still copied to new page and thus we wasted +that happens, then the finalize_and_map() can catch any pages that were not +migrated. Note those pages were still copied to a new page and thus we wasted bandwidth but this is considered as a rare event and a price that we are willing to pay to keep all the code simpler. @@ -358,27 +370,27 @@ willing to pay to keep all the code simpler. 7) Memory cgroup (memcg) and rss accounting For now device memory is accounted as any regular page in rss counters (either -anonymous if device page is use for anonymous, file if device page is use for -file back page or shmem if device page is use for share memory). This is a -deliberate choice to keep existing application that might start using device -memory without knowing about it to keep runing unimpacted. +anonymous if device page is used for anonymous, file if device page is used for +file backed page or shmem if device page is used for shared memory). This is a +deliberate choice to keep existing applications, that might start using device +memory without knowing about it, running unimpacted. -Drawbacks is that OOM killer might kill an application using a lot of device -memory and not a lot of regular system memory and thus not freeing much system -memory. We want to gather more real world experience on how application and -system react under memory pressure in the presence of device memory before +A Drawback is that the OOM killer might kill an application using a lot of +device memory and not a lot of regular system memory and thus not freeing much +system memory. 
We want to gather more real world experience on how applications +and system react under memory pressure in the presence of device memory before deciding to account device memory differently. -Same decision was made for memory cgroup. Device memory page are accounted +Same decision was made for memory cgroup. Device memory pages are accounted against same memory cgroup a regular page would be accounted to. This does simplify migration to and from device memory. This also means that migration back from device memory to regular memory can not fail because it would go above memory cgroup limit. We might revisit this choice latter on once we -get more experience in how device memory is use and its impact on memory +get more experience in how device memory is used and its impact on memory resource control. -Note that device memory can never be pin nor by device driver nor through GUP +Note that device memory can never be pinned by device driver nor through GUP and thus such memory is always free upon process exit. Or when last reference -is drop in case of share memory or file back memory. +is dropped in case of shared memory or file backed memory. diff --git a/MAINTAINERS b/MAINTAINERS index b7bd40b6b80d..3e2c01faf53d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6410,6 +6410,7 @@ L: linux-mm@kvack.org S: Maintained F: mm/hmm* F: include/linux/hmm* +F: Documentation/vm/hmm.txt HOST AP DRIVER M: Jouni Malinen From b28b08de436a638c82d0cf3dcdbdbad055baf1fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Tue, 10 Apr 2018 16:28:15 -0700 Subject: [PATCH 011/140] mm/hmm: fix header file if/else/endif maze MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The #if/#else/#endif for IS_ENABLED(CONFIG_HMM) were wrong. Because of this after multiple include there was multiple definition of both hmm_mm_init() and hmm_mm_destroy() leading to build failure if HMM was enabled (CONFIG_HMM set). 
Link: http://lkml.kernel.org/r/20180323005527.758-3-jglisse@redhat.com Signed-off-by: Jérôme Glisse Acked-by: Balbir Singh Cc: Andrew Morton Cc: Ralph Campbell Cc: John Hubbard Cc: Evgeny Baskakov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 325017ad9311..36dd21fe5caf 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -498,23 +498,16 @@ struct hmm_device { struct hmm_device *hmm_device_new(void *drvdata); void hmm_device_put(struct hmm_device *hmm_device); #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ -#endif /* IS_ENABLED(CONFIG_HMM) */ /* Below are for HMM internal use only! Not to be used by device driver! */ -#if IS_ENABLED(CONFIG_HMM_MIRROR) void hmm_mm_destroy(struct mm_struct *mm); static inline void hmm_mm_init(struct mm_struct *mm) { mm->hmm = NULL; } -#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -static inline void hmm_mm_destroy(struct mm_struct *mm) {} -static inline void hmm_mm_init(struct mm_struct *mm) {} -#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ - - #else /* IS_ENABLED(CONFIG_HMM) */ static inline void hmm_mm_destroy(struct mm_struct *mm) {} static inline void hmm_mm_init(struct mm_struct *mm) {} +#endif /* IS_ENABLED(CONFIG_HMM) */ #endif /* LINUX_HMM_H */ From e1401513c6b5efec59678a4d4e9f90957684b7e3 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Tue, 10 Apr 2018 16:28:19 -0700 Subject: [PATCH 012/140] mm/hmm: HMM should have a callback before MM is destroyed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hmm_mirror_register() registers a callback for when the CPU pagetable is modified. Normally, the device driver will call hmm_mirror_unregister() when the process using the device is finished. However, if the process exits uncleanly, the mm_struct can be destroyed with no warning to the device driver.
Link: http://lkml.kernel.org/r/20180323005527.758-4-jglisse@redhat.com Signed-off-by: Ralph Campbell Signed-off-by: Jérôme Glisse Reviewed-by: John Hubbard Cc: Evgeny Baskakov Cc: Mark Hairgrove Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 10 ++++++++++ mm/hmm.c | 29 ++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 36dd21fe5caf..fa7b51f65905 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -218,6 +218,16 @@ enum hmm_update_type { * @update: callback to update range on a device */ struct hmm_mirror_ops { + /* release() - release hmm_mirror + * + * @mirror: pointer to struct hmm_mirror + * + * This is called when the mm_struct is being released. + * The callback should make sure no references to the mirror occur + * after the callback returns. + */ + void (*release)(struct hmm_mirror *mirror); + /* sync_cpu_device_pagetables() - synchronize page tables * * @mirror: pointer to struct hmm_mirror diff --git a/mm/hmm.c b/mm/hmm.c index 320545b98ff5..8116727766f7 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -160,6 +160,32 @@ static void hmm_invalidate_range(struct hmm *hmm, up_read(&hmm->mirrors_sem); } +static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) +{ + struct hmm_mirror *mirror; + struct hmm *hmm = mm->hmm; + + down_write(&hmm->mirrors_sem); + mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, + list); + while (mirror) { + list_del_init(&mirror->list); + if (mirror->ops->release) { + /* + * Drop mirrors_sem so callback can wait on any pending + * work that might itself trigger mmu_notifier callback + * and thus would deadlock with us. 
+ */ + up_write(&hmm->mirrors_sem); + mirror->ops->release(mirror); + down_write(&hmm->mirrors_sem); + } + mirror = list_first_entry_or_null(&hmm->mirrors, + struct hmm_mirror, list); + } + up_write(&hmm->mirrors_sem); +} + static void hmm_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, @@ -185,6 +211,7 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn, } static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { + .release = hmm_release, .invalidate_range_start = hmm_invalidate_range_start, .invalidate_range_end = hmm_invalidate_range_end, }; @@ -230,7 +257,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror) struct hmm *hmm = mirror->hmm; down_write(&hmm->mirrors_sem); - list_del(&mirror->list); + list_del_init(&mirror->list); up_write(&hmm->mirrors_sem); } EXPORT_SYMBOL(hmm_mirror_unregister); From c01cbba2aa8b4d05607d373b02fe9e4eda5c0591 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Tue, 10 Apr 2018 16:28:23 -0700 Subject: [PATCH 013/140] mm/hmm: unregister mmu_notifier when last HMM client quit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This code was lost in translation at one point. This properly calls mmu_notifier_unregister_no_release() once the last user is gone. This fixes the zombie mm_struct, as without this patch we do not drop the refcount we have on it.
Link: http://lkml.kernel.org/r/20180323005527.758-5-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Evgeny Baskakov Cc: Ralph Campbell Cc: Mark Hairgrove Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hmm.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 8116727766f7..2d00769e8985 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -233,13 +233,24 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) if (!mm || !mirror || !mirror->ops) return -EINVAL; +again: mirror->hmm = hmm_register(mm); if (!mirror->hmm) return -ENOMEM; down_write(&mirror->hmm->mirrors_sem); - list_add(&mirror->list, &mirror->hmm->mirrors); - up_write(&mirror->hmm->mirrors_sem); + if (mirror->hmm->mm == NULL) { + /* + * A racing hmm_mirror_unregister() is about to destroy the hmm + * struct. Try again to allocate a new one. + */ + up_write(&mirror->hmm->mirrors_sem); + mirror->hmm = NULL; + goto again; + } else { + list_add(&mirror->list, &mirror->hmm->mirrors); + up_write(&mirror->hmm->mirrors_sem); + } return 0; } @@ -254,11 +265,32 @@ EXPORT_SYMBOL(hmm_mirror_register); */ void hmm_mirror_unregister(struct hmm_mirror *mirror) { - struct hmm *hmm = mirror->hmm; + bool should_unregister = false; + struct mm_struct *mm; + struct hmm *hmm; + if (mirror->hmm == NULL) + return; + + hmm = mirror->hmm; down_write(&hmm->mirrors_sem); list_del_init(&mirror->list); + should_unregister = list_empty(&hmm->mirrors); + mirror->hmm = NULL; + mm = hmm->mm; + hmm->mm = NULL; up_write(&hmm->mirrors_sem); + + if (!should_unregister || mm == NULL) + return; + + spin_lock(&mm->page_table_lock); + if (mm->hmm == hmm) + mm->hmm = NULL; + spin_unlock(&mm->page_table_lock); + + mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); + kfree(hmm); } EXPORT_SYMBOL(hmm_mirror_unregister); From c719547f032d4610c7a20900baacae26d0b1ff3e Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Tue, 10 Apr 2018 16:28:27 -0700 Subject: [PATCH 014/140] mm/hmm: hmm_pfns_bad() was accessing wrong struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The private field of the mm_walk struct points to an hmm_vma_walk struct and not to the desired hmm_range struct. Fix it to get the proper struct pointer. Link: http://lkml.kernel.org/r/20180323005527.758-6-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Evgeny Baskakov Cc: Ralph Campbell Cc: Mark Hairgrove Cc: John Hubbard Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hmm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/hmm.c b/mm/hmm.c index 2d00769e8985..812a66997627 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -336,7 +336,8 @@ static int hmm_pfns_bad(unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct hmm_range *range = walk->private; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; hmm_pfn_t *pfns = range->pfns; unsigned long i; From 08232a4544cc6befaabfbec2087bedaf21b0da34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Tue, 10 Apr 2018 16:28:30 -0700 Subject: [PATCH 015/140] mm/hmm: use struct for hmm_vma_fault(), hmm_vma_get_pfns() parameters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both hmm_vma_fault() and hmm_vma_get_pfns() were taking a hmm_range struct as a parameter and were initializing that struct from their other parameters. Have the callers of those functions do this, as they are likely to do so already, and only pass this struct to both functions. This shortens the function signatures and makes it easier in the future to add new parameters by simply adding them to the structure.
Link: http://lkml.kernel.org/r/20180323005527.758-7-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: John Hubbard Cc: Evgeny Baskakov Cc: Ralph Campbell Cc: Mark Hairgrove Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 18 +++-------- mm/hmm.c | 78 ++++++++++++++++----------------------------- 2 files changed, 33 insertions(+), 63 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index fa7b51f65905..d0d6760cdada 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -274,6 +274,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); /* * struct hmm_range - track invalidation lock on virtual address range * + * @vma: the vm area struct for the range * @list: all range lock are on a list * @start: range virtual start address (inclusive) * @end: range virtual end address (exclusive) @@ -281,6 +282,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); * @valid: pfns array did not change since it has been fill by an HMM function */ struct hmm_range { + struct vm_area_struct *vma; struct list_head list; unsigned long start; unsigned long end; @@ -301,12 +303,8 @@ struct hmm_range { * * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID ! */ -int hmm_vma_get_pfns(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns); -bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range); +int hmm_vma_get_pfns(struct hmm_range *range); +bool hmm_vma_range_done(struct hmm_range *range); /* @@ -327,13 +325,7 @@ bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range); * * See the function description in mm/hmm.c for further documentation. 
*/ -int hmm_vma_fault(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns, - bool write, - bool block); +int hmm_vma_fault(struct hmm_range *range, bool write, bool block); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ diff --git a/mm/hmm.c b/mm/hmm.c index 812a66997627..fc5057d7aa05 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -533,11 +533,7 @@ fault: /* * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses - * @vma: virtual memory area containing the virtual address range - * @range: used to track snapshot validity - * @start: range virtual start address (inclusive) - * @end: range virtual end address (exclusive) - * @entries: array of hmm_pfn_t: provided by the caller, filled in by function + * @range: range being snapshotted * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success * * This snapshots the CPU page table for a range of virtual addresses. Snapshot @@ -551,26 +547,23 @@ fault: * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! */ -int hmm_vma_get_pfns(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns) +int hmm_vma_get_pfns(struct hmm_range *range) { + struct vm_area_struct *vma = range->vma; struct hmm_vma_walk hmm_vma_walk; struct mm_walk mm_walk; struct hmm *hmm; /* FIXME support hugetlb fs */ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { - hmm_pfns_special(pfns, start, end); + hmm_pfns_special(range->pfns, range->start, range->end); return -EINVAL; } /* Sanity check, this really should not happen ! 
*/ - if (start < vma->vm_start || start >= vma->vm_end) + if (range->start < vma->vm_start || range->start >= vma->vm_end) return -EINVAL; - if (end < vma->vm_start || end > vma->vm_end) + if (range->end < vma->vm_start || range->end > vma->vm_end) return -EINVAL; hmm = hmm_register(vma->vm_mm); @@ -581,9 +574,6 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, return -EINVAL; /* Initialize range to track CPU page table update */ - range->start = start; - range->pfns = pfns; - range->end = end; spin_lock(&hmm->lock); range->valid = true; list_add_rcu(&range->list, &hmm->ranges); @@ -601,14 +591,13 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, mm_walk.pmd_entry = hmm_vma_walk_pmd; mm_walk.pte_hole = hmm_vma_walk_hole; - walk_page_range(start, end, &mm_walk); + walk_page_range(range->start, range->end, &mm_walk); return 0; } EXPORT_SYMBOL(hmm_vma_get_pfns); /* * hmm_vma_range_done() - stop tracking change to CPU page table over a range - * @vma: virtual memory area containing the virtual address range * @range: range being tracked * Returns: false if range data has been invalidated, true otherwise * @@ -628,10 +617,10 @@ EXPORT_SYMBOL(hmm_vma_get_pfns); * * There are two ways to use this : * again: - * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); + * hmm_vma_get_pfns(range); or hmm_vma_fault(...); * trans = device_build_page_table_update_transaction(pfns); * device_page_table_lock(); - * if (!hmm_vma_range_done(vma, range)) { + * if (!hmm_vma_range_done(range)) { * device_page_table_unlock(); * goto again; * } @@ -639,13 +628,13 @@ EXPORT_SYMBOL(hmm_vma_get_pfns); * device_page_table_unlock(); * * Or: - * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); + * hmm_vma_get_pfns(range); or hmm_vma_fault(...); * device_page_table_lock(); - * hmm_vma_range_done(vma, range); - * device_update_page_table(pfns); + * hmm_vma_range_done(range); + * device_update_page_table(range->pfns); * device_page_table_unlock(); */ 
-bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range) +bool hmm_vma_range_done(struct hmm_range *range) { unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; struct hmm *hmm; @@ -655,7 +644,7 @@ bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range) return false; } - hmm = hmm_register(vma->vm_mm); + hmm = hmm_register(range->vma->vm_mm); if (!hmm) { memset(range->pfns, 0, sizeof(*range->pfns) * npages); return false; @@ -671,11 +660,7 @@ EXPORT_SYMBOL(hmm_vma_range_done); /* * hmm_vma_fault() - try to fault some address in a virtual address range - * @vma: virtual memory area containing the virtual address range - * @range: use to track pfns array content validity - * @start: fault range virtual start address (inclusive) - * @end: fault range virtual end address (exclusive) - * @pfns: array of hmm_pfn_t, only entry with fault flag set will be faulted + * @range: range being faulted * @write: is it a write fault * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) @@ -691,10 +676,10 @@ EXPORT_SYMBOL(hmm_vma_range_done); * down_read(&mm->mmap_sem); * // Find vma and address device wants to fault, initialize hmm_pfn_t * // array accordingly - * ret = hmm_vma_fault(vma, start, end, pfns, allow_retry); + * ret = hmm_vma_fault(range, write, block); * switch (ret) { * case -EAGAIN: - * hmm_vma_range_done(vma, range); + * hmm_vma_range_done(range); * // You might want to rate limit or yield to play nicely, you may * // also commit any valid pfn in the array assuming that you are * // getting true from hmm_vma_range_monitor_end() @@ -708,7 +693,7 @@ EXPORT_SYMBOL(hmm_vma_range_done); * } * // Take device driver lock that serialize device page table update * driver_lock_device_page_table_update(); - * hmm_vma_range_done(vma, range); + * hmm_vma_range_done(range); * // Commit pfns we got from hmm_vma_fault() * 
driver_unlock_device_page_table_update(); * up_read(&mm->mmap_sem) @@ -718,28 +703,24 @@ EXPORT_SYMBOL(hmm_vma_range_done); * * YOU HAVE BEEN WARNED ! */ -int hmm_vma_fault(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns, - bool write, - bool block) +int hmm_vma_fault(struct hmm_range *range, bool write, bool block) { + struct vm_area_struct *vma = range->vma; + unsigned long start = range->start; struct hmm_vma_walk hmm_vma_walk; struct mm_walk mm_walk; struct hmm *hmm; int ret; /* Sanity check, this really should not happen ! */ - if (start < vma->vm_start || start >= vma->vm_end) + if (range->start < vma->vm_start || range->start >= vma->vm_end) return -EINVAL; - if (end < vma->vm_start || end > vma->vm_end) + if (range->end < vma->vm_start || range->end > vma->vm_end) return -EINVAL; hmm = hmm_register(vma->vm_mm); if (!hmm) { - hmm_pfns_clear(pfns, start, end); + hmm_pfns_clear(range->pfns, range->start, range->end); return -ENOMEM; } /* Caller must have registered a mirror using hmm_mirror_register() */ @@ -747,9 +728,6 @@ int hmm_vma_fault(struct vm_area_struct *vma, return -EINVAL; /* Initialize range to track CPU page table update */ - range->start = start; - range->pfns = pfns; - range->end = end; spin_lock(&hmm->lock); range->valid = true; list_add_rcu(&range->list, &hmm->ranges); @@ -757,7 +735,7 @@ int hmm_vma_fault(struct vm_area_struct *vma, /* FIXME support hugetlb fs */ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { - hmm_pfns_special(pfns, start, end); + hmm_pfns_special(range->pfns, range->start, range->end); return 0; } @@ -777,7 +755,7 @@ int hmm_vma_fault(struct vm_area_struct *vma, mm_walk.pte_hole = hmm_vma_walk_hole; do { - ret = walk_page_range(start, end, &mm_walk); + ret = walk_page_range(start, range->end, &mm_walk); start = hmm_vma_walk.last; } while (ret == -EAGAIN); @@ -785,8 +763,8 @@ int hmm_vma_fault(struct vm_area_struct *vma, unsigned long i; i 
= (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; - hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end); - hmm_vma_range_done(vma, range); + hmm_pfns_clear(&range->pfns[i], hmm_vma_walk.last, range->end); + hmm_vma_range_done(range); } return ret; } From 86586a41b8fe655e28be418a40e9bb2bb478cdd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Tue, 10 Apr 2018 16:28:34 -0700 Subject: [PATCH 016/140] mm/hmm: remove HMM_PFN_READ flag and ignore peculiar architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only peculiar architectures allow write without read, thus assume that any valid pfn does allow for read. Note we do not care about write-only mappings, because things like atomic compare and exchange, or any other operations of that kind, allow you to get the memory value through them anyway. Link: http://lkml.kernel.org/r/20180323005527.758-8-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: John Hubbard Cc: Evgeny Baskakov Cc: Ralph Campbell Cc: Mark Hairgrove Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 16 +++++++--------- mm/hmm.c | 44 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index d0d6760cdada..dd907f614dfe 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -83,8 +83,7 @@ struct hmm; * hmm_pfn_t - HMM uses its own pfn type to keep several flags per page * * Flags: - * HMM_PFN_VALID: pfn is valid - * HMM_PFN_READ: CPU page table has read permission set + * HMM_PFN_VALID: pfn is valid. It has, at least, read permission.
* HMM_PFN_WRITE: CPU page table has write permission set * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory * HMM_PFN_EMPTY: corresponding CPU page table entry is pte_none() @@ -97,13 +96,12 @@ struct hmm; typedef unsigned long hmm_pfn_t; #define HMM_PFN_VALID (1 << 0) -#define HMM_PFN_READ (1 << 1) -#define HMM_PFN_WRITE (1 << 2) -#define HMM_PFN_ERROR (1 << 3) -#define HMM_PFN_EMPTY (1 << 4) -#define HMM_PFN_SPECIAL (1 << 5) -#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 6) -#define HMM_PFN_SHIFT 7 +#define HMM_PFN_WRITE (1 << 1) +#define HMM_PFN_ERROR (1 << 2) +#define HMM_PFN_EMPTY (1 << 3) +#define HMM_PFN_SPECIAL (1 << 4) +#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 5) +#define HMM_PFN_SHIFT 6 /* * hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t diff --git a/mm/hmm.c b/mm/hmm.c index fc5057d7aa05..5da0f852a7aa 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -417,11 +417,9 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, hmm_pfn_t *pfns = range->pfns; unsigned long addr = start, i; bool write_fault; - hmm_pfn_t flag; pte_t *ptep; i = (addr - range->start) >> PAGE_SHIFT; - flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0; write_fault = hmm_vma_walk->fault & hmm_vma_walk->write; again: @@ -433,6 +431,7 @@ again: if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { unsigned long pfn; + hmm_pfn_t flag = 0; pmd_t pmd; /* @@ -497,7 +496,6 @@ again: } else if (write_fault) goto fault; pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE; - pfns[i] |= flag; } else if (is_migration_entry(entry)) { if (hmm_vma_walk->fault) { pte_unmap(ptep); @@ -517,7 +515,7 @@ again: if (write_fault && !pte_write(pte)) goto fault; - pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; + pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)); pfns[i] |= pte_write(pte) ? 
HMM_PFN_WRITE : 0; continue; @@ -534,7 +532,8 @@ fault: /* * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses * @range: range being snapshotted - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success + * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid + * vma permission, 0 success * * This snapshots the CPU page table for a range of virtual addresses. Snapshot * validity is tracked by range struct. See hmm_vma_range_done() for further @@ -573,6 +572,17 @@ int hmm_vma_get_pfns(struct hmm_range *range) if (!hmm->mmu_notifier.ops) return -EINVAL; + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma do not allow read access, then assume that it does + * not allow write access, either. Architecture that allow + * write without read access are not supported by HMM, because + * operations such has atomic access would not work. + */ + hmm_pfns_clear(range->pfns, range->start, range->end); + return -EPERM; + } + /* Initialize range to track CPU page table update */ spin_lock(&hmm->lock); range->valid = true; @@ -686,6 +696,9 @@ EXPORT_SYMBOL(hmm_vma_range_done); * goto retry; * case 0: * break; + * case -ENOMEM: + * case -EINVAL: + * case -EPERM: * default: * // Handle error ! * up_read(&mm->mmap_sem) @@ -727,11 +740,16 @@ int hmm_vma_fault(struct hmm_range *range, bool write, bool block) if (!hmm->mmu_notifier.ops) return -EINVAL; - /* Initialize range to track CPU page table update */ - spin_lock(&hmm->lock); - range->valid = true; - list_add_rcu(&range->list, &hmm->ranges); - spin_unlock(&hmm->lock); + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma do not allow read access, then assume that it does + * not allow write access, either. Architecture that allow + * write without read access are not supported by HMM, because + * operations such has atomic access would not work. 
+ */ + hmm_pfns_clear(range->pfns, range->start, range->end); + return -EPERM; + } /* FIXME support hugetlb fs */ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { @@ -739,6 +757,12 @@ int hmm_vma_fault(struct hmm_range *range, bool write, bool block) return 0; } + /* Initialize range to track CPU page table update */ + spin_lock(&hmm->lock); + range->valid = true; + list_add_rcu(&range->list, &hmm->ranges); + spin_unlock(&hmm->lock); + hmm_vma_walk.fault = true; hmm_vma_walk.write = write; hmm_vma_walk.block = block; From ff05c0c6bbe5043af6a1686522ed845f40ba49ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Tue, 10 Apr 2018 16:28:38 -0700 Subject: [PATCH 017/140] mm/hmm: use uint64_t for HMM pfn instead of defining hmm_pfn_t to ulong MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All device drivers we care about are using 64-bit page table entries. In order to match this and to avoid a useless define, convert all HMM pfns to directly use uint64_t. It is a first step on the road to allowing drivers to directly use the pfn values returned by HMM (saving memory and CPU cycles used for conversion between the two). Link: http://lkml.kernel.org/r/20180323005527.758-9-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: John Hubbard Cc: Evgeny Baskakov Cc: Ralph Campbell Cc: Mark Hairgrove Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 46 +++++++++++++++++++++------------------------ mm/hmm.c | 26 ++++++++++++------------- 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index dd907f614dfe..54d684fe3b90 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -80,8 +80,6 @@ struct hmm; /* - * hmm_pfn_t - HMM uses its own pfn type to keep several flags per page - * * Flags: * HMM_PFN_VALID: pfn is valid. It has, at least, read permission.
* HMM_PFN_WRITE: CPU page table has write permission set @@ -93,8 +91,6 @@ struct hmm; * set and the pfn value is undefined. * HMM_PFN_DEVICE_UNADDRESSABLE: unaddressable device memory (ZONE_DEVICE) */ -typedef unsigned long hmm_pfn_t; - #define HMM_PFN_VALID (1 << 0) #define HMM_PFN_WRITE (1 << 1) #define HMM_PFN_ERROR (1 << 2) @@ -104,14 +100,14 @@ typedef unsigned long hmm_pfn_t; #define HMM_PFN_SHIFT 6 /* - * hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t - * @pfn: hmm_pfn_t to convert to struct page - * Returns: struct page pointer if pfn is a valid hmm_pfn_t, NULL otherwise + * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn + * @pfn: HMM pfn value to get corresponding struct page from + * Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise * - * If the hmm_pfn_t is valid (ie valid flag set) then return the struct page - * matching the pfn value stored in the hmm_pfn_t. Otherwise return NULL. + * If the HMM pfn is valid (ie valid flag set) then return the struct page + * matching the pfn value stored in the HMM pfn. Otherwise return NULL. 
*/ -static inline struct page *hmm_pfn_t_to_page(hmm_pfn_t pfn) +static inline struct page *hmm_pfn_to_page(uint64_t pfn) { if (!(pfn & HMM_PFN_VALID)) return NULL; @@ -119,11 +115,11 @@ static inline struct page *hmm_pfn_t_to_page(hmm_pfn_t pfn) } /* - * hmm_pfn_t_to_pfn() - return pfn value store in a hmm_pfn_t - * @pfn: hmm_pfn_t to extract pfn from - * Returns: pfn value if hmm_pfn_t is valid, -1UL otherwise + * hmm_pfn_to_pfn() - return pfn value store in a HMM pfn + * @pfn: HMM pfn value to extract pfn from + * Returns: pfn value if HMM pfn is valid, -1UL otherwise */ -static inline unsigned long hmm_pfn_t_to_pfn(hmm_pfn_t pfn) +static inline unsigned long hmm_pfn_to_pfn(uint64_t pfn) { if (!(pfn & HMM_PFN_VALID)) return -1UL; @@ -131,21 +127,21 @@ static inline unsigned long hmm_pfn_t_to_pfn(hmm_pfn_t pfn) } /* - * hmm_pfn_t_from_page() - create a valid hmm_pfn_t value from struct page - * @page: struct page pointer for which to create the hmm_pfn_t - * Returns: valid hmm_pfn_t for the page + * hmm_pfn_from_page() - create a valid HMM pfn value from struct page + * @page: struct page pointer for which to create the HMM pfn + * Returns: valid HMM pfn for the page */ -static inline hmm_pfn_t hmm_pfn_t_from_page(struct page *page) +static inline uint64_t hmm_pfn_from_page(struct page *page) { return (page_to_pfn(page) << HMM_PFN_SHIFT) | HMM_PFN_VALID; } /* - * hmm_pfn_t_from_pfn() - create a valid hmm_pfn_t value from pfn - * @pfn: pfn value for which to create the hmm_pfn_t - * Returns: valid hmm_pfn_t for the pfn + * hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn + * @pfn: pfn value for which to create the HMM pfn + * Returns: valid HMM pfn for the pfn */ -static inline hmm_pfn_t hmm_pfn_t_from_pfn(unsigned long pfn) +static inline uint64_t hmm_pfn_from_pfn(unsigned long pfn) { return (pfn << HMM_PFN_SHIFT) | HMM_PFN_VALID; } @@ -284,7 +280,7 @@ struct hmm_range { struct list_head list; unsigned long start; unsigned long end; - hmm_pfn_t *pfns; + 
uint64_t *pfns; bool valid; }; @@ -307,7 +303,7 @@ bool hmm_vma_range_done(struct hmm_range *range); /* * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will - * not migrate any device memory back to system memory. The hmm_pfn_t array will + * not migrate any device memory back to system memory. The HMM pfn array will * be updated with the fault result and current snapshot of the CPU page table * for the range. * @@ -316,7 +312,7 @@ bool hmm_vma_range_done(struct hmm_range *range); * function returns -EAGAIN. * * Return value does not reflect if the fault was successful for every single - * address or not. Therefore, the caller must to inspect the hmm_pfn_t array to + * address or not. Therefore, the caller must to inspect the HMM pfn array to * determine fault status for each address. * * Trying to fault inside an invalid vma will result in -EINVAL. diff --git a/mm/hmm.c b/mm/hmm.c index 5da0f852a7aa..b69f30fc064b 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -304,7 +304,7 @@ struct hmm_vma_walk { static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, - hmm_pfn_t *pfn) + uint64_t *pfn) { unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; struct hmm_vma_walk *hmm_vma_walk = walk->private; @@ -324,7 +324,7 @@ static int hmm_vma_do_fault(struct mm_walk *walk, return -EAGAIN; } -static void hmm_pfns_special(hmm_pfn_t *pfns, +static void hmm_pfns_special(uint64_t *pfns, unsigned long addr, unsigned long end) { @@ -338,7 +338,7 @@ static int hmm_pfns_bad(unsigned long addr, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - hmm_pfn_t *pfns = range->pfns; + uint64_t *pfns = range->pfns; unsigned long i; i = (addr - range->start) >> PAGE_SHIFT; @@ -348,7 +348,7 @@ static int hmm_pfns_bad(unsigned long addr, return 0; } -static void hmm_pfns_clear(hmm_pfn_t *pfns, +static void hmm_pfns_clear(uint64_t *pfns, unsigned long addr, unsigned long end) { @@ -362,7 +362,7 @@ static 
int hmm_vma_walk_hole(unsigned long addr, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - hmm_pfn_t *pfns = range->pfns; + uint64_t *pfns = range->pfns; unsigned long i; hmm_vma_walk->last = addr; @@ -387,7 +387,7 @@ static int hmm_vma_walk_clear(unsigned long addr, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - hmm_pfn_t *pfns = range->pfns; + uint64_t *pfns = range->pfns; unsigned long i; hmm_vma_walk->last = addr; @@ -414,7 +414,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - hmm_pfn_t *pfns = range->pfns; + uint64_t *pfns = range->pfns; unsigned long addr = start, i; bool write_fault; pte_t *ptep; @@ -431,7 +431,7 @@ again: if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { unsigned long pfn; - hmm_pfn_t flag = 0; + uint64_t flag = 0; pmd_t pmd; /* @@ -456,7 +456,7 @@ again: pfn = pmd_pfn(pmd) + pte_index(addr); flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; for (; addr < end; addr += PAGE_SIZE, i++, pfn++) - pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag; + pfns[i] = hmm_pfn_from_pfn(pfn) | flag; return 0; } @@ -490,7 +490,7 @@ again: * device and report anything else as error. */ if (is_device_private_entry(entry)) { - pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry)); + pfns[i] = hmm_pfn_from_pfn(swp_offset(entry)); if (is_write_device_private_entry(entry)) { pfns[i] |= HMM_PFN_WRITE; } else if (write_fault) @@ -515,7 +515,7 @@ again: if (write_fault && !pte_write(pte)) goto fault; - pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)); + pfns[i] = hmm_pfn_from_pfn(pte_pfn(pte)); pfns[i] |= pte_write(pte) ? 
HMM_PFN_WRITE : 0; continue; @@ -678,8 +678,8 @@ EXPORT_SYMBOL(hmm_vma_range_done); * This is similar to a regular CPU page fault except that it will not trigger * any memory migration if the memory being faulted is not accessible by CPUs. * - * On error, for one virtual address in the range, the function will set the - * hmm_pfn_t error flag for the corresponding pfn entry. + * On error, for one virtual address in the range, the function will mark the + * corresponding HMM pfn entry with an error flag. * * Expected use pattern: * retry: From 855ce7d2525c97cf706ad82a419f0c2d632b9481 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Tue, 10 Apr 2018 16:28:42 -0700 Subject: [PATCH 018/140] mm/hmm: cleanup special vma handling (VM_SPECIAL) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Special vmas (ones with any of the VM_SPECIAL flags) cannot be accessed by a device because there is no consistent model across device drivers for those vmas and their backing memory. This patch passes the hmm_range struct directly as the hmm_pfns_special() argument, as it always affects the whole vma and thus the whole range. It also makes behavior consistent: after this patch both hmm_vma_fault() and hmm_vma_get_pfns() return -EINVAL when facing such a vma. Previously hmm_vma_fault() returned 0 and hmm_vma_get_pfns() returned -EINVAL, but both were filling the HMM pfn array with special entries.
Link: http://lkml.kernel.org/r/20180323005527.758-10-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: John Hubbard Cc: Evgeny Baskakov Cc: Ralph Campbell Cc: Mark Hairgrove Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hmm.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index b69f30fc064b..a93c1e35df91 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -324,14 +324,6 @@ static int hmm_vma_do_fault(struct mm_walk *walk, return -EAGAIN; } -static void hmm_pfns_special(uint64_t *pfns, - unsigned long addr, - unsigned long end) -{ - for (; addr < end; addr += PAGE_SIZE, pfns++) - *pfns = HMM_PFN_SPECIAL; -} - static int hmm_pfns_bad(unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -529,6 +521,14 @@ fault: return 0; } +static void hmm_pfns_special(struct hmm_range *range) +{ + unsigned long addr = range->start, i = 0; + + for (; addr < range->end; addr += PAGE_SIZE, i++) + range->pfns[i] = HMM_PFN_SPECIAL; +} + /* * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses * @range: range being snapshotted @@ -553,12 +553,6 @@ int hmm_vma_get_pfns(struct hmm_range *range) struct mm_walk mm_walk; struct hmm *hmm; - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { - hmm_pfns_special(range->pfns, range->start, range->end); - return -EINVAL; - } - /* Sanity check, this really should not happen ! 
*/ if (range->start < vma->vm_start || range->start >= vma->vm_end) return -EINVAL; @@ -572,6 +566,12 @@ int hmm_vma_get_pfns(struct hmm_range *range) if (!hmm->mmu_notifier.ops) return -EINVAL; + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(range); + return -EINVAL; + } + if (!(vma->vm_flags & VM_READ)) { /* * If vma do not allow read access, then assume that it does @@ -740,6 +740,12 @@ int hmm_vma_fault(struct hmm_range *range, bool write, bool block) if (!hmm->mmu_notifier.ops) return -EINVAL; + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(range); + return -EINVAL; + } + if (!(vma->vm_flags & VM_READ)) { /* * If vma do not allow read access, then assume that it does @@ -751,12 +757,6 @@ int hmm_vma_fault(struct hmm_range *range, bool write, bool block) return -EPERM; } - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { - hmm_pfns_special(range->pfns, range->start, range->end); - return 0; - } - /* Initialize range to track CPU page table update */ spin_lock(&hmm->lock); range->valid = true; From 5504ed29692faad06ea74c4275e96a8ffc83a1e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Tue, 10 Apr 2018 16:28:46 -0700 Subject: [PATCH 019/140] mm/hmm: do not differentiate between empty entry or missing directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no point in differentiating between a range for which there is not even a directory (and thus entries) and empty entry (pte_none() or pmd_none() returns true). Simply drop the distinction ie remove HMM_PFN_EMPTY flag and merge now duplicate hmm_vma_walk_hole() and hmm_vma_walk_clear() functions. 
Link: http://lkml.kernel.org/r/20180323005527.758-11-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: John Hubbard Cc: Evgeny Baskakov Cc: Ralph Campbell Cc: Mark Hairgrove Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 8 +++----- mm/hmm.c | 45 +++++++++++++++------------------------------ 2 files changed, 18 insertions(+), 35 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 54d684fe3b90..cf283db22106 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -84,7 +84,6 @@ struct hmm; * HMM_PFN_VALID: pfn is valid. It has, at least, read permission. * HMM_PFN_WRITE: CPU page table has write permission set * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory - * HMM_PFN_EMPTY: corresponding CPU page table entry is pte_none() * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not * be mirrored by a device, because the entry will never have HMM_PFN_VALID @@ -94,10 +93,9 @@ struct hmm; #define HMM_PFN_VALID (1 << 0) #define HMM_PFN_WRITE (1 << 1) #define HMM_PFN_ERROR (1 << 2) -#define HMM_PFN_EMPTY (1 << 3) -#define HMM_PFN_SPECIAL (1 << 4) -#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 5) -#define HMM_PFN_SHIFT 6 +#define HMM_PFN_SPECIAL (1 << 3) +#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 4) +#define HMM_PFN_SHIFT 5 /* * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn diff --git a/mm/hmm.c b/mm/hmm.c index a93c1e35df91..b8affe0bf4eb 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -348,6 +348,16 @@ static void hmm_pfns_clear(uint64_t *pfns, *pfns = 0; } +/* + * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) + * @start: range virtual start address (inclusive) + * @end: range virtual end address (exclusive) + * @walk: mm_walk structure + * Returns: 0 on success, -EAGAIN after page fault, or page fault error + * + * This function will be called 
whenever pmd_none() or pte_none() returns true, + * or whenever there is no page directory covering the virtual address range. + */ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -357,31 +367,6 @@ static int hmm_vma_walk_hole(unsigned long addr, uint64_t *pfns = range->pfns; unsigned long i; - hmm_vma_walk->last = addr; - i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) { - pfns[i] = HMM_PFN_EMPTY; - if (hmm_vma_walk->fault) { - int ret; - - ret = hmm_vma_do_fault(walk, addr, &pfns[i]); - if (ret != -EAGAIN) - return ret; - } - } - - return hmm_vma_walk->fault ? -EAGAIN : 0; -} - -static int hmm_vma_walk_clear(unsigned long addr, - unsigned long end, - struct mm_walk *walk) -{ - struct hmm_vma_walk *hmm_vma_walk = walk->private; - struct hmm_range *range = hmm_vma_walk->range; - uint64_t *pfns = range->pfns; - unsigned long i; - hmm_vma_walk->last = addr; i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) { @@ -440,10 +425,10 @@ again: if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; if (pmd_protnone(pmd)) - return hmm_vma_walk_clear(start, end, walk); + return hmm_vma_walk_hole(start, end, walk); if (write_fault && !pmd_write(pmd)) - return hmm_vma_walk_clear(start, end, walk); + return hmm_vma_walk_hole(start, end, walk); pfn = pmd_pfn(pmd) + pte_index(addr); flag |= pmd_write(pmd) ? 
HMM_PFN_WRITE : 0; @@ -462,7 +447,7 @@ again: pfns[i] = 0; if (pte_none(pte)) { - pfns[i] = HMM_PFN_EMPTY; + pfns[i] = 0; if (hmm_vma_walk->fault) goto fault; continue; @@ -513,8 +498,8 @@ again: fault: pte_unmap(ptep); - /* Fault all pages in range */ - return hmm_vma_walk_clear(start, end, walk); + /* Fault any virtual address we were asked to fault */ + return hmm_vma_walk_hole(start, end, walk); } pte_unmap(ptep - 1); From b2744118a65efee90aea95f7cd31bf74eb8009f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Tue, 10 Apr 2018 16:28:49 -0700 Subject: [PATCH 020/140] mm/hmm: rename HMM_PFN_DEVICE_UNADDRESSABLE to HMM_PFN_DEVICE_PRIVATE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make naming consistent across the code: DEVICE_PRIVATE is the name used outside HMM code, so use that one. Link: http://lkml.kernel.org/r/20180323005527.758-12-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: John Hubbard Cc: Evgeny Baskakov Cc: Ralph Campbell Cc: Mark Hairgrove Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 4 ++-- mm/hmm.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index cf283db22106..e8515cad5a00 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -88,13 +88,13 @@ struct hmm; * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not * be mirrored by a device, because the entry will never have HMM_PFN_VALID * set and the pfn value is undefined.
- * HMM_PFN_DEVICE_UNADDRESSABLE: unaddressable device memory (ZONE_DEVICE) + * HMM_PFN_DEVICE_PRIVATE: unaddressable device memory (ZONE_DEVICE) */ #define HMM_PFN_VALID (1 << 0) #define HMM_PFN_WRITE (1 << 1) #define HMM_PFN_ERROR (1 << 2) #define HMM_PFN_SPECIAL (1 << 3) -#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 4) +#define HMM_PFN_DEVICE_PRIVATE (1 << 4) #define HMM_PFN_SHIFT 5 /* diff --git a/mm/hmm.c b/mm/hmm.c index b8affe0bf4eb..c287fbbbf088 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -472,7 +472,7 @@ again: pfns[i] |= HMM_PFN_WRITE; } else if (write_fault) goto fault; - pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE; + pfns[i] |= HMM_PFN_DEVICE_PRIVATE; } else if (is_migration_entry(entry)) { if (hmm_vma_walk->fault) { pte_unmap(ptep); From 33cd47dcbba068850eb73f00cc41090ba26c1e9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Tue, 10 Apr 2018 16:28:54 -0700 Subject: [PATCH 021/140] mm/hmm: move hmm_pfns_clear() closer to where it is used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move hmm_pfns_clear() closer to where it is used to make it clear it is not used by page table walkers.
Link: http://lkml.kernel.org/r/20180323005527.758-13-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: John Hubbard Cc: Evgeny Baskakov Cc: Ralph Campbell Cc: Mark Hairgrove Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hmm.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index c287fbbbf088..05b49a5d6674 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -340,14 +340,6 @@ static int hmm_pfns_bad(unsigned long addr, return 0; } -static void hmm_pfns_clear(uint64_t *pfns, - unsigned long addr, - unsigned long end) -{ - for (; addr < end; addr += PAGE_SIZE, pfns++) - *pfns = 0; -} - /* * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) * @start: range virtual start address (inclusive) @@ -506,6 +498,14 @@ fault: return 0; } +static void hmm_pfns_clear(uint64_t *pfns, + unsigned long addr, + unsigned long end) +{ + for (; addr < end; addr += PAGE_SIZE, pfns++) + *pfns = 0; +} + static void hmm_pfns_special(struct hmm_range *range) { unsigned long addr = range->start, i = 0; From 53f5c3f489ecddc7570a2e2422a6fc5b25007b9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Tue, 10 Apr 2018 16:28:59 -0700 Subject: [PATCH 022/140] mm/hmm: factor out pte and pmd handling to simplify hmm_vma_walk_pmd() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional change, just create one function to handle pmd and one to handle pte (hmm_vma_handle_pmd() and hmm_vma_handle_pte()). 
Link: http://lkml.kernel.org/r/20180323005527.758-14-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: John Hubbard Cc: Evgeny Baskakov Cc: Ralph Campbell Cc: Mark Hairgrove Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hmm.c | 174 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 102 insertions(+), 72 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 05b49a5d6674..2cc4dda1fd2e 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -375,6 +375,99 @@ static int hmm_vma_walk_hole(unsigned long addr, return hmm_vma_walk->fault ? -EAGAIN : 0; } +static int hmm_vma_handle_pmd(struct mm_walk *walk, + unsigned long addr, + unsigned long end, + uint64_t *pfns, + pmd_t pmd) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + unsigned long pfn, i; + uint64_t flag = 0; + + if (pmd_protnone(pmd)) + return hmm_vma_walk_hole(addr, end, walk); + + if ((hmm_vma_walk->fault & hmm_vma_walk->write) && !pmd_write(pmd)) + return hmm_vma_walk_hole(addr, end, walk); + + pfn = pmd_pfn(pmd) + pte_index(addr); + flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) + pfns[i] = hmm_pfn_from_pfn(pfn) | flag; + hmm_vma_walk->last = end; + return 0; +} + +static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, + unsigned long end, pmd_t *pmdp, pte_t *ptep, + uint64_t *pfn) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct vm_area_struct *vma = walk->vma; + pte_t pte = *ptep; + + *pfn = 0; + + if (pte_none(pte)) { + *pfn = 0; + if (hmm_vma_walk->fault) + goto fault; + return 0; + } + + if (!pte_present(pte)) { + swp_entry_t entry = pte_to_swp_entry(pte); + + if (!non_swap_entry(entry)) { + if (hmm_vma_walk->fault) + goto fault; + return 0; + } + + /* + * This is a special swap entry, ignore migration, use + * device and report anything else as error. 
+ */ + if (is_device_private_entry(entry)) { + *pfn = hmm_pfn_from_pfn(swp_offset(entry)); + if (is_write_device_private_entry(entry)) { + *pfn |= HMM_PFN_WRITE; + } else if ((hmm_vma_walk->fault & hmm_vma_walk->write)) + goto fault; + *pfn |= HMM_PFN_DEVICE_PRIVATE; + return 0; + } + + if (is_migration_entry(entry)) { + if (hmm_vma_walk->fault) { + pte_unmap(ptep); + hmm_vma_walk->last = addr; + migration_entry_wait(vma->vm_mm, + pmdp, addr); + return -EAGAIN; + } + return 0; + } + + /* Report error for everything else */ + *pfn = HMM_PFN_ERROR; + return -EFAULT; + } + + if ((hmm_vma_walk->fault & hmm_vma_walk->write) && !pte_write(pte)) + goto fault; + + *pfn = hmm_pfn_from_pfn(pte_pfn(pte)); + *pfn |= pte_write(pte) ? HMM_PFN_WRITE : 0; + return 0; + +fault: + pte_unmap(ptep); + /* Fault any virtual address we were asked to fault */ + return hmm_vma_walk_hole(addr, end, walk); +} + static int hmm_vma_walk_pmd(pmd_t *pmdp, unsigned long start, unsigned long end, @@ -382,25 +475,20 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - struct vm_area_struct *vma = walk->vma; uint64_t *pfns = range->pfns; unsigned long addr = start, i; - bool write_fault; pte_t *ptep; i = (addr - range->start) >> PAGE_SHIFT; - write_fault = hmm_vma_walk->fault & hmm_vma_walk->write; again: if (pmd_none(*pmdp)) return hmm_vma_walk_hole(start, end, walk); - if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB) + if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB)) return hmm_pfns_bad(start, end, walk); if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { - unsigned long pfn; - uint64_t flag = 0; pmd_t pmd; /* @@ -416,17 +504,8 @@ again: barrier(); if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; - if (pmd_protnone(pmd)) - return hmm_vma_walk_hole(start, end, walk); - if (write_fault && !pmd_write(pmd)) - return hmm_vma_walk_hole(start, end, walk); - - pfn = pmd_pfn(pmd) + 
pte_index(addr); - flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; - for (; addr < end; addr += PAGE_SIZE, i++, pfn++) - pfns[i] = hmm_pfn_from_pfn(pfn) | flag; - return 0; + return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd); } if (pmd_bad(*pmdp)) @@ -434,67 +513,18 @@ again: ptep = pte_offset_map(pmdp, addr); for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { - pte_t pte = *ptep; + int r; - pfns[i] = 0; - - if (pte_none(pte)) { - pfns[i] = 0; - if (hmm_vma_walk->fault) - goto fault; - continue; + r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]); + if (r) { + /* hmm_vma_handle_pte() did unmap pte directory */ + hmm_vma_walk->last = addr; + return r; } - - if (!pte_present(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - - if (!non_swap_entry(entry)) { - if (hmm_vma_walk->fault) - goto fault; - continue; - } - - /* - * This is a special swap entry, ignore migration, use - * device and report anything else as error. - */ - if (is_device_private_entry(entry)) { - pfns[i] = hmm_pfn_from_pfn(swp_offset(entry)); - if (is_write_device_private_entry(entry)) { - pfns[i] |= HMM_PFN_WRITE; - } else if (write_fault) - goto fault; - pfns[i] |= HMM_PFN_DEVICE_PRIVATE; - } else if (is_migration_entry(entry)) { - if (hmm_vma_walk->fault) { - pte_unmap(ptep); - hmm_vma_walk->last = addr; - migration_entry_wait(vma->vm_mm, - pmdp, addr); - return -EAGAIN; - } - continue; - } else { - /* Report error for everything else */ - pfns[i] = HMM_PFN_ERROR; - } - continue; - } - - if (write_fault && !pte_write(pte)) - goto fault; - - pfns[i] = hmm_pfn_from_pfn(pte_pfn(pte)); - pfns[i] |= pte_write(pte) ? 
HMM_PFN_WRITE : 0; - continue; - -fault: - pte_unmap(ptep); - /* Fault any virtual address we were asked to fault */ - return hmm_vma_walk_hole(start, end, walk); } pte_unmap(ptep - 1); + hmm_vma_walk->last = addr; return 0; } From 2aee09d8c1164219971c7b396f2235bd5334018c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Tue, 10 Apr 2018 16:29:02 -0700 Subject: [PATCH 023/140] mm/hmm: change hmm_vma_fault() to allow write fault on page basis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This changes hmm_vma_fault() to not take a global write fault flag for a range but instead rely on caller to populate HMM pfns array with proper fault flag ie HMM_PFN_VALID if driver want read fault for that address or HMM_PFN_VALID and HMM_PFN_WRITE for write. Moreover by setting HMM_PFN_DEVICE_PRIVATE the device driver can ask for device private memory to be migrated back to system memory through page fault. This is more flexible API and it better reflects how device handles and reports fault. Link: http://lkml.kernel.org/r/20180323005527.758-15-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Evgeny Baskakov Cc: Ralph Campbell Cc: Mark Hairgrove Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 2 +- mm/hmm.c | 151 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 119 insertions(+), 34 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index e8515cad5a00..0f7ea3074175 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -317,7 +317,7 @@ bool hmm_vma_range_done(struct hmm_range *range); * * See the function description in mm/hmm.c for further documentation. 
*/ -int hmm_vma_fault(struct hmm_range *range, bool write, bool block); +int hmm_vma_fault(struct hmm_range *range, bool block); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ diff --git a/mm/hmm.c b/mm/hmm.c index 2cc4dda1fd2e..290c872062a1 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -299,12 +299,10 @@ struct hmm_vma_walk { unsigned long last; bool fault; bool block; - bool write; }; -static int hmm_vma_do_fault(struct mm_walk *walk, - unsigned long addr, - uint64_t *pfn) +static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, + bool write_fault, uint64_t *pfn) { unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; struct hmm_vma_walk *hmm_vma_walk = walk->private; @@ -312,7 +310,7 @@ static int hmm_vma_do_fault(struct mm_walk *walk, int r; flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; - flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0; + flags |= write_fault ? FAULT_FLAG_WRITE : 0; r = handle_mm_fault(vma, addr, flags); if (r & VM_FAULT_RETRY) return -EBUSY; @@ -344,15 +342,17 @@ static int hmm_pfns_bad(unsigned long addr, * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) * @start: range virtual start address (inclusive) * @end: range virtual end address (exclusive) + * @fault: should we fault or not ? + * @write_fault: write fault ? * @walk: mm_walk structure * Returns: 0 on success, -EAGAIN after page fault, or page fault error * * This function will be called whenever pmd_none() or pte_none() returns true, * or whenever there is no page directory covering the virtual address range. 
*/ -static int hmm_vma_walk_hole(unsigned long addr, - unsigned long end, - struct mm_walk *walk) +static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, + bool fault, bool write_fault, + struct mm_walk *walk) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; @@ -363,16 +363,89 @@ static int hmm_vma_walk_hole(unsigned long addr, i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) { pfns[i] = 0; - if (hmm_vma_walk->fault) { + if (fault || write_fault) { int ret; - ret = hmm_vma_do_fault(walk, addr, &pfns[i]); + ret = hmm_vma_do_fault(walk, addr, write_fault, + &pfns[i]); if (ret != -EAGAIN) return ret; } } - return hmm_vma_walk->fault ? -EAGAIN : 0; + return (fault || write_fault) ? -EAGAIN : 0; +} + +static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + uint64_t pfns, uint64_t cpu_flags, + bool *fault, bool *write_fault) +{ + *fault = *write_fault = false; + if (!hmm_vma_walk->fault) + return; + + /* We aren't ask to do anything ... */ + if (!(pfns & HMM_PFN_VALID)) + return; + /* If CPU page table is not valid then we need to fault */ + *fault = cpu_flags & HMM_PFN_VALID; + /* Need to write fault ? */ + if ((pfns & HMM_PFN_WRITE) && !(cpu_flags & HMM_PFN_WRITE)) { + *fault = *write_fault = false; + return; + } + /* Do we fault on device memory ? 
*/ + if ((pfns & HMM_PFN_DEVICE_PRIVATE) && + (cpu_flags & HMM_PFN_DEVICE_PRIVATE)) { + *write_fault = pfns & HMM_PFN_WRITE; + *fault = true; + } +} + +static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + const uint64_t *pfns, unsigned long npages, + uint64_t cpu_flags, bool *fault, + bool *write_fault) +{ + unsigned long i; + + if (!hmm_vma_walk->fault) { + *fault = *write_fault = false; + return; + } + + for (i = 0; i < npages; ++i) { + hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, + fault, write_fault); + if ((*fault) || (*write_fault)) + return; + } +} + +static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + bool fault, write_fault; + unsigned long i, npages; + uint64_t *pfns; + + i = (addr - range->start) >> PAGE_SHIFT; + npages = (end - addr) >> PAGE_SHIFT; + pfns = &range->pfns[i]; + hmm_range_need_fault(hmm_vma_walk, pfns, npages, + 0, &fault, &write_fault); + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); +} + +static inline uint64_t pmd_to_hmm_pfn_flags(pmd_t pmd) +{ + if (pmd_protnone(pmd)) + return 0; + return pmd_write(pmd) ? 
HMM_PFN_VALID | + HMM_PFN_WRITE : + HMM_PFN_VALID; } static int hmm_vma_handle_pmd(struct mm_walk *walk, @@ -382,14 +455,17 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, pmd_t pmd) { struct hmm_vma_walk *hmm_vma_walk = walk->private; - unsigned long pfn, i; - uint64_t flag = 0; + unsigned long pfn, npages, i; + uint64_t flag = 0, cpu_flags; + bool fault, write_fault; - if (pmd_protnone(pmd)) - return hmm_vma_walk_hole(addr, end, walk); + npages = (end - addr) >> PAGE_SHIFT; + cpu_flags = pmd_to_hmm_pfn_flags(pmd); + hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, + &fault, &write_fault); - if ((hmm_vma_walk->fault & hmm_vma_walk->write) && !pmd_write(pmd)) - return hmm_vma_walk_hole(addr, end, walk); + if (pmd_protnone(pmd) || fault || write_fault) + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); pfn = pmd_pfn(pmd) + pte_index(addr); flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; @@ -399,19 +475,32 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, return 0; } +static inline uint64_t pte_to_hmm_pfn_flags(pte_t pte) +{ + if (pte_none(pte) || !pte_present(pte)) + return 0; + return pte_write(pte) ? 
HMM_PFN_VALID | + HMM_PFN_WRITE : + HMM_PFN_VALID; +} + static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, unsigned long end, pmd_t *pmdp, pte_t *ptep, uint64_t *pfn) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct vm_area_struct *vma = walk->vma; + bool fault, write_fault; + uint64_t cpu_flags; pte_t pte = *ptep; *pfn = 0; + cpu_flags = pte_to_hmm_pfn_flags(pte); + hmm_pte_need_fault(hmm_vma_walk, *pfn, cpu_flags, + &fault, &write_fault); if (pte_none(pte)) { - *pfn = 0; - if (hmm_vma_walk->fault) + if (fault || write_fault) goto fault; return 0; } @@ -420,7 +509,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, swp_entry_t entry = pte_to_swp_entry(pte); if (!non_swap_entry(entry)) { - if (hmm_vma_walk->fault) + if (fault || write_fault) goto fault; return 0; } @@ -430,21 +519,20 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, * device and report anything else as error. */ if (is_device_private_entry(entry)) { + cpu_flags = HMM_PFN_VALID | HMM_PFN_DEVICE_PRIVATE; + cpu_flags |= is_write_device_private_entry(entry) ? + HMM_PFN_WRITE : 0; *pfn = hmm_pfn_from_pfn(swp_offset(entry)); - if (is_write_device_private_entry(entry)) { - *pfn |= HMM_PFN_WRITE; - } else if ((hmm_vma_walk->fault & hmm_vma_walk->write)) - goto fault; *pfn |= HMM_PFN_DEVICE_PRIVATE; return 0; } if (is_migration_entry(entry)) { - if (hmm_vma_walk->fault) { + if (fault || write_fault) { pte_unmap(ptep); hmm_vma_walk->last = addr; migration_entry_wait(vma->vm_mm, - pmdp, addr); + pmdp, addr); return -EAGAIN; } return 0; @@ -455,17 +543,16 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, return -EFAULT; } - if ((hmm_vma_walk->fault & hmm_vma_walk->write) && !pte_write(pte)) + if (fault || write_fault) goto fault; - *pfn = hmm_pfn_from_pfn(pte_pfn(pte)); - *pfn |= pte_write(pte) ? 
HMM_PFN_WRITE : 0; + *pfn = hmm_pfn_from_pfn(pte_pfn(pte)) | cpu_flags; return 0; fault: pte_unmap(ptep); /* Fault any virtual address we were asked to fault */ - return hmm_vma_walk_hole(addr, end, walk); + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); } static int hmm_vma_walk_pmd(pmd_t *pmdp, @@ -686,7 +773,6 @@ EXPORT_SYMBOL(hmm_vma_range_done); /* * hmm_vma_fault() - try to fault some address in a virtual address range * @range: range being faulted - * @write: is it a write fault * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) * @@ -731,7 +817,7 @@ EXPORT_SYMBOL(hmm_vma_range_done); * * YOU HAVE BEEN WARNED ! */ -int hmm_vma_fault(struct hmm_range *range, bool write, bool block) +int hmm_vma_fault(struct hmm_range *range, bool block) { struct vm_area_struct *vma = range->vma; unsigned long start = range->start; @@ -779,7 +865,6 @@ int hmm_vma_fault(struct hmm_range *range, bool write, bool block) spin_unlock(&hmm->lock); hmm_vma_walk.fault = true; - hmm_vma_walk.write = write; hmm_vma_walk.block = block; hmm_vma_walk.range = range; mm_walk.private = &hmm_vma_walk; From f88a1e90c665408732ab16ea48e1a182dce597a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Tue, 10 Apr 2018 16:29:06 -0700 Subject: [PATCH 024/140] mm/hmm: use device driver encoding for HMM pfn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Users of hmm_vma_fault() and hmm_vma_get_pfns() provide a flags array and pfn shift value, allowing them to define their own encoding for the HMM pfns that are filled into the pfns array of the hmm_range struct. With this, a device driver can get pfns that match its own private encoding out of HMM without having to do any conversion. 
[rcampbell@nvidia.com: don't ignore specific pte fault flag in hmm_vma_fault()] Link: http://lkml.kernel.org/r/20180326213009.2460-2-jglisse@redhat.com [rcampbell@nvidia.com: clarify fault logic for device private memory] Link: http://lkml.kernel.org/r/20180326213009.2460-3-jglisse@redhat.com Link: http://lkml.kernel.org/r/20180323005527.758-16-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Ralph Campbell Cc: Evgeny Baskakov Cc: Ralph Campbell Cc: Mark Hairgrove Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 130 ++++++++++++++++++++++++++++++++------------ mm/hmm.c | 99 +++++++++++++++++++-------------- 2 files changed, 152 insertions(+), 77 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 0f7ea3074175..5d26e0a223d9 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -80,68 +80,145 @@ struct hmm; /* + * hmm_pfn_flag_e - HMM flag enums + * * Flags: * HMM_PFN_VALID: pfn is valid. It has, at least, read permission. * HMM_PFN_WRITE: CPU page table has write permission set + * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE) + * + * The driver provide a flags array, if driver valid bit for an entry is bit + * 3 ie (entry & (1 << 3)) is true if entry is valid then driver must provide + * an array in hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3. + * Same logic apply to all flags. This is same idea as vm_page_prot in vma + * except that this is per device driver rather than per architecture. + */ +enum hmm_pfn_flag_e { + HMM_PFN_VALID = 0, + HMM_PFN_WRITE, + HMM_PFN_DEVICE_PRIVATE, + HMM_PFN_FLAG_MAX +}; + +/* + * hmm_pfn_value_e - HMM pfn special value + * + * Flags: * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory + * HMM_PFN_NONE: corresponding CPU page table entry is pte_none() * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the * result of vm_insert_pfn() or vm_insert_page(). 
Therefore, it should not * be mirrored by a device, because the entry will never have HMM_PFN_VALID * set and the pfn value is undefined. - * HMM_PFN_DEVICE_PRIVATE: unaddressable device memory (ZONE_DEVICE) + * + * Driver provide entry value for none entry, error entry and special entry, + * driver can alias (ie use same value for error and special for instance). It + * should not alias none and error or special. + * + * HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be: + * hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous, + * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table + * hmm_range.values[HMM_PFN_SPECIAL] if CPU page table entry is a special one */ -#define HMM_PFN_VALID (1 << 0) -#define HMM_PFN_WRITE (1 << 1) -#define HMM_PFN_ERROR (1 << 2) -#define HMM_PFN_SPECIAL (1 << 3) -#define HMM_PFN_DEVICE_PRIVATE (1 << 4) -#define HMM_PFN_SHIFT 5 +enum hmm_pfn_value_e { + HMM_PFN_ERROR, + HMM_PFN_NONE, + HMM_PFN_SPECIAL, + HMM_PFN_VALUE_MAX +}; + +/* + * struct hmm_range - track invalidation lock on virtual address range + * + * @vma: the vm area struct for the range + * @list: all range lock are on a list + * @start: range virtual start address (inclusive) + * @end: range virtual end address (exclusive) + * @pfns: array of pfns (big enough for the range) + * @flags: pfn flags to match device driver page table + * @values: pfn value for some special case (none, special, error, ...) 
+ * @pfn_shift: pfn shift value (should be <= PAGE_SHIFT) + * @valid: pfns array did not change since it has been fill by an HMM function + */ +struct hmm_range { + struct vm_area_struct *vma; + struct list_head list; + unsigned long start; + unsigned long end; + uint64_t *pfns; + const uint64_t *flags; + const uint64_t *values; + uint8_t pfn_shift; + bool valid; +}; /* * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn + * @range: range use to decode HMM pfn value * @pfn: HMM pfn value to get corresponding struct page from * Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise * * If the HMM pfn is valid (ie valid flag set) then return the struct page * matching the pfn value stored in the HMM pfn. Otherwise return NULL. */ -static inline struct page *hmm_pfn_to_page(uint64_t pfn) +static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, + uint64_t pfn) { - if (!(pfn & HMM_PFN_VALID)) + if (pfn == range->values[HMM_PFN_NONE]) return NULL; - return pfn_to_page(pfn >> HMM_PFN_SHIFT); + if (pfn == range->values[HMM_PFN_ERROR]) + return NULL; + if (pfn == range->values[HMM_PFN_SPECIAL]) + return NULL; + if (!(pfn & range->flags[HMM_PFN_VALID])) + return NULL; + return pfn_to_page(pfn >> range->pfn_shift); } /* * hmm_pfn_to_pfn() - return pfn value store in a HMM pfn + * @range: range use to decode HMM pfn value * @pfn: HMM pfn value to extract pfn from * Returns: pfn value if HMM pfn is valid, -1UL otherwise */ -static inline unsigned long hmm_pfn_to_pfn(uint64_t pfn) +static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, + uint64_t pfn) { - if (!(pfn & HMM_PFN_VALID)) + if (pfn == range->values[HMM_PFN_NONE]) return -1UL; - return (pfn >> HMM_PFN_SHIFT); + if (pfn == range->values[HMM_PFN_ERROR]) + return -1UL; + if (pfn == range->values[HMM_PFN_SPECIAL]) + return -1UL; + if (!(pfn & range->flags[HMM_PFN_VALID])) + return -1UL; + return (pfn >> range->pfn_shift); } /* * hmm_pfn_from_page() - 
create a valid HMM pfn value from struct page + * @range: range use to encode HMM pfn value * @page: struct page pointer for which to create the HMM pfn * Returns: valid HMM pfn for the page */ -static inline uint64_t hmm_pfn_from_page(struct page *page) +static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, + struct page *page) { - return (page_to_pfn(page) << HMM_PFN_SHIFT) | HMM_PFN_VALID; + return (page_to_pfn(page) << range->pfn_shift) | + range->flags[HMM_PFN_VALID]; } /* * hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn + * @range: range use to encode HMM pfn value * @pfn: pfn value for which to create the HMM pfn * Returns: valid HMM pfn for the pfn */ -static inline uint64_t hmm_pfn_from_pfn(unsigned long pfn) +static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, + unsigned long pfn) { - return (pfn << HMM_PFN_SHIFT) | HMM_PFN_VALID; + return (pfn << range->pfn_shift) | + range->flags[HMM_PFN_VALID]; } @@ -263,25 +340,6 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); void hmm_mirror_unregister(struct hmm_mirror *mirror); -/* - * struct hmm_range - track invalidation lock on virtual address range - * - * @vma: the vm area struct for the range - * @list: all range lock are on a list - * @start: range virtual start address (inclusive) - * @end: range virtual end address (exclusive) - * @pfns: array of pfns (big enough for the range) - * @valid: pfns array did not change since it has been fill by an HMM function - */ -struct hmm_range { - struct vm_area_struct *vma; - struct list_head list; - unsigned long start; - unsigned long end; - uint64_t *pfns; - bool valid; -}; - /* * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device * driver lock that serializes device page table updates, then call diff --git a/mm/hmm.c b/mm/hmm.c index 290c872062a1..398d0214be66 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -306,6 +306,7 @@ static int hmm_vma_do_fault(struct mm_walk *walk, 
unsigned long addr, { unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; int r; @@ -315,7 +316,7 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, if (r & VM_FAULT_RETRY) return -EBUSY; if (r & VM_FAULT_ERROR) { - *pfn = HMM_PFN_ERROR; + *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; } @@ -333,7 +334,7 @@ static int hmm_pfns_bad(unsigned long addr, i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) - pfns[i] = HMM_PFN_ERROR; + pfns[i] = range->values[HMM_PFN_ERROR]; return 0; } @@ -362,7 +363,7 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, hmm_vma_walk->last = addr; i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) { - pfns[i] = 0; + pfns[i] = range->values[HMM_PFN_NONE]; if (fault || write_fault) { int ret; @@ -380,24 +381,31 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, uint64_t pfns, uint64_t cpu_flags, bool *fault, bool *write_fault) { + struct hmm_range *range = hmm_vma_walk->range; + *fault = *write_fault = false; if (!hmm_vma_walk->fault) return; /* We aren't ask to do anything ... */ - if (!(pfns & HMM_PFN_VALID)) + if (!(pfns & range->flags[HMM_PFN_VALID])) return; - /* If CPU page table is not valid then we need to fault */ - *fault = cpu_flags & HMM_PFN_VALID; - /* Need to write fault ? */ - if ((pfns & HMM_PFN_WRITE) && !(cpu_flags & HMM_PFN_WRITE)) { - *fault = *write_fault = false; + /* If this is device memory than only fault if explicitly requested */ + if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { + /* Do we fault on device memory ? */ + if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { + *write_fault = pfns & range->flags[HMM_PFN_WRITE]; + *fault = true; + } return; } - /* Do we fault on device memory ? 
*/ - if ((pfns & HMM_PFN_DEVICE_PRIVATE) && - (cpu_flags & HMM_PFN_DEVICE_PRIVATE)) { - *write_fault = pfns & HMM_PFN_WRITE; + + /* If CPU page table is not valid then we need to fault */ + *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]); + /* Need to write fault ? */ + if ((pfns & range->flags[HMM_PFN_WRITE]) && + !(cpu_flags & range->flags[HMM_PFN_WRITE])) { + *write_fault = true; *fault = true; } } @@ -439,13 +447,13 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); } -static inline uint64_t pmd_to_hmm_pfn_flags(pmd_t pmd) +static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) { if (pmd_protnone(pmd)) return 0; - return pmd_write(pmd) ? HMM_PFN_VALID | - HMM_PFN_WRITE : - HMM_PFN_VALID; + return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; } static int hmm_vma_handle_pmd(struct mm_walk *walk, @@ -455,12 +463,13 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, pmd_t pmd) { struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; unsigned long pfn, npages, i; - uint64_t flag = 0, cpu_flags; bool fault, write_fault; + uint64_t cpu_flags; npages = (end - addr) >> PAGE_SHIFT; - cpu_flags = pmd_to_hmm_pfn_flags(pmd); + cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, &fault, &write_fault); @@ -468,20 +477,19 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); pfn = pmd_pfn(pmd) + pte_index(addr); - flag |= pmd_write(pmd) ? 
HMM_PFN_WRITE : 0; for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) - pfns[i] = hmm_pfn_from_pfn(pfn) | flag; + pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; hmm_vma_walk->last = end; return 0; } -static inline uint64_t pte_to_hmm_pfn_flags(pte_t pte) +static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) { if (pte_none(pte) || !pte_present(pte)) return 0; - return pte_write(pte) ? HMM_PFN_VALID | - HMM_PFN_WRITE : - HMM_PFN_VALID; + return pte_write(pte) ? range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; } static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, @@ -489,14 +497,16 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, uint64_t *pfn) { struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; bool fault, write_fault; uint64_t cpu_flags; pte_t pte = *ptep; + uint64_t orig_pfn = *pfn; - *pfn = 0; - cpu_flags = pte_to_hmm_pfn_flags(pte); - hmm_pte_need_fault(hmm_vma_walk, *pfn, cpu_flags, + *pfn = range->values[HMM_PFN_NONE]; + cpu_flags = pte_to_hmm_pfn_flags(range, pte); + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, &fault, &write_fault); if (pte_none(pte)) { @@ -519,11 +529,16 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, * device and report anything else as error. */ if (is_device_private_entry(entry)) { - cpu_flags = HMM_PFN_VALID | HMM_PFN_DEVICE_PRIVATE; + cpu_flags = range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_DEVICE_PRIVATE]; cpu_flags |= is_write_device_private_entry(entry) ? 
- HMM_PFN_WRITE : 0; - *pfn = hmm_pfn_from_pfn(swp_offset(entry)); - *pfn |= HMM_PFN_DEVICE_PRIVATE; + range->flags[HMM_PFN_WRITE] : 0; + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); + if (fault || write_fault) + goto fault; + *pfn = hmm_pfn_from_pfn(range, swp_offset(entry)); + *pfn |= cpu_flags; return 0; } @@ -539,14 +554,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, } /* Report error for everything else */ - *pfn = HMM_PFN_ERROR; + *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; } if (fault || write_fault) goto fault; - *pfn = hmm_pfn_from_pfn(pte_pfn(pte)) | cpu_flags; + *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; return 0; fault: @@ -615,12 +630,13 @@ again: return 0; } -static void hmm_pfns_clear(uint64_t *pfns, +static void hmm_pfns_clear(struct hmm_range *range, + uint64_t *pfns, unsigned long addr, unsigned long end) { for (; addr < end; addr += PAGE_SIZE, pfns++) - *pfns = 0; + *pfns = range->values[HMM_PFN_NONE]; } static void hmm_pfns_special(struct hmm_range *range) @@ -628,7 +644,7 @@ static void hmm_pfns_special(struct hmm_range *range) unsigned long addr = range->start, i = 0; for (; addr < range->end; addr += PAGE_SIZE, i++) - range->pfns[i] = HMM_PFN_SPECIAL; + range->pfns[i] = range->values[HMM_PFN_SPECIAL]; } /* @@ -681,7 +697,7 @@ int hmm_vma_get_pfns(struct hmm_range *range) * write without read access are not supported by HMM, because * operations such has atomic access would not work. 
*/ - hmm_pfns_clear(range->pfns, range->start, range->end); + hmm_pfns_clear(range, range->pfns, range->start, range->end); return -EPERM; } @@ -834,7 +850,7 @@ int hmm_vma_fault(struct hmm_range *range, bool block) hmm = hmm_register(vma->vm_mm); if (!hmm) { - hmm_pfns_clear(range->pfns, range->start, range->end); + hmm_pfns_clear(range, range->pfns, range->start, range->end); return -ENOMEM; } /* Caller must have registered a mirror using hmm_mirror_register() */ @@ -854,7 +870,7 @@ int hmm_vma_fault(struct hmm_range *range, bool block) * write without read access are not supported by HMM, because * operations such has atomic access would not work. */ - hmm_pfns_clear(range->pfns, range->start, range->end); + hmm_pfns_clear(range, range->pfns, range->start, range->end); return -EPERM; } @@ -887,7 +903,8 @@ int hmm_vma_fault(struct hmm_range *range, bool block) unsigned long i; i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; - hmm_pfns_clear(&range->pfns[i], hmm_vma_walk.last, range->end); + hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last, + range->end); hmm_vma_range_done(range); } return ret; From 18be460eeb2cd3f4bfb6498b10ce5b800f9a20ea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Apr 2018 16:29:09 -0700 Subject: [PATCH 025/140] mm/hmm.c: remove superfluous RCU protection around radix tree lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hmm_devmem_find() requires rcu_read_lock_held() but there's nothing which actually uses the RCU protection. The only caller is hmm_devmem_pages_create() which already grabs the mutex and does superfluous rcu_read_lock/unlock() around the function. This doesn't add anything and just adds to confusion. Remove the RCU protection and open-code the radix tree lookup. If this needs to become more sophisticated in the future, let's add them back when necessary. 
Link: http://lkml.kernel.org/r/20180314194515.1661824-4-tj@kernel.org Signed-off-by: Tejun Heo Reviewed-by: Jérôme Glisse Cc: Paul E. McKenney Cc: Benjamin LaHaise Cc: Al Viro Cc: Kent Overstreet Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hmm.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 398d0214be66..486dc394a5a3 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1024,13 +1024,6 @@ static void hmm_devmem_release(struct device *dev, void *data) hmm_devmem_radix_release(resource); } -static struct hmm_devmem *hmm_devmem_find(resource_size_t phys) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - return radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT); -} - static int hmm_devmem_pages_create(struct hmm_devmem *devmem) { resource_size_t key, align_start, align_size, align_end; @@ -1071,9 +1064,8 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) { struct hmm_devmem *dup; - rcu_read_lock(); - dup = hmm_devmem_find(key); - rcu_read_unlock(); + dup = radix_tree_lookup(&hmm_devmem_radix, + key >> PA_SECTION_SHIFT); if (dup) { dev_err(device, "%s: collides with mapping for %s\n", __func__, dev_name(dup->device)); From 9d8a463a7016e9e5578a561588a18acef139919c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Apr 2018 16:29:13 -0700 Subject: [PATCH 026/140] mm/hmm: fix header file if/else/endif maze, again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The last fix was still wrong, as we need the inline dummy functions also for the case that CONFIG_HMM is enabled but CONFIG_HMM_MIRROR is not: kernel/fork.o: In function `__mmdrop': fork.c:(.text+0x14f6): undefined reference to `hmm_mm_destroy' This adds back the second copy of the dummy functions, hopefully this time in the right place. 
Link: http://lkml.kernel.org/r/20180404110236.804484-1-arnd@arndb.de Fixes: 8900d06a277a ("mm/hmm: fix header file if/else/endif maze") Signed-off-by: Arnd Bergmann Reviewed-by: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 5d26e0a223d9..39988924de3a 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -376,8 +376,18 @@ bool hmm_vma_range_done(struct hmm_range *range); * See the function description in mm/hmm.c for further documentation. */ int hmm_vma_fault(struct hmm_range *range, bool block); -#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ +/* Below are for HMM internal use only! Not to be used by device driver! */ +void hmm_mm_destroy(struct mm_struct *mm); + +static inline void hmm_mm_init(struct mm_struct *mm) +{ + mm->hmm = NULL; +} +#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ +static inline void hmm_mm_destroy(struct mm_struct *mm) {} +static inline void hmm_mm_init(struct mm_struct *mm) {} +#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) struct hmm_devmem; @@ -550,16 +560,9 @@ struct hmm_device { struct hmm_device *hmm_device_new(void *drvdata); void hmm_device_put(struct hmm_device *hmm_device); #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ - -/* Below are for HMM internal use only! Not to be used by device driver! 
*/ -void hmm_mm_destroy(struct mm_struct *mm); - -static inline void hmm_mm_init(struct mm_struct *mm) -{ - mm->hmm = NULL; -} #else /* IS_ENABLED(CONFIG_HMM) */ static inline void hmm_mm_destroy(struct mm_struct *mm) {} static inline void hmm_mm_init(struct mm_struct *mm) {} #endif /* IS_ENABLED(CONFIG_HMM) */ + #endif /* LINUX_HMM_H */ From e8eddfd2d9cc00a65147353afed8398651a54736 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Tue, 10 Apr 2018 16:29:16 -0700 Subject: [PATCH 027/140] Documentation/vm/hmm.txt: typos and syntaxes fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes typos and syntax errors; thanks to Randy Dunlap for pointing them out (they were all my fault). Link: http://lkml.kernel.org/r/20180409151859.4713-1-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hmm.txt | 106 +++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/Documentation/vm/hmm.txt b/Documentation/vm/hmm.txt index e99b97003982..2d1d6f69e91b 100644 --- a/Documentation/vm/hmm.txt +++ b/Documentation/vm/hmm.txt @@ -1,22 +1,22 @@ Heterogeneous Memory Management (HMM) -Provide infrastructure and helpers to integrate non conventional memory (device -memory like GPU on board memory) into regular kernel code path. Corner stone of -this being specialize struct page for such memory (see sections 5 to 7 of this -document). +Provide infrastructure and helpers to integrate non-conventional memory (device +memory like GPU on board memory) into regular kernel path, with the cornerstone +of this being specialized struct page for such memory (see sections 5 to 7 of +this document). 
-HMM also provide optional helpers for SVM (Share Virtual Memory) ie allowing a -device to transparently access program address coherently with the CPU meaning -that any valid pointer on the CPU is also a valid pointer for the device. This -is becoming a mandatory to simplify the use of advance heterogeneous computing -where GPU, DSP, or FPGA are used to perform various computations on behalf of -a process. +HMM also provides optional helpers for SVM (Share Virtual Memory), i.e., +allowing a device to transparently access program address coherently with the +CPU meaning that any valid pointer on the CPU is also a valid pointer for the +device. This is becoming mandatory to simplify the use of advanced hetero- +geneous computing where GPU, DSP, or FPGA are used to perform various +computations on behalf of a process. This document is divided as follows: in the first section I expose the problems related to using device specific memory allocators. In the second section, I expose the hardware limitations that are inherent to many platforms. The third section gives an overview of the HMM design. The fourth section explains how -CPU page-table mirroring works and what is HMM's purpose in this context. The +CPU page-table mirroring works and the purpose of HMM in this context. The fifth section deals with how device memory is represented inside the kernel. Finally, the last section presents a new migration helper that allows lever- aging the device DMA engine. @@ -35,7 +35,7 @@ aging the device DMA engine. 1) Problems of using a device specific memory allocator: -Devices with a large amount of on board memory (several giga bytes) like GPUs +Devices with a large amount of on board memory (several gigabytes) like GPUs have historically managed their memory through dedicated driver specific APIs. 
This creates a disconnect between memory allocated and managed by a device driver and regular application memory (private anonymous, shared memory, or @@ -44,29 +44,29 @@ address space. I use shared address space to refer to the opposite situation: i.e., one in which any application memory region can be used by a device transparently. -Split address space because device can only access memory allocated through the -device specific API. This implies that all memory objects in a program are not -equal from the device point of view which complicates large programs that rely -on a wide set of libraries. +Split address space happens because device can only access memory allocated +through device specific API. This implies that all memory objects in a program +are not equal from the device point of view which complicates large programs +that rely on a wide set of libraries. -Concretly this means that code that wants to leverage devices like GPUs need to -copy object between genericly allocated memory (malloc, mmap private/share/) -and memory allocated through the device driver API (this still end up with an -mmap but of the device file). +Concretely this means that code that wants to leverage devices like GPUs needs +to copy object between generically allocated memory (malloc, mmap private, mmap +share) and memory allocated through the device driver API (this still ends up +with an mmap but of the device file). -For flat data-sets (array, grid, image, ...) this isn't too hard to achieve but -complex data-sets (list, tree, ...) are hard to get right. Duplicating a -complex data-set needs to re-map all the pointer relations between each of its +For flat data sets (array, grid, image, ...) this isn't too hard to achieve but +complex data sets (list, tree, ...) are hard to get right. Duplicating a +complex data set needs to re-map all the pointer relations between each of its elements. 
This is error prone and program gets harder to debug because of the -duplicate data-set and addresses. +duplicate data set and addresses. -Split address space also means that libraries can not transparently use data +Split address space also means that libraries cannot transparently use data they are getting from the core program or another library and thus each library -might have to duplicate its input data-set using the device specific memory +might have to duplicate its input data set using the device specific memory allocator. Large projects suffer from this and waste resources because of the various memory copies. -Duplicating each library API to accept as input or output memory allocted by +Duplicating each library API to accept as input or output memory allocated by each device specific allocator is not a viable option. It would lead to a combinatorial explosion in the library entry points. @@ -81,16 +81,16 @@ a shared address space for all other patterns. 2) I/O bus, device memory characteristics -I/O buses cripple shared address due to few limitations. Most I/O buses only -allow basic memory access from device to main memory, even cache coherency is -often optional. Access to device memory from CPU is even more limited. More -often than not, it is not cache coherent. +I/O buses cripple shared address spaces due to a few limitations. Most I/O +buses only allow basic memory access from device to main memory; even cache +coherency is often optional. Access to device memory from CPU is even more +limited. More often than not, it is not cache coherent. If we only consider the PCIE bus, then a device can access main memory (often through an IOMMU) and be cache coherent with the CPUs. However, it only allows a limited set of atomic operations from device on main memory. This is worse -in the other direction, the CPU can only access a limited range of the device -memory and can not perform atomic operations on it. 
Thus device memory can not +in the other direction: the CPU can only access a limited range of the device +memory and cannot perform atomic operations on it. Thus device memory cannot be considered the same as regular memory from the kernel point of view. Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0 @@ -99,14 +99,14 @@ The final limitation is latency. Access to main memory from the device has an order of magnitude higher latency than when the device accesses its own memory. Some platforms are developing new I/O buses or additions/modifications to PCIE -to address some of these limitations (OpenCAPI, CCIX). They mainly allow two +to address some of these limitations (OpenCAPI, CCIX). They mainly allow two- way cache coherency between CPU and device and allow all atomic operations the -architecture supports. Saddly, not all platforms are following this trend and +architecture supports. Sadly, not all platforms are following this trend and some major architectures are left without hardware solutions to these problems. -So for shared address space to make sense, not only must we allow device to -access any memory memory but we must also permit any memory to be migrated to -device memory while device is using it (blocking CPU access while it happens). +So for shared address space to make sense, not only must we allow devices to +access any memory but we must also permit any memory to be migrated to device +memory while device is using it (blocking CPU access while it happens). ------------------------------------------------------------------------------- @@ -123,13 +123,13 @@ while keeping track of CPU page table updates. Device page table updates are not as easy as CPU page table updates. To update the device page table, you must allocate a buffer (or use a pool of pre-allocated buffers) and write GPU specific commands in it to perform the update (unmap, cache invalidations, and -flush, ...). 
This can not be done through common code for all devices. Hence +flush, ...). This cannot be done through common code for all devices. Hence why HMM provides helpers to factor out everything that can be while leaving the hardware specific details to the device driver. -The second mechanism HMM provides, is a new kind of ZONE_DEVICE memory that +The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that allows allocating a struct page for each page of the device memory. Those pages -are special because the CPU can not map them. However, they allow migrating +are special because the CPU cannot map them. However, they allow migrating main memory to device memory using existing migration mechanisms and everything looks like a page is swapped out to disk from the CPU point of view. Using a struct page gives the easiest and cleanest integration with existing mm mech- @@ -144,7 +144,7 @@ address A triggers a page fault and initiates a migration back to main memory. With these two features, HMM not only allows a device to mirror process address space and keeping both CPU and device page table synchronized, but also lever- -ages device memory by migrating the part of the data-set that is actively being +ages device memory by migrating the part of the data set that is actively being used by the device. @@ -154,7 +154,7 @@ used by the device. Address space mirroring's main objective is to allow duplication of a range of CPU page table into a device page table; HMM helps keep both synchronized. 
A -device driver that want to mirror a process address space must start with the +device driver that wants to mirror a process address space must start with the registration of an hmm_mirror struct: int hmm_mirror_register(struct hmm_mirror *mirror, @@ -162,7 +162,7 @@ registration of an hmm_mirror struct: int hmm_mirror_register_locked(struct hmm_mirror *mirror, struct mm_struct *mm); -The locked variant is to be use when the driver is already holding the mmap_sem +The locked variant is to be used when the driver is already holding mmap_sem of the mm in write mode. The mirror struct has a set of callbacks that are used to propagate CPU page tables: @@ -210,8 +210,8 @@ use either: bool block); The first one (hmm_vma_get_pfns()) will only fetch present CPU page table -entries and will not trigger a page fault on missing or non present entries. -The second one does trigger a page fault on missing or read only entry if the +entries and will not trigger a page fault on missing or non-present entries. +The second one does trigger a page fault on missing or read-only entry if the write parameter is true. Page faults use the generic mm page fault code path just like a CPU page fault. @@ -251,10 +251,10 @@ HMM implements all this on top of the mmu_notifier API because we wanted a simpler API and also to be able to perform optimizations latter on like doing concurrent device updates in multi-devices scenario. -HMM also serves as an impedence mismatch between how CPU page table updates +HMM also serves as an impedance mismatch between how CPU page table updates are done (by CPU write to the page table and TLB flushes) and how devices update their own page table. Device updates are a multi-step process. First, -appropriate commands are writen to a buffer, then this buffer is scheduled for +appropriate commands are written to a buffer, then this buffer is scheduled for execution on the device. 
It is only once the device has executed commands in the buffer that the update is done. Creating and scheduling the update command buffer can happen concurrently for multiple devices. Waiting for each device to @@ -302,7 +302,7 @@ The hmm_devmem_ops is where most of the important things are: The first callback (free()) happens when the last reference on a device page is dropped. This means the device page is now free and no longer used by anyone. The second callback happens whenever the CPU tries to access a device page -which it can not do. This second callback must trigger a migration back to +which it cannot do. This second callback must trigger a migration back to system memory. @@ -310,7 +310,7 @@ system memory. 6) Migration to and from device memory -Because the CPU can not access device memory, migration must use the device DMA +Because the CPU cannot access device memory, migration must use the device DMA engine to perform copy from and to device memory. For this we need a new migration helper: @@ -326,7 +326,7 @@ migration helper: Unlike other migration functions it works on a range of virtual address, there are two reasons for that. First, device DMA copy has a high setup overhead cost and thus batching multiple pages is needed as otherwise the migration overhead -makes the whole exersize pointless. The second reason is because the +makes the whole exercise pointless. The second reason is because the migration might be for a range of addresses the device is actively accessing. The migrate_vma_ops struct defines two callbacks. First one (alloc_and_copy()) @@ -375,7 +375,7 @@ file backed page or shmem if device page is used for shared memory). This is a deliberate choice to keep existing applications, that might start using device memory without knowing about it, running unimpacted. 
-A Drawback is that the OOM killer might kill an application using a lot of +A drawback is that the OOM killer might kill an application using a lot of device memory and not a lot of regular system memory and thus not freeing much system memory. We want to gather more real world experience on how applications and system react under memory pressure in the presence of device memory before @@ -385,7 +385,7 @@ deciding to account device memory differently. Same decision was made for memory cgroup. Device memory pages are accounted against same memory cgroup a regular page would be accounted to. This does simplify migration to and from device memory. This also means that migration -back from device memory to regular memory can not fail because it would +back from device memory to regular memory cannot fail because it would go above memory cgroup limit. We might revisit this choice latter on once we get more experience in how device memory is used and its impact on memory resource control. From 09a913a7a947fb6f624379e9da22670994942b85 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Apr 2018 16:29:20 -0700 Subject: [PATCH 028/140] sched/numa: avoid trapping faults and attempting migration of file-backed dirty pages change_pte_range is called from task work context to mark PTEs for receiving NUMA faulting hints. If the marked pages are dirty then migration may fail. Some filesystems cannot migrate dirty pages without blocking so are skipped in MIGRATE_ASYNC mode which just wastes CPU. Even when they can, it can be a waste of cycles when the pages are shared forcing higher scan rates. This patch avoids marking shared dirty pages for hinting faults but also will skip a migration if the page was dirtied after the scanner updated a clean page. This is most noticeable running the NASA Parallel Benchmark when backed by btrfs, the default root filesystem for some distributions, but also noticeable when using XFS. 
The following are results from a 4-socket machine running a 4.16-rc4 kernel with some scheduler patches that are pending for the next merge window. 4.16.0-rc4 4.16.0-rc4 schedtip-20180309 nodirty-v1 Time cg.D 459.07 ( 0.00%) 444.21 ( 3.24%) Time ep.D 76.96 ( 0.00%) 77.69 ( -0.95%) Time is.D 25.55 ( 0.00%) 27.85 ( -9.00%) Time lu.D 601.58 ( 0.00%) 596.87 ( 0.78%) Time mg.D 107.73 ( 0.00%) 108.22 ( -0.45%) is.D regresses slightly in terms of absolute time but note that that particular load varies quite a bit from run to run. The more relevant observation is the total system CPU usage. 4.16.0-rc4 4.16.0-rc4 schedtip-20180309 nodirty-v1 User 71471.91 70627.04 System 11078.96 8256.13 Elapsed 661.66 632.74 That is a substantial drop in system CPU usage and overall the workload completes faster. The NUMA balancing statistics are also interesting NUMA base PTE updates 111407972 139848884 NUMA huge PMD updates 206506 264869 NUMA page range updates 217139044 275461812 NUMA hint faults 4300924 3719784 NUMA hint local faults 3012539 3416618 NUMA hint local percent 70 91 NUMA pages migrated 1517487 1358420 While more PTEs are scanned due to changes in what faults are gathered, it's clear that a far higher percentage of faults are local as the bulk of the remote hits were dirty pages that, in this case with btrfs, had no chance of migrating. The following is a comparison when using XFS as that is a more realistic filesystem choice for a data partition 4.16.0-rc4 4.16.0-rc4 schedtip-20180309 nodirty-v1r47 Time cg.D 485.28 ( 0.00%) 442.62 ( 8.79%) Time ep.D 77.68 ( 0.00%) 77.54 ( 0.18%) Time is.D 26.44 ( 0.00%) 24.79 ( 6.24%) Time lu.D 597.46 ( 0.00%) 597.11 ( 0.06%) Time mg.D 142.65 ( 0.00%) 105.83 ( 25.81%) That is a reasonable gain on two relatively long-lived workloads. While not presented, there is also a substantial drop in system CPU usage and the NUMA balancing stats show similar improvements in locality as btrfs did.
Link: http://lkml.kernel.org/r/20180326094334.zserdec62gwmmfqf@techsingularity.net Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 7 +++++++ mm/mprotect.c | 9 +++++++++ 2 files changed, 16 insertions(+) diff --git a/mm/migrate.c b/mm/migrate.c index 003886606a22..337a40201c82 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1986,6 +1986,13 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, (vma->vm_flags & VM_EXEC)) goto out; + /* + * Also do not migrate dirty pages as not all filesystems can move + * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. + */ + if (page_is_file_cache(page) && PageDirty(page)) + goto out; + /* * Rate-limit the amount of data that is being migrated to a node. * Optimal placement is no good if the memory bus is saturated and diff --git a/mm/mprotect.c b/mm/mprotect.c index c1d6af7455da..625608bc8962 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -89,6 +90,14 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page_mapcount(page) != 1) continue; + /* + * While migration can move some dirty pages, + * it cannot move them all from MIGRATE_ASYNC + * context. + */ + if (page_is_file_cache(page) && PageDirty(page)) + continue; + /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) continue; From bc8755ba66325cb5874eef81d935e91c331d0b1d Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 10 Apr 2018 16:29:23 -0700 Subject: [PATCH 029/140] mm: check __highest_present_section_nr directly in memory_dev_init() __highest_present_section_nr is a more strict boundary than NR_MEM_SECTIONS. So checking __highest_present_section_nr directly is enough. 
Link: http://lkml.kernel.org/r/20180330032044.21647-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Andrew Morton Cc: Dave Hansen Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 79fcd2bae96b..bffe8616bd55 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -837,11 +837,8 @@ int __init memory_dev_init(void) * during boot and have been initialized */ mutex_lock(&mem_sysfs_mutex); - for (i = 0; i < NR_MEM_SECTIONS; i += sections_per_block) { - /* Don't iterate over sections we know are !present: */ - if (i > __highest_present_section_nr) - break; - + for (i = 0; i <= __highest_present_section_nr; + i += sections_per_block) { err = add_memory_block(i); if (!ret) ret = err; From 07707125aec6a7529900616ba491210ec3d85fc6 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Tue, 10 Apr 2018 16:29:27 -0700 Subject: [PATCH 030/140] mm/migrate: properly preserve write attribute in special migrate entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use of pte_write(pte) is only valid for present pte, the common code which sets the migration entry can be reached for both valid present pte and special swap entry (for device memory). Fix the code to use the mpfn value which properly handles both cases. On x86 this did not have any bad side effect because pte write bit is below PAGE_BIT_GLOBAL and thus special swap entries have it set to 0 which in turn means we were always creating read only special migration entry. So once migration did finish we always write protected the CPU page table entry (moreover this is only an issue when migrating from device memory to system memory). End effect is that CPU write access would fault again and restore write permission.
This behaviour isn't too bad; it just burns CPU cycles by forcing CPU to take a second fault on write access, i.e. double faulting the same address. There is no corruption or incorrect states (it behaves as a COWed page from a fork with a mapcount of 1). Link: http://lkml.kernel.org/r/20180402023506.12180-1-jglisse@redhat.com Signed-off-by: Ralph Campbell Signed-off-by: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index 337a40201c82..1e59a3f967e4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2346,7 +2346,8 @@ again: ptep_get_and_clear(mm, addr, ptep); /* Setup special migration page table entry */ - entry = make_migration_entry(page, pte_write(pte)); + entry = make_migration_entry(page, mpfn & + MIGRATE_PFN_WRITE); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pte)) swp_pte = pte_swp_mksoft_dirty(swp_pte); From 2a70f6a76bb86d1f39a34b7764f6dcc4257b0356 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Apr 2018 16:29:30 -0700 Subject: [PATCH 031/140] memcg, thp: do not invoke oom killer on thp charges A THP memcg charge can trigger the oom killer since 2516035499b9 ("mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations"). We have used an explicit __GFP_NORETRY previously which ruled out the OOM killer automagically. Memcg charge path should be semantically compliant with the allocation path and that means that if we do not trigger the OOM killer for costly orders then we should do the same in the memcg charge path as well. Otherwise we are forcing callers to distinguish the two and use different gfp masks which is both non-intuitive and bug-prone. As soon as we get a costly high order kmalloc user we do not even have any means to tell the memcg specific gfp mask to prevent from OOM because the charging is deep within guts of the slab allocator.
The unexpected memcg OOM on THP has already been fixed upstream by 9d3c3354bb85 ("mm, thp: do not cause memcg oom for thp") but this is a one-off fix rather than a generic solution. Teach mem_cgroup_oom to bail out on costly order requests to fix the THP issue as well as any other costly OOM eligible allocations to be added in future. Also revert 9d3c3354bb85 because special gfp for THP is no longer needed. Link: http://lkml.kernel.org/r/20180403193129.22146-1-mhocko@kernel.org Fixes: 2516035499b9 ("mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations") Signed-off-by: Michal Hocko Acked-by: Johannes Weiner Cc: "Kirill A. Shutemov" Cc: Vlastimil Babka Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 ++--- mm/khugepaged.c | 8 ++------ mm/memcontrol.c | 2 +- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f0ae8d1d4329..229ab8c75a6b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -555,8 +555,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg, - true)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1317,7 +1316,7 @@ alloc: } if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, - huge_gfp | __GFP_NORETRY, &memcg, true))) { + huge_gfp, &memcg, true))) { put_page(new_page); split_huge_pmd(vma, vmf->pmd, vmf->address); if (page) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e42568284e06..c15da1ea7e63 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -965,9 +965,7 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - /* Do not oom kill for khugepaged charges */ - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY, - &memcg, true))) { 
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } @@ -1326,9 +1324,7 @@ static void collapse_shmem(struct mm_struct *mm, goto out; } - /* Do not oom kill for khugepaged charges */ - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY, - &memcg, true))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9ec024b862ac..6b4f5c0a8eef 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1485,7 +1485,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_may_oom) + if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER) return; /* * We are in the middle of the charge context here, so we From 1ec6995d1290bfb87cc3a51f0836c889e857cef9 Mon Sep 17 00:00:00 2001 From: Xidong Wang Date: Tue, 10 Apr 2018 16:29:34 -0700 Subject: [PATCH 032/140] z3fold: fix memory leak In z3fold_create_pool(), the memory allocated by __alloc_percpu() is not released on the error path that pool->compact_wq , which holds the return value of create_singlethread_workqueue(), is NULL. This will result in a memory leak bug. 
[akpm@linux-foundation.org: fix oops on kzalloc() failure, check __alloc_percpu() retval] Link: http://lkml.kernel.org/r/1522803111-29209-1-git-send-email-wangxidong_97@163.com Signed-off-by: Xidong Wang Reviewed-by: Andrew Morton Cc: Vitaly Wool Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index f579ad4a8100..c6b1fb0d85a5 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -467,6 +467,8 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, spin_lock_init(&pool->lock); spin_lock_init(&pool->stale_lock); pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); + if (!pool->unbuddied) + goto out_pool; for_each_possible_cpu(cpu) { struct list_head *unbuddied = per_cpu_ptr(pool->unbuddied, cpu); @@ -479,7 +481,7 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, pool->name = name; pool->compact_wq = create_singlethread_workqueue(pool->name); if (!pool->compact_wq) - goto out; + goto out_unbuddied; pool->release_wq = create_singlethread_workqueue(pool->name); if (!pool->release_wq) goto out_wq; @@ -489,8 +491,11 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, out_wq: destroy_workqueue(pool->compact_wq); -out: +out_unbuddied: + free_percpu(pool->unbuddied); +out_pool: kfree(pool); +out: return NULL; } From 8a97ea546bb6532f77a0efe165012ee0d0c4b903 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:29:37 -0700 Subject: [PATCH 033/140] mm/z3fold.c: use gfpflags_allow_blocking We have a perfectly good macro to determine whether the gfp flags allow you to sleep or not; use it instead of trying to infer it. 
Link: http://lkml.kernel.org/r/20180408062206.GC16007@bombadil.infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index c6b1fb0d85a5..c0bca6153b95 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -538,7 +538,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, struct z3fold_header *zhdr = NULL; struct page *page = NULL; enum buddy bud; - bool can_sleep = (gfp & __GFP_RECLAIM) == __GFP_RECLAIM; + bool can_sleep = gfpflags_allow_blocking(gfp); if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; From a38c015f3156895b07e71d4e4414289f8a3b2745 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 10 Apr 2018 16:29:41 -0700 Subject: [PATCH 034/140] mm/ksm.c: fix inconsistent accounting of zero pages When using KSM with use_zero_pages, we replace anonymous pages containing only zeroes with actual zero pages, which are not anonymous. We need to do proper accounting of the mm counters, otherwise we will get wrong values in /proc and a BUG message in dmesg when tearing down the mm. Link: http://lkml.kernel.org/r/1522931274-15552-1-git-send-email-imbrenda@linux.vnet.ibm.com Fixes: e86c59b1b1 ("mm/ksm: improve deduplication of zero pages with colouring") Signed-off-by: Claudio Imbrenda Reviewed-by: Andrew Morton Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Kirill A. 
Shutemov Cc: Hugh Dickins Cc: Christian Borntraeger Cc: Gerald Schaefer Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/ksm.c b/mm/ksm.c index e8d6c6210b80..e3cbf9a92f3c 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1131,6 +1131,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, } else { newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)); + /* + * We're replacing an anonymous page with a zero page, which is + * not anonymous. We need to do proper accounting otherwise we + * will get wrong values in /proc, and a BUG message in dmesg + * when tearing down the mm. + */ + dec_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(*ptep)); From e27be240df53f1a20c659168e722b5d9f16cc7f4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Apr 2018 16:29:45 -0700 Subject: [PATCH 035/140] mm: memcg: make sure memory.events is uptodate when waking pollers Commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in memory.stat reporting") added per-cpu drift to all memory cgroup stats and events shown in memory.stat and memory.events. For memory.stat this is acceptable. But memory.events issues file notifications, and somebody polling the file for changes will be confused when the counters in it are unchanged after a wakeup. Luckily, the events in memory.events - MEMCG_LOW, MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM - are sufficiently rare and high-level that we don't need per-cpu buffering for them: MEMCG_HIGH and MEMCG_MAX would be the most frequent, but they're counting invocations of reclaim, which is a complex operation that touches many shared cachelines. This splits memory.events from the generic VM events and tracks them in their own, unbuffered atomic counters. That's also cleaner, as it eliminates the ugly enum nesting of VM and cgroup events. 
[hannes@cmpxchg.org: "array subscript is above array bounds"] Link: http://lkml.kernel.org/r/20180406155441.GA20806@cmpxchg.org Link: http://lkml.kernel.org/r/20180405175507.GA24817@cmpxchg.org Fixes: a983b5ebee57 ("mm: memcontrol: fix excessive complexity in memory.stat reporting") Signed-off-by: Johannes Weiner Reported-by: Tejun Heo Acked-by: Tejun Heo Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Rik van Riel Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 35 ++++++++++++++++++----------------- mm/memcontrol.c | 28 ++++++++++++++++------------ mm/vmscan.c | 2 +- 3 files changed, 35 insertions(+), 30 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f292efac378d..d99b71bc2c66 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -48,13 +48,12 @@ enum memcg_stat_item { MEMCG_NR_STAT, }; -/* Cgroup-specific events, on top of universal VM events */ -enum memcg_event_item { - MEMCG_LOW = NR_VM_EVENT_ITEMS, +enum memcg_memory_event { + MEMCG_LOW, MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM, - MEMCG_NR_EVENTS, + MEMCG_NR_MEMORY_EVENTS, }; struct mem_cgroup_reclaim_cookie { @@ -88,7 +87,7 @@ enum mem_cgroup_events_target { struct mem_cgroup_stat_cpu { long count[MEMCG_NR_STAT]; - unsigned long events[MEMCG_NR_EVENTS]; + unsigned long events[NR_VM_EVENT_ITEMS]; unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; }; @@ -205,7 +204,8 @@ struct mem_cgroup { /* OOM-Killer disable */ int oom_kill_disable; - /* handle for "memory.events" */ + /* memory.events */ + atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; struct cgroup_file events_file; /* protect arrays of thresholds */ @@ -234,9 +234,10 @@ struct mem_cgroup { struct task_struct *move_lock_task; unsigned long move_lock_flags; + /* memory.stat */ struct mem_cgroup_stat_cpu __percpu *stat_cpu; atomic_long_t stat[MEMCG_NR_STAT]; - atomic_long_t 
events[MEMCG_NR_EVENTS]; + atomic_long_t events[NR_VM_EVENT_ITEMS]; unsigned long socket_pressure; @@ -648,9 +649,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); -/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void __count_memcg_events(struct mem_cgroup *memcg, - int idx, unsigned long count) + enum vm_event_item idx, + unsigned long count) { unsigned long x; @@ -666,7 +667,8 @@ static inline void __count_memcg_events(struct mem_cgroup *memcg, } static inline void count_memcg_events(struct mem_cgroup *memcg, - int idx, unsigned long count) + enum vm_event_item idx, + unsigned long count) { unsigned long flags; @@ -675,9 +677,8 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, local_irq_restore(flags); } -/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void count_memcg_page_event(struct page *page, - int idx) + enum vm_event_item idx) { if (page->mem_cgroup) count_memcg_events(page->mem_cgroup, idx, 1); @@ -701,10 +702,10 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, rcu_read_unlock(); } -static inline void mem_cgroup_event(struct mem_cgroup *memcg, - enum memcg_event_item event) +static inline void memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event) { - count_memcg_events(memcg, event, 1); + atomic_long_inc(&memcg->memory_events[event]); cgroup_file_notify(&memcg->events_file); } @@ -724,8 +725,8 @@ static inline bool mem_cgroup_disabled(void) return true; } -static inline void mem_cgroup_event(struct mem_cgroup *memcg, - enum memcg_event_item event) +static inline void memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6b4f5c0a8eef..f314334546a2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1839,7 +1839,7 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) } } - for (i = 0; 
i < MEMCG_NR_EVENTS; i++) { + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { long x; x = this_cpu_xchg(memcg->stat_cpu->events[i], 0); @@ -1858,7 +1858,7 @@ static void reclaim_high(struct mem_cgroup *memcg, do { if (page_counter_read(&memcg->memory) <= memcg->high) continue; - mem_cgroup_event(memcg, MEMCG_HIGH); + memcg_memory_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); } while ((memcg = parent_mem_cgroup(memcg))); } @@ -1949,7 +1949,7 @@ retry: if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; - mem_cgroup_event(mem_over_limit, MEMCG_MAX); + memcg_memory_event(mem_over_limit, MEMCG_MAX); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap); @@ -1992,7 +1992,7 @@ retry: if (fatal_signal_pending(current)) goto force; - mem_cgroup_event(mem_over_limit, MEMCG_OOM); + memcg_memory_event(mem_over_limit, MEMCG_OOM); mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); @@ -2688,10 +2688,10 @@ static void tree_events(struct mem_cgroup *memcg, unsigned long *events) struct mem_cgroup *iter; int i; - memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS); + memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS); for_each_mem_cgroup_tree(iter, memcg) { - for (i = 0; i < MEMCG_NR_EVENTS; i++) + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) events[i] += memcg_sum_events(iter, i); } } @@ -5178,7 +5178,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, continue; } - mem_cgroup_event(memcg, MEMCG_OOM); + memcg_memory_event(memcg, MEMCG_OOM); if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) break; } @@ -5191,10 +5191,14 @@ static int memory_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW)); - seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH)); - seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX)); - seq_printf(m, "oom %lu\n", 
memcg_sum_events(memcg, MEMCG_OOM)); + seq_printf(m, "low %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_LOW])); + seq_printf(m, "high %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_HIGH])); + seq_printf(m, "max %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_MAX])); + seq_printf(m, "oom %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_OOM])); seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); return 0; @@ -5204,7 +5208,7 @@ static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); unsigned long stat[MEMCG_NR_STAT]; - unsigned long events[MEMCG_NR_EVENTS]; + unsigned long events[NR_VM_EVENT_ITEMS]; int i; /* diff --git a/mm/vmscan.c b/mm/vmscan.c index a1d7ba0136fe..671597ce1ea0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2530,7 +2530,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->memcg_low_skipped = 1; continue; } - mem_cgroup_event(memcg, MEMCG_LOW); + memcg_memory_event(memcg, MEMCG_LOW); } reclaimed = sc->nr_reclaimed; From a06ad633a37c64a0cd4c229fc605cee8725d376e Mon Sep 17 00:00:00 2001 From: Tom Abraham Date: Tue, 10 Apr 2018 16:29:48 -0700 Subject: [PATCH 036/140] swap: divide-by-zero when zero length swap file on ssd Calling swapon() on a zero length swap file on SSD can lead to a divide-by-zero. Although creating such files isn't possible with mkswap and they would be considered invalid, it would be better for the swapon code to be more robust and handle this condition gracefully (return -EINVAL). Especially since the fix is small and straightforward. To help with wear leveling on SSD, the swapon syscall calculates a random position in the swap file using modulo p->highest_bit, which is set to maxpages - 1 in read_swap_header. If the swap file is zero length, read_swap_header sets maxpages=1 and last_page=0, resulting in p->highest_bit=0 and we divide-by-zero when we modulo p->highest_bit in swapon syscall.
This can be prevented by having read_swap_header return zero if last_page is zero. Link: http://lkml.kernel.org/r/5AC747C1020000A7001FA82C@prv-mh.provo.novell.com Signed-off-by: Thomas Abraham Reported-by: Reviewed-by: Andrew Morton Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index c7a33717d079..a134d1e86795 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2961,6 +2961,10 @@ static unsigned long read_swap_header(struct swap_info_struct *p, maxpages = swp_offset(pte_to_swp_entry( swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; last_page = swap_header->info.last_page; + if (!last_page) { + pr_warn("Empty swap-file\n"); + return 0; + } if (last_page > maxpages) { pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", maxpages << (PAGE_SHIFT - 10), From 4eaf431f6f71bbed40a4c733ffe93a7e8cedf9d9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Apr 2018 16:29:52 -0700 Subject: [PATCH 037/140] memcg: fix per_node_info cleanup syzbot has triggered a NULL ptr dereference when allocation fault injection enforces a failure and alloc_mem_cgroup_per_node_info initializes memcg->nodeinfo only half way through. But __mem_cgroup_free still tries to free all per-node data and dereferences pn->lruvec_stat_cpu unconditionally even if the specific per-node data hasn't been initialized. The bug is quite unlikely to hit because small allocations do not fail and we would need quite some numa nodes to make struct mem_cgroup_per_node large enough to cross the costly order.
Link: http://lkml.kernel.org/r/20180406100906.17790-1-mhocko@kernel.org Reported-by: syzbot+8a5de3cce7cdc70e9ebe@syzkaller.appspotmail.com Fixes: 00f3ca2c2d66 ("mm: memcontrol: per-lruvec stats infrastructure") Signed-off-by: Michal Hocko Reviewed-by: Andrey Ryabinin Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f314334546a2..7978c6faae06 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4108,6 +4108,9 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + if (!pn) + return; + free_percpu(pn->lruvec_stat_cpu); kfree(pn); } From bfc6b1cabce28d46d2559855ca224992e14fede7 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 10 Apr 2018 16:29:55 -0700 Subject: [PATCH 038/140] mm/swapfile.c: make pointer swap_avail_heads static The pointer swap_avail_heads is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: mm/swapfile.c:88:19: warning: symbol 'swap_avail_heads' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20180206215836.12366-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Andrew Morton Acked-by: "Huang, Ying" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index a134d1e86795..cc2cf04d9018 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. 
*/ -struct plist_head *swap_avail_heads; +static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; From a49bd4d7163707de377aee062f17befef6da891b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Apr 2018 16:29:59 -0700 Subject: [PATCH 039/140] mm, numa: rework do_pages_move Patch series "unclutter thp migration" Motivation: THP migration is hacked into the generic migration with rather surprising semantic. The migration allocation callback is supposed to check whether the THP can be migrated at once and if that is not the case then it allocates a simple page to migrate. unmap_and_move then fixes that up by splitting the THP into small pages while moving the head page to the newly allocated order-0 page. Remaining pages are moved to the LRU list by split_huge_page. The same happens if the THP allocation fails. This is really ugly and error prone [2]. I also believe that split_huge_page to the LRU lists is inherently wrong because all tail pages are not migrated. Some callers will just work around that by retrying (e.g. memory hotplug). There are other pfn walkers which are simply broken though. e.g. madvise_inject_error will migrate head and then advances next pfn by the huge page size. do_move_page_to_node_array, queue_pages_range (migrate_pages, mbind), will simply split the THP before migration if the THP migration is not supported then falls back to single page migration but it doesn't handle tail pages if the THP migration path is not able to allocate a fresh THP so we end up with ENOMEM and fail the whole migration which is a questionable behavior. Page compaction doesn't try to migrate large pages so it should be immune. The first patch reworks do_pages_move which relies on a very ugly calling semantic when the return status is pushed to the migration path via private pointer. It uses pre allocated fixed size batching to achieve that. 
We simply cannot do the same if a THP is to be split during the migration path which is done in the patch 3. Patch 2 is follow up cleanup which removes the mentioned return status calling convention ugliness. On a side note: There are some semantic issues I have encountered on the way when working on patch 1 but I am not addressing them here. E.g. trying to move THP tail pages will result in either success or EBUSY (the latter one more likely once we isolate head from the LRU list). Hugetlb reports EACCES on tail pages. Some errors are reported via status parameter but migration failures are not even though the original `reason' argument suggests there was an intention to do so. From a quick look into git history this never worked. I have tried to keep the semantic unchanged. Then there is a relatively minor thing that the page isolation might fail because of pages not being on the LRU - e.g. because they are sitting on the per-cpu LRU caches. Easily fixable. This patch (of 3): do_pages_move is supposed to move user defined memory (an array of addresses) to the user defined numa nodes (an array of nodes one for each address). The user provided status array then contains resulting numa node for each address or an error. The semantic of this function is a little bit confusing because only some errors are reported back. Notably migrate_pages error is only reported via the return value. This patch doesn't try to address these semantic nuances but rather changes the underlying implementation. Currently we are processing user input (which can be really large) in batches which are stored to a temporarily allocated page. Each address is resolved to its struct page and stored to page_to_node structure along with the requested target numa node. The array of these structures is then conveyed down the page migration path via private argument. new_page_node then finds the corresponding structure and allocates the proper target page.
What is the problem with the current implementation and why to change it? Apart from being quite ugly it also doesn't cope with unexpected pages showing up on the migration list inside migrate_pages path. That doesn't happen currently but the follow up patch would like to make the thp migration code more clear and that would need to split a THP into the list for some cases. How does the new implementation work? Well, instead of batching into a fixed size array we simply batch all pages that should be migrated to the same node and isolate all of them into a linked list which doesn't require any additional storage. This should work reasonably well because page migration usually migrates larger ranges of memory to a specific node. So the common case should work equally well as the current implementation. Even if somebody constructs an input where the target numa nodes would be interleaved we shouldn't see a large performance impact because page migration alone doesn't really benefit from batching. mmap_sem batching for the lookup is quite questionable and isolate_lru_page which would benefit from batching is not using it even in the current implementation. Link: http://lkml.kernel.org/r/20180103082555.14592-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Kirill A. Shutemov Reviewed-by: Andrew Morton Cc: Anshuman Khandual Cc: Zi Yan Cc: Naoya Horiguchi Cc: Vlastimil Babka Cc: Andrea Reale Cc: Kirill A. 
Shutemov Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 1 + mm/mempolicy.c | 5 +- mm/migrate.c | 314 ++++++++++++++++++++++--------------------------- 3 files changed, 142 insertions(+), 178 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index e6bd35182dae..1a1bb5d59c15 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -538,4 +538,5 @@ static inline bool is_migrate_highatomic_page(struct page *page) } void setup_zone_pageset(struct zone *zone); +extern struct page *alloc_new_node_page(struct page *page, unsigned long node, int **x); #endif /* __MM_INTERNAL_H */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 01cbb7078d6c..947e73feea41 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -942,7 +942,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, } } -static struct page *new_node_page(struct page *page, unsigned long node, int **x) +/* page allocation callback for NUMA node migration */ +struct page *alloc_new_node_page(struct page *page, unsigned long node, int **x) { if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), @@ -986,7 +987,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, NULL, dest, + err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); diff --git a/mm/migrate.c b/mm/migrate.c index 1e59a3f967e4..6068d4049a31 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1444,141 +1444,103 @@ out: } #ifdef CONFIG_NUMA -/* - * Move a list of individual pages - */ -struct page_to_node { - unsigned long addr; - struct page *page; - int node; - int status; -}; -static struct page *new_page_node(struct page *p, unsigned long private, - int **result) +static int store_status(int __user *status, int start, int value, int nr) { - 
struct page_to_node *pm = (struct page_to_node *)private; + while (nr-- > 0) { + if (put_user(value, status + start)) + return -EFAULT; + start++; + } - while (pm->node != MAX_NUMNODES && pm->page != p) - pm++; + return 0; +} - if (pm->node == MAX_NUMNODES) - return NULL; +static int do_move_pages_to_node(struct mm_struct *mm, + struct list_head *pagelist, int node) +{ + int err; - *result = &pm->status; + if (list_empty(pagelist)) + return 0; - if (PageHuge(p)) - return alloc_huge_page_node(page_hstate(compound_head(p)), - pm->node); - else if (thp_migration_supported() && PageTransHuge(p)) { - struct page *thp; - - thp = alloc_pages_node(pm->node, - (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, - HPAGE_PMD_ORDER); - if (!thp) - return NULL; - prep_transhuge_page(thp); - return thp; - } else - return __alloc_pages_node(pm->node, - GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); + err = migrate_pages(pagelist, alloc_new_node_page, NULL, node, + MIGRATE_SYNC, MR_SYSCALL); + if (err) + putback_movable_pages(pagelist); + return err; } /* - * Move a set of pages as indicated in the pm array. The addr - * field must be set to the virtual address of the page to be moved - * and the node number must contain a valid target node. - * The pm array ends with node = MAX_NUMNODES. + * Resolves the given address to a struct page, isolates it from the LRU and + * puts it to the given pagelist. 
+ * Returns -errno if the page cannot be found/isolated or 0 when it has been + * queued or the page doesn't need to be migrated because it is already on + * the target node */ -static int do_move_page_to_node_array(struct mm_struct *mm, - struct page_to_node *pm, - int migrate_all) +static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, + int node, struct list_head *pagelist, bool migrate_all) { + struct vm_area_struct *vma; + struct page *page; + unsigned int follflags; int err; - struct page_to_node *pp; - LIST_HEAD(pagelist); down_read(&mm->mmap_sem); + err = -EFAULT; + vma = find_vma(mm, addr); + if (!vma || addr < vma->vm_start || !vma_migratable(vma)) + goto out; - /* - * Build a list of pages to migrate - */ - for (pp = pm; pp->node != MAX_NUMNODES; pp++) { - struct vm_area_struct *vma; - struct page *page; - struct page *head; - unsigned int follflags; + /* FOLL_DUMP to ignore special (like zero) pages */ + follflags = FOLL_GET | FOLL_DUMP; + if (!thp_migration_supported()) + follflags |= FOLL_SPLIT; + page = follow_page(vma, addr, follflags); - err = -EFAULT; - vma = find_vma(mm, pp->addr); - if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) - goto set_status; + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; - /* FOLL_DUMP to ignore special (like zero) pages */ - follflags = FOLL_GET | FOLL_DUMP; - if (!thp_migration_supported()) - follflags |= FOLL_SPLIT; - page = follow_page(vma, pp->addr, follflags); - - err = PTR_ERR(page); - if (IS_ERR(page)) - goto set_status; - - err = -ENOENT; - if (!page) - goto set_status; - - err = page_to_nid(page); - - if (err == pp->node) - /* - * Node already in the right place - */ - goto put_and_set; - - err = -EACCES; - if (page_mapcount(page) > 1 && - !migrate_all) - goto put_and_set; - - if (PageHuge(page)) { - if (PageHead(page)) { - isolate_huge_page(page, &pagelist); - err = 0; - pp->page = page; - } - goto put_and_set; - } - - pp->page = compound_head(page); - head = 
compound_head(page); - err = isolate_lru_page(head); - if (!err) { - list_add_tail(&head->lru, &pagelist); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_cache(head), - hpage_nr_pages(head)); - } -put_and_set: - /* - * Either remove the duplicate refcount from - * isolate_lru_page() or drop the page ref if it was - * not isolated. - */ - put_page(page); -set_status: - pp->status = err; - } + err = -ENOENT; + if (!page) + goto out; err = 0; - if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_page_node, NULL, - (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); - if (err) - putback_movable_pages(&pagelist); - } + if (page_to_nid(page) == node) + goto out_putpage; + err = -EACCES; + if (page_mapcount(page) > 1 && !migrate_all) + goto out_putpage; + + if (PageHuge(page)) { + if (PageHead(page)) { + isolate_huge_page(page, pagelist); + err = 0; + } + } else { + struct page *head; + + head = compound_head(page); + err = isolate_lru_page(head); + if (err) + goto out_putpage; + + err = 0; + list_add_tail(&head->lru, pagelist); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + page_is_file_cache(head), + hpage_nr_pages(head)); + } +out_putpage: + /* + * Either remove the duplicate refcount from + * isolate_lru_page() or drop the page ref if it was + * not isolated. 
+ */ + put_page(page); +out: up_read(&mm->mmap_sem); return err; } @@ -1593,79 +1555,79 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, const int __user *nodes, int __user *status, int flags) { - struct page_to_node *pm; - unsigned long chunk_nr_pages; - unsigned long chunk_start; - int err; - - err = -ENOMEM; - pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); - if (!pm) - goto out; + int current_node = NUMA_NO_NODE; + LIST_HEAD(pagelist); + int start, i; + int err = 0, err1; migrate_prep(); - /* - * Store a chunk of page_to_node array in a page, - * but keep the last one as a marker - */ - chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1; + for (i = start = 0; i < nr_pages; i++) { + const void __user *p; + unsigned long addr; + int node; - for (chunk_start = 0; - chunk_start < nr_pages; - chunk_start += chunk_nr_pages) { - int j; + err = -EFAULT; + if (get_user(p, pages + i)) + goto out_flush; + if (get_user(node, nodes + i)) + goto out_flush; + addr = (unsigned long)p; - if (chunk_start + chunk_nr_pages > nr_pages) - chunk_nr_pages = nr_pages - chunk_start; + err = -ENODEV; + if (node < 0 || node >= MAX_NUMNODES) + goto out_flush; + if (!node_state(node, N_MEMORY)) + goto out_flush; - /* fill the chunk pm with addrs and nodes from user-space */ - for (j = 0; j < chunk_nr_pages; j++) { - const void __user *p; - int node; + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out_flush; - err = -EFAULT; - if (get_user(p, pages + j + chunk_start)) - goto out_pm; - pm[j].addr = (unsigned long) p; - - if (get_user(node, nodes + j + chunk_start)) - goto out_pm; - - err = -ENODEV; - if (node < 0 || node >= MAX_NUMNODES) - goto out_pm; - - if (!node_state(node, N_MEMORY)) - goto out_pm; - - err = -EACCES; - if (!node_isset(node, task_nodes)) - goto out_pm; - - pm[j].node = node; + if (current_node == NUMA_NO_NODE) { + current_node = node; + start = i; + } else if (node != current_node) { + err = 
do_move_pages_to_node(mm, &pagelist, current_node); + if (err) + goto out; + err = store_status(status, start, current_node, i - start); + if (err) + goto out; + start = i; + current_node = node; } - /* End marker for this chunk */ - pm[chunk_nr_pages].node = MAX_NUMNODES; + /* + * Errors in the page lookup or isolation are not fatal and we simply + * report them via status + */ + err = add_page_for_migration(mm, addr, current_node, + &pagelist, flags & MPOL_MF_MOVE_ALL); + if (!err) + continue; - /* Migrate this chunk */ - err = do_move_page_to_node_array(mm, pm, - flags & MPOL_MF_MOVE_ALL); - if (err < 0) - goto out_pm; + err = store_status(status, i, err, 1); + if (err) + goto out_flush; - /* Return status information */ - for (j = 0; j < chunk_nr_pages; j++) - if (put_user(pm[j].status, status + j + chunk_start)) { - err = -EFAULT; - goto out_pm; - } + err = do_move_pages_to_node(mm, &pagelist, current_node); + if (err) + goto out; + if (i > start) { + err = store_status(status, start, current_node, i - start); + if (err) + goto out; + } + current_node = NUMA_NO_NODE; } - err = 0; - -out_pm: - free_page((unsigned long)pm); +out_flush: + /* Make sure we do not overwrite the existing error */ + err1 = do_move_pages_to_node(mm, &pagelist, current_node); + if (!err1) + err1 = store_status(status, start, current_node, i - start); + if (!err) + err = err1; out: return err; } From 666feb21a0083e5b29ddd96588553ffa0cc357b6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Apr 2018 16:30:03 -0700 Subject: [PATCH 040/140] mm, migrate: remove reason argument from new_page_t No allocation callback is using this argument anymore. new_page_node used to use this parameter to convey node_id resp. migration error up to move_pages code (do_move_page_to_node_array). The error status never made it into the final status field and we have a better way to communicate node id to the status field now. 
All other allocation callbacks simply ignored the argument so we can drop it finally. [mhocko@suse.com: fix migration callback] Link: http://lkml.kernel.org/r/20180105085259.GH2801@dhcp22.suse.cz [akpm@linux-foundation.org: fix alloc_misplaced_dst_page()] [mhocko@kernel.org: fix build] Link: http://lkml.kernel.org/r/20180103091134.GB11319@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20180103082555.14592-3-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Zi Yan Cc: Andrea Reale Cc: Anshuman Khandual Cc: Kirill A. Shutemov Cc: Mike Kravetz Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/mmu_context_iommu.c | 3 +-- include/linux/migrate.h | 3 +-- include/linux/page-isolation.h | 3 +-- mm/compaction.c | 3 +-- mm/internal.h | 2 +- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 3 +-- mm/mempolicy.c | 6 +++--- mm/migrate.c | 21 +++------------------ mm/page_isolation.c | 3 +-- 10 files changed, 14 insertions(+), 35 deletions(-) diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index 9a8a084e4aba..4c615fcb0cf0 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -75,8 +75,7 @@ EXPORT_SYMBOL_GPL(mm_iommu_preregistered); /* * Taken from alloc_migrate_target with changes to remove CMA allocations */ -struct page *new_iommu_non_cma_page(struct page *page, unsigned long private, - int **resultp) +struct page *new_iommu_non_cma_page(struct page *page, unsigned long private) { gfp_t gfp_mask = GFP_USER; struct page *new_page; diff --git a/include/linux/migrate.h b/include/linux/migrate.h index ab45f8a0d288..e0393240bf64 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -7,8 +7,7 @@ #include #include -typedef struct page *new_page_t(struct page *page, unsigned long private, - int **reason); +typedef struct page *new_page_t(struct page *page, unsigned long private); typedef void free_page_t(struct page 
*page, unsigned long private); /* diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index cdad58bbfd8b..4ae347cbc36d 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -63,7 +63,6 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, bool skip_hwpoisoned_pages); -struct page *alloc_migrate_target(struct page *page, unsigned long private, - int **resultp); +struct page *alloc_migrate_target(struct page *page, unsigned long private); #endif diff --git a/mm/compaction.c b/mm/compaction.c index 88d01a50a015..29bd1df18b98 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1166,8 +1166,7 @@ static void isolate_freepages(struct compact_control *cc) * from the isolated freelists in the block we are migrating to. */ static struct page *compaction_alloc(struct page *migratepage, - unsigned long data, - int **result) + unsigned long data) { struct compact_control *cc = (struct compact_control *)data; struct page *freepage; diff --git a/mm/internal.h b/mm/internal.h index 1a1bb5d59c15..502d14189794 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -538,5 +538,5 @@ static inline bool is_migrate_highatomic_page(struct page *page) } void setup_zone_pageset(struct zone *zone); -extern struct page *alloc_new_node_page(struct page *page, unsigned long node, int **x); +extern struct page *alloc_new_node_page(struct page *page, unsigned long node); #endif /* __MM_INTERNAL_H */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2d4bf647cf01..9d142b9b86dc 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1487,7 +1487,7 @@ int unpoison_memory(unsigned long pfn) } EXPORT_SYMBOL(unpoison_memory); -static struct page *new_page(struct page *p, unsigned long private, int **x) +static struct page *new_page(struct page *p, unsigned long private) { int nid = page_to_nid(p); diff --git a/mm/memory_hotplug.c 
b/mm/memory_hotplug.c index cc6dfa5832ca..ec028494519c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1329,8 +1329,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) return 0; } -static struct page *new_node_page(struct page *page, unsigned long private, - int **result) +static struct page *new_node_page(struct page *page, unsigned long private) { int nid = page_to_nid(page); nodemask_t nmask = node_states[N_MEMORY]; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 947e73feea41..e94bd70840de 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -943,7 +943,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, } /* page allocation callback for NUMA node migration */ -struct page *alloc_new_node_page(struct page *page, unsigned long node, int **x) +struct page *alloc_new_node_page(struct page *page, unsigned long node) { if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), @@ -1108,7 +1108,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, * list of pages handed to migrate_pages()--which is how we get here-- * is in virtual address order. 
*/ -static struct page *new_page(struct page *page, unsigned long start, int **x) +static struct page *new_page(struct page *page, unsigned long start) { struct vm_area_struct *vma; unsigned long uninitialized_var(address); @@ -1153,7 +1153,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct page *new_page(struct page *page, unsigned long start, int **x) +static struct page *new_page(struct page *page, unsigned long start) { return NULL; } diff --git a/mm/migrate.c b/mm/migrate.c index 6068d4049a31..c606752f6d2a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1137,10 +1137,9 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, enum migrate_reason reason) { int rc = MIGRATEPAGE_SUCCESS; - int *result = NULL; struct page *newpage; - newpage = get_new_page(page, private, &result); + newpage = get_new_page(page, private); if (!newpage) return -ENOMEM; @@ -1231,12 +1230,6 @@ put_new: put_page(newpage); } - if (result) { - if (rc) - *result = rc; - else - *result = page_to_nid(newpage); - } return rc; } @@ -1264,7 +1257,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, enum migrate_mode mode, int reason) { int rc = -EAGAIN; - int *result = NULL; int page_was_mapped = 0; struct page *new_hpage; struct anon_vma *anon_vma = NULL; @@ -1281,7 +1273,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOSYS; } - new_hpage = get_new_page(hpage, private, &result); + new_hpage = get_new_page(hpage, private); if (!new_hpage) return -ENOMEM; @@ -1345,12 +1337,6 @@ out: else putback_active_hugepage(new_hpage); - if (result) { - if (rc) - *result = rc; - else - *result = page_to_nid(new_hpage); - } return rc; } @@ -1828,8 +1814,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, } static struct page *alloc_misplaced_dst_page(struct page *page, - unsigned long data, - int **result) + unsigned long data) { int nid = (int) data; struct page *newpage; diff --git 
a/mm/page_isolation.c b/mm/page_isolation.c index 61dee77bb211..43e085608846 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -309,8 +309,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, return pfn < end_pfn ? -EBUSY : 0; } -struct page *alloc_migrate_target(struct page *page, unsigned long private, - int **resultp) +struct page *alloc_migrate_target(struct page *page, unsigned long private) { return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]); } From 94723aafb9e76414fada7c1c198733a86f01ea8f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Apr 2018 16:30:07 -0700 Subject: [PATCH 041/140] mm: unclutter THP migration THP migration is hacked into the generic migration with rather surprising semantic. The migration allocation callback is supposed to check whether the THP can be migrated at once and if that is not the case then it allocates a simple page to migrate. unmap_and_move then fixes that up by splitting the THP into small pages while moving the head page to the newly allocated order-0 page. Remaining pages are moved to the LRU list by split_huge_page. The same happens if the THP allocation fails. This is really ugly and error prone [1]. I also believe that split_huge_page to the LRU lists is inherently wrong because all tail pages are not migrated. Some callers will just work around that by retrying (e.g. memory hotplug). There are other pfn walkers which are simply broken though. e.g. madvise_inject_error will migrate head and then advances next pfn by the huge page size. do_move_page_to_node_array, queue_pages_range (migrate_pages, mbind), will simply split the THP before migration if the THP migration is not supported then falls back to single page migration but it doesn't handle tail pages if the THP migration path is not able to allocate a fresh THP so we end up with ENOMEM and fail the whole migration which is a questionable behavior.
Page compaction doesn't try to migrate large pages so it should be immune. This patch tries to unclutter the situation by moving the special THP handling up to the migrate_pages layer where it actually belongs. We simply split the THP page into the existing list if unmap_and_move fails with ENOMEM and retry. So we will _always_ migrate all THP subpages and specific migrate_pages users do not have to deal with this case in a special way. [1] http://lkml.kernel.org/r/20171121021855.50525-1-zi.yan@sent.com Link: http://lkml.kernel.org/r/20180103082555.14592-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Kirill A. Shutemov Reviewed-by: Zi Yan Cc: Andrea Reale Cc: Anshuman Khandual Cc: Mike Kravetz Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 4 ++-- mm/huge_memory.c | 6 ++++++ mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 31 +++---------------------------- mm/migrate.c | 34 ++++++++++++++++++++++++---------- 5 files changed, 36 insertions(+), 41 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index e0393240bf64..f2b4abbca55e 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -42,9 +42,9 @@ static inline struct page *new_page_nodemask(struct page *page, return alloc_huge_page_nodemask(page_hstate(compound_head(page)), preferred_nid, nodemask); - if (thp_migration_supported() && PageTransHuge(page)) { - order = HPAGE_PMD_ORDER; + if (PageTransHuge(page)) { gfp_mask |= GFP_TRANSHUGE; + order = HPAGE_PMD_ORDER; } if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 229ab8c75a6b..3f3267af4e3b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2401,6 +2401,12 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail->index = head->index + tail; page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); + + /* + * always add to the tail because 
some iterators expect new + * pages to show after the currently processed elements - e.g. + * migrate_pages + */ lru_add_page_tail(head, page_tail, lruvec, list); } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ec028494519c..f74826cdceea 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1372,7 +1372,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (isolate_huge_page(page, &source)) move_pages -= 1 << compound_order(head); continue; - } else if (thp_migration_supported() && PageTransHuge(page)) + } else if (PageTransHuge(page)) pfn = page_to_pfn(compound_head(page)) + hpage_nr_pages(page) - 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e94bd70840de..9ac49ef17b4e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -446,15 +446,6 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, __split_huge_pmd(walk->vma, pmd, addr, false, NULL); goto out; } - if (!thp_migration_supported()) { - get_page(page); - spin_unlock(ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - goto out; - } if (!queue_pages_required(page, qp)) { ret = 1; goto unlock; @@ -495,7 +486,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; -retry: + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) @@ -511,22 +502,6 @@ retry: continue; if (!queue_pages_required(page, qp)) continue; - if (PageTransCompound(page) && !thp_migration_supported()) { - get_page(page); - pte_unmap_unlock(pte, ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - /* Failed to split -- skip. 
*/ - if (ret) { - pte = pte_offset_map_lock(walk->mm, pmd, - addr, &ptl); - continue; - } - goto retry; - } - migrate_page_add(page, qp->pagelist, flags); } pte_unmap_unlock(pte - 1, ptl); @@ -948,7 +923,7 @@ struct page *alloc_new_node_page(struct page *page, unsigned long node) if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), node); - else if (thp_migration_supported() && PageTransHuge(page)) { + else if (PageTransHuge(page)) { struct page *thp; thp = alloc_pages_node(node, @@ -1124,7 +1099,7 @@ static struct page *new_page(struct page *page, unsigned long start) if (PageHuge(page)) { return alloc_huge_page_vma(page_hstate(compound_head(page)), vma, address); - } else if (thp_migration_supported() && PageTransHuge(page)) { + } else if (PageTransHuge(page)) { struct page *thp; thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, diff --git a/mm/migrate.c b/mm/migrate.c index c606752f6d2a..51b55f2d2db5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1139,6 +1139,9 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, int rc = MIGRATEPAGE_SUCCESS; struct page *newpage; + if (!thp_migration_supported() && PageTransHuge(page)) + return -ENOMEM; + newpage = get_new_page(page, private); if (!newpage) return -ENOMEM; @@ -1160,14 +1163,6 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; } - if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) { - lock_page(page); - rc = split_huge_page(page); - unlock_page(page); - if (rc) - goto out; - } - rc = __unmap_and_move(page, newpage, force, mode); if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(newpage, reason); @@ -1381,6 +1376,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, retry = 0; list_for_each_entry_safe(page, page2, from, lru) { +retry: cond_resched(); if (PageHuge(page)) @@ -1394,6 +1390,26 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, switch(rc) { case -ENOMEM: + /* + * THP 
migration might be unsupported or the + * allocation could've failed so we should + * retry on the same page with the THP split + * to base pages. + * + * Head page is retried immediately and tail + * pages are added to the tail of the list so + * we encounter them after the rest of the list + * is processed. + */ + if (PageTransHuge(page)) { + lock_page(page); + rc = split_huge_page_to_list(page, from); + unlock_page(page); + if (!rc) { + list_safe_reset_next(page, page2, lru); + goto retry; + } + } nr_failed++; goto out; case -EAGAIN: @@ -1480,8 +1496,6 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, /* FOLL_DUMP to ignore special (like zero) pages */ follflags = FOLL_GET | FOLL_DUMP; - if (!thp_migration_supported()) - follflags |= FOLL_SPLIT; page = follow_page(vma, addr, follflags); err = PTR_ERR(page); From d3cda2337bbc9edd2a26b83cb00eaa8c048ff274 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 10 Apr 2018 16:30:11 -0700 Subject: [PATCH 042/140] mm/page_alloc: don't reserve ZONE_HIGHMEM for ZONE_MOVABLE request Freepage on ZONE_HIGHMEM doesn't work for kernel memory so it's not that important to reserve. When ZONE_MOVABLE is used, this problem would theoretically cause a decrease in usable memory for GFP_HIGHUSER_MOVABLE allocation request which is mainly used for page cache and anon page allocation. So, fix it by setting 0 to sysctl_lowmem_reserve_ratio[ZONE_HIGHMEM]. And, defining sysctl_lowmem_reserve_ratio array by MAX_NR_ZONES - 1 size makes code complex. For example, if there is highmem system, following reserve ratio is activated for *NORMAL ZONE* which would easily mislead people. #ifdef CONFIG_HIGHMEM 32 #endif This patch also fixes this situation by defining sysctl_lowmem_reserve_ratio array by MAX_NR_ZONES and placing "#ifdef" in the right place.
Link: http://lkml.kernel.org/r/1504672525-17915-1-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Tested-by: Tony Lindgren Cc: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Cc: "Aneesh Kumar K . V" Cc: Minchan Kim Cc: Rik van Riel Cc: Laura Abbott Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Russell King Cc: Will Deacon Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 5 ++--- include/linux/mmzone.h | 2 +- mm/page_alloc.c | 25 ++++++++++++++----------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index ff234d229cbb..17256f2ad919 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -312,8 +312,6 @@ The lowmem_reserve_ratio is an array. You can see them by reading this file. % cat /proc/sys/vm/lowmem_reserve_ratio 256 256 32 - -Note: # of this elements is one fewer than number of zones. Because the highest - zone's value is not necessary for following calculation. But, these values are not used directly. The kernel calculates # of protection pages for each zones from them. These are shown as array of protection pages @@ -364,7 +362,8 @@ As above expression, they are reciprocal number of ratio. pages of higher zones on the node. If you would like to protect more pages, smaller values are effective. -The minimum value is 1 (1/1 -> 100%). +The minimum value is 1 (1/1 -> 100%). The value less than 1 completely +disables protection of the pages. 
============================================================== diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a0c9e45a859a..32699b2dc52a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -885,7 +885,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; +extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b04667848375..34a4c12d2675 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -205,17 +205,18 @@ static void __free_pages_ok(struct page *page, unsigned int order); * TBD: should special case ZONE_DMA32 machines here - in those we normally * don't need any ZONE_NORMAL reservation */ -int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { +int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { #ifdef CONFIG_ZONE_DMA - 256, + [ZONE_DMA] = 256, #endif #ifdef CONFIG_ZONE_DMA32 - 256, + [ZONE_DMA32] = 256, #endif + [ZONE_NORMAL] = 32, #ifdef CONFIG_HIGHMEM - 32, + [ZONE_HIGHMEM] = 0, #endif - 32, + [ZONE_MOVABLE] = 0, }; EXPORT_SYMBOL(totalram_pages); @@ -7132,13 +7133,15 @@ static void setup_per_zone_lowmem_reserve(void) struct zone *lower_zone; idx--; - - if (sysctl_lowmem_reserve_ratio[idx] < 1) - sysctl_lowmem_reserve_ratio[idx] = 1; - lower_zone = pgdat->node_zones + idx; - lower_zone->lowmem_reserve[j] = managed_pages / - sysctl_lowmem_reserve_ratio[idx]; + + if (sysctl_lowmem_reserve_ratio[idx] < 1) { + sysctl_lowmem_reserve_ratio[idx] = 0; + lower_zone->lowmem_reserve[j] = 0; + } else { + lower_zone->lowmem_reserve[j] = + managed_pages / sysctl_lowmem_reserve_ratio[idx]; + } managed_pages += 
lower_zone->managed_pages; } } From bad8c6c0b1144694ecb0bc5629ede9b8b578b86e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 10 Apr 2018 16:30:15 -0700 Subject: [PATCH 043/140] mm/cma: manage the memory of the CMA area by using the ZONE_MOVABLE Patch series "mm/cma: manage the memory of the CMA area by using the ZONE_MOVABLE", v2. 0. History This patchset is the follow-up of the discussion about the "Introduce ZONE_CMA (v7)" [1]. Please reference it if more information is needed. 1. What does this patch do? This patch changes the management way for the memory of the CMA area in the MM subsystem. Currently the memory of the CMA area is managed by the zone where their pfn is belong to. However, this approach has some problems since MM subsystem doesn't have enough logic to handle the situation that different characteristic memories are in a single zone. To solve this issue, this patch try to manage all the memory of the CMA area by using the MOVABLE zone. In MM subsystem's point of view, characteristic of the memory on the MOVABLE zone and the memory of the CMA area are the same. So, managing the memory of the CMA area by using the MOVABLE zone will not have any problem. 2. Motivation There are some problems with current approach. See following. Although these problem would not be inherent and it could be fixed without this conception change, it requires many hooks addition in various code path and it would be intrusive to core MM and would be really error-prone. Therefore, I try to solve them with this new approach. Anyway, following is the problems of the current implementation. o CMA memory utilization First, following is the freepage calculation logic in MM. - For movable allocation: freepage = total freepage - For unmovable allocation: freepage = total freepage - CMA freepage Freepages on the CMA area is used after the normal freepages in the zone where the memory of the CMA area is belong to are exhausted. 
At that moment that the number of the normal freepages is zero, so - For movable allocation: freepage = total freepage = CMA freepage - For unmovable allocation: freepage = 0 If unmovable allocation comes at this moment, allocation request would fail to pass the watermark check and reclaim is started. After reclaim, there would exist the normal freepages so freepages on the CMA areas would not be used. FYI, there is another attempt [2] trying to solve this problem in lkml. And, as far as I know, Qualcomm also has out-of-tree solution for this problem. Useless reclaim: There is no logic to distinguish CMA pages in the reclaim path. Hence, CMA page is reclaimed even if the system just needs the page that can be usable for the kernel allocation. Atomic allocation failure: This is also related to the fallback allocation policy for the memory of the CMA area. Consider the situation that the number of the normal freepages is *zero* since the bunch of the movable allocation requests come. Kswapd would not be woken up due to following freepage calculation logic. - For movable allocation: freepage = total freepage = CMA freepage If atomic unmovable allocation request comes at this moment, it would fail due to following logic. - For unmovable allocation: freepage = total freepage - CMA freepage = 0 It was reported by Aneesh [3]. Useless compaction: Usual high-order allocation request is unmovable allocation request and it cannot be served from the memory of the CMA area. In compaction, migration scanner tries to migrate the page in the CMA area and make high-order page there. As mentioned above, it cannot be usable for the unmovable allocation request so it's just waste. 3. Current approach and new approach Current approach is that the memory of the CMA area is managed by the zone to which their pfn belongs. However, these memory should be distinguishable since they have a strong limitation. So, they are marked as MIGRATE_CMA in pageblock flag and handled specially.
However, as mentioned in section 2, the MM subsystem doesn't have enough logic to deal with this special pageblock so many problems raised. New approach is that the memory of the CMA area is managed by the MOVABLE zone. MM already have enough logic to deal with special zone like as HIGHMEM and MOVABLE zone. So, managing the memory of the CMA area by the MOVABLE zone just naturally work well because constraints for the memory of the CMA area that the memory should always be migratable is the same with the constraint for the MOVABLE zone. There is one side-effect for the usability of the memory of the CMA area. The use of MOVABLE zone is only allowed for a request with GFP_HIGHMEM && GFP_MOVABLE so now the memory of the CMA area is also only allowed for this gfp flag. Before this patchset, a request with GFP_MOVABLE can use them. IMO, It would not be a big issue since most of GFP_MOVABLE request also has GFP_HIGHMEM flag. For example, file cache page and anonymous page. However, file cache page for blockdev file is an exception. Request for it has no GFP_HIGHMEM flag. There is pros and cons on this exception. In my experience, blockdev file cache pages are one of the top reason that causes cma_alloc() to fail temporarily. So, we can get more guarantee of cma_alloc() success by discarding this case. Note that there is no change in admin POV since this patchset is just for internal implementation change in MM subsystem. Just one minor difference for admin is that the memory stat for CMA area will be printed in the MOVABLE zone. That's all. 4. Result Following is the experimental result related to utilization problem. 
8 CPUs, 1024 MB, VIRTUAL MACHINE make -j16 CMA area: 0 MB 512 MB Elapsed-time: 92.4 186.5 pswpin: 82 18647 pswpout: 160 69839 CMA : 0 MB 512 MB Elapsed-time: 93.1 93.4 pswpin: 84 46 pswpout: 183 92 akpm: "kernel test robot" reported a 26% improvement in vm-scalability.throughput: http://lkml.kernel.org/r/20180330012721.GA3845@yexl-desktop [1]: lkml.kernel.org/r/1491880640-9944-1-git-send-email-iamjoonsoo.kim@lge.com [2]: https://lkml.org/lkml/2014/10/15/623 [3]: http://www.spinics.net/lists/linux-mm/msg100562.html Link: http://lkml.kernel.org/r/1512114786-5085-2-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar K.V Tested-by: Tony Lindgren Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Laura Abbott Cc: Marek Szyprowski Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Rik van Riel Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 3 -- include/linux/mm.h | 1 + mm/cma.c | 83 +++++++++++++++++++++++++++++----- mm/internal.h | 3 ++ mm/page_alloc.c | 55 ++++++++++++++++++++-- 5 files changed, 126 insertions(+), 19 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 2b0265265c28..e0e49b5b1ee1 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -216,9 +216,6 @@ void put_online_mems(void); void mem_hotplug_begin(void); void mem_hotplug_done(void); -extern void set_zone_contiguous(struct zone *zone); -extern void clear_zone_contiguous(struct zone *zone); - #else /* ! 
CONFIG_MEMORY_HOTPLUG */ #define pfn_to_online_page(pfn) \ ({ \ diff --git a/include/linux/mm.h b/include/linux/mm.h index 3ad632366973..342c441c25d0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2108,6 +2108,7 @@ extern void setup_per_cpu_pageset(void); extern void zone_pcp_update(struct zone *zone); extern void zone_pcp_reset(struct zone *zone); +extern void setup_zone_pageset(struct zone *zone); /* page_alloc.c */ extern int min_free_kbytes; diff --git a/mm/cma.c b/mm/cma.c index 5809bbe360d7..aa40e6c7b042 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -39,6 +39,7 @@ #include #include "cma.h" +#include "internal.h" struct cma cma_areas[MAX_CMA_AREAS]; unsigned cma_area_count; @@ -109,23 +110,25 @@ static int __init cma_activate_area(struct cma *cma) if (!cma->bitmap) return -ENOMEM; - WARN_ON_ONCE(!pfn_valid(pfn)); - zone = page_zone(pfn_to_page(pfn)); - do { unsigned j; base_pfn = pfn; + if (!pfn_valid(base_pfn)) + goto err; + + zone = page_zone(pfn_to_page(base_pfn)); for (j = pageblock_nr_pages; j; --j, pfn++) { - WARN_ON_ONCE(!pfn_valid(pfn)); + if (!pfn_valid(pfn)) + goto err; + /* - * alloc_contig_range requires the pfn range - * specified to be in the same zone. Make this - * simple by forcing the entire CMA resv range - * to be in the same zone. + * In init_cma_reserved_pageblock(), present_pages + * is adjusted with assumption that all pages in + * the pageblock come from a single zone. 
*/ if (page_zone(pfn_to_page(pfn)) != zone) - goto not_in_zone; + goto err; } init_cma_reserved_pageblock(pfn_to_page(base_pfn)); } while (--i); @@ -139,7 +142,7 @@ static int __init cma_activate_area(struct cma *cma) return 0; -not_in_zone: +err: pr_err("CMA area %s could not be activated\n", cma->name); kfree(cma->bitmap); cma->count = 0; @@ -149,6 +152,41 @@ not_in_zone: static int __init cma_init_reserved_areas(void) { int i; + struct zone *zone; + pg_data_t *pgdat; + + if (!cma_area_count) + return 0; + + for_each_online_pgdat(pgdat) { + unsigned long start_pfn = UINT_MAX, end_pfn = 0; + + zone = &pgdat->node_zones[ZONE_MOVABLE]; + + /* + * In this case, we cannot adjust the zone range + * since it is now maximum node span and we don't + * know original zone range. + */ + if (populated_zone(zone)) + continue; + + for (i = 0; i < cma_area_count; i++) { + if (pfn_to_nid(cma_areas[i].base_pfn) != + pgdat->node_id) + continue; + + start_pfn = min(start_pfn, cma_areas[i].base_pfn); + end_pfn = max(end_pfn, cma_areas[i].base_pfn + + cma_areas[i].count); + } + + if (!end_pfn) + continue; + + zone->zone_start_pfn = start_pfn; + zone->spanned_pages = end_pfn - start_pfn; + } for (i = 0; i < cma_area_count; i++) { int ret = cma_activate_area(&cma_areas[i]); @@ -157,9 +195,32 @@ static int __init cma_init_reserved_areas(void) return ret; } + /* + * Reserved pages for ZONE_MOVABLE are now activated and + * this would change ZONE_MOVABLE's managed page counter and + * the other zones' present counter. We need to re-calculate + * various zone information that depends on this initialization. 
+ */ + build_all_zonelists(NULL); + for_each_populated_zone(zone) { + if (zone_idx(zone) == ZONE_MOVABLE) { + zone_pcp_reset(zone); + setup_zone_pageset(zone); + } else + zone_pcp_update(zone); + + set_zone_contiguous(zone); + } + + /* + * We need to re-init per zone wmark by calling + * init_per_zone_wmark_min() but doesn't call here because it is + * registered on core_initcall and it will be called later than us. + */ + return 0; } -core_initcall(cma_init_reserved_areas); +pure_initcall(cma_init_reserved_areas); /** * cma_init_reserved_mem() - create custom contiguous area from reserved memory diff --git a/mm/internal.h b/mm/internal.h index 502d14189794..228dd6642951 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -168,6 +168,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; +extern void set_zone_contiguous(struct zone *zone); +extern void clear_zone_contiguous(struct zone *zone); + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 34a4c12d2675..facc25ee6e2d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1747,16 +1747,38 @@ void __init page_alloc_init_late(void) } #ifdef CONFIG_CMA +static void __init adjust_present_page_count(struct page *page, long count) +{ + struct zone *zone = page_zone(page); + + /* We don't need to hold a lock since it is boot-up process */ + zone->present_pages += count; +} + /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void __init init_cma_reserved_pageblock(struct page *page) { unsigned i = pageblock_nr_pages; + unsigned long pfn = page_to_pfn(page); struct page *p = page; + int nid = page_to_nid(page); + + /* + * ZONE_MOVABLE will steal present pages from other zones by + * changing page links so page_zone() is changed. Before that, + * we need to adjust previous zone's page count first. 
+ */ + adjust_present_page_count(page, -pageblock_nr_pages); do { __ClearPageReserved(p); set_page_count(p, 0); - } while (++p, --i); + + /* Steal pages from other zones */ + set_page_links(p, ZONE_MOVABLE, nid, pfn); + } while (++p, ++pfn, --i); + + adjust_present_page_count(page, pageblock_nr_pages); set_pageblock_migratetype(page, MIGRATE_CMA); @@ -6208,6 +6230,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; + unsigned long node_end_pfn = 0; pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING @@ -6235,9 +6258,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long movable_size = 0; size = zone->spanned_pages; realsize = freesize = zone->present_pages; + if (zone_end_pfn(zone) > node_end_pfn) + node_end_pfn = zone_end_pfn(zone); + /* * Adjust freesize so that it accounts for how much memory @@ -6286,12 +6313,30 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone_seqlock_init(zone); zone_pcp_init(zone); - if (!size) + /* + * The size of the CMA area is unknown now so we need to + * prepare the memory for the usemap at maximum. 
+ */ + if (IS_ENABLED(CONFIG_CMA) && j == ZONE_MOVABLE && + pgdat->node_spanned_pages) { + movable_size = node_end_pfn - pgdat->node_start_pfn; + } + + if (!size && !movable_size) continue; set_pageblock_order(); - setup_usemap(pgdat, zone, zone_start_pfn, size); - init_currently_empty_zone(zone, zone_start_pfn, size); + if (movable_size) { + zone->zone_start_pfn = pgdat->node_start_pfn; + zone->spanned_pages = movable_size; + setup_usemap(pgdat, zone, + pgdat->node_start_pfn, movable_size); + init_currently_empty_zone(zone, + pgdat->node_start_pfn, movable_size); + } else { + setup_usemap(pgdat, zone, zone_start_pfn, size); + init_currently_empty_zone(zone, zone_start_pfn, size); + } memmap_init(size, nid, j, zone_start_pfn); } } @@ -7932,7 +7977,7 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) } #endif -#ifdef CONFIG_MEMORY_HOTPLUG +#if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA /* * The zone indicated has a new number of managed_pages; batch sizes and percpu * page high values need to be recalulated. From 1d47a3ec09b5489cd915e8f492aa623cdab5d002 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 10 Apr 2018 16:30:19 -0700 Subject: [PATCH 044/140] mm/cma: remove ALLOC_CMA Now, all reserved pages for CMA region are belong to the ZONE_MOVABLE and it only serves for a request with GFP_HIGHMEM && GFP_MOVABLE. Therefore, we don't need to maintain ALLOC_CMA at all. 
Link: http://lkml.kernel.org/r/1512114786-5085-3-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar K.V Tested-by: Tony Lindgren Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Laura Abbott Cc: Marek Szyprowski Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Rik van Riel Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 4 +--- mm/internal.h | 1 - mm/page_alloc.c | 28 +++------------------------- 3 files changed, 4 insertions(+), 29 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 29bd1df18b98..028b7210a669 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1450,14 +1450,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * if compaction succeeds. * For costly orders, we require low watermark instead of min for * compaction to proceed to increase its chances. - * ALLOC_CMA is used, as pages in CMA pageblocks are considered - * suitable migration targets */ watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? 
low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, - ALLOC_CMA, wmark_target)) + 0, wmark_target)) return COMPACT_SKIPPED; return COMPACT_CONTINUE; diff --git a/mm/internal.h b/mm/internal.h index 228dd6642951..62d8c34e63d5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -498,7 +498,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ -#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ enum ttu_flags; struct tlbflush_unmap_batch; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index facc25ee6e2d..b4390db64da3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2893,7 +2893,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * exists. */ watermark = min_wmark_pages(zone) + (1UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) return 0; __mod_zone_freepage_state(zone, -(1UL << order), mt); @@ -3169,12 +3169,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); -#endif - /* * Check watermarks for an order-0 allocation request. 
If these * are not met, then a high-order request also cannot go ahead @@ -3201,10 +3195,8 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } #ifdef CONFIG_CMA - if ((alloc_flags & ALLOC_CMA) && - !list_empty(&area->free_list[MIGRATE_CMA])) { + if (!list_empty(&area->free_list[MIGRATE_CMA])) return true; - } #endif if (alloc_harder && !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) @@ -3224,13 +3216,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, unsigned int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); - long cma_pages = 0; - -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); -#endif /* * Fast check for order-0 only. If this fails then the reserves @@ -3239,7 +3224,7 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * the caller is !atomic then it'll uselessly search the free * list. That corner case is then slower but it is harmless. 
*/ - if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + if (!order && free_pages > mark + z->lowmem_reserve[classzone_idx]) return true; return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, @@ -3875,10 +3860,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; -#ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif return alloc_flags; } @@ -4345,9 +4326,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return false; - if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) - *alloc_flags |= ALLOC_CMA; - return true; } From 3d2054ad8c2d5100b68b0c0405f89fd90bf4107b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 10 Apr 2018 16:30:23 -0700 Subject: [PATCH 045/140] ARM: CMA: avoid double mapping to the CMA area if CONFIG_HIGHMEM=y CMA area is now managed by the separate zone, ZONE_MOVABLE, to fix many MM related problems. In this implementation, if CONFIG_HIGHMEM = y, then ZONE_MOVABLE is considered as HIGHMEM and the memory of the CMA area is also considered as HIGHMEM. That means that they are considered as the page without direct mapping. However, CMA area could be in a lowmem and the memory could have direct mapping. In ARM, when establishing a new mapping for DMA, direct mapping should be cleared since two mapping with different cache policy could cause unknown problem. With this patch, PageHighmem() for the CMA memory located in lowmem returns true so that the function for DMA mapping cannot notice whether it needs to clear direct mapping or not, correctly. To handle this situation, this patch always clears direct mapping for such CMA memory. 
Link: http://lkml.kernel.org/r/1512114786-5085-4-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Tested-by: Tony Lindgren Cc: "Aneesh Kumar K . V" Cc: Johannes Weiner Cc: Laura Abbott Cc: Marek Szyprowski Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Rik van Riel Cc: Russell King Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/dma-mapping.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index ada8eb206a90..8c398fedbbb6 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -466,6 +466,12 @@ void __init dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) void __init dma_contiguous_remap(void) { int i; + + if (!dma_mmu_remap_num) + return; + + /* call flush_cache_all() since CMA area would be large enough */ + flush_cache_all(); for (i = 0; i < dma_mmu_remap_num; i++) { phys_addr_t start = dma_mmu_remap[i].base; phys_addr_t end = start + dma_mmu_remap[i].size; @@ -498,7 +504,15 @@ void __init dma_contiguous_remap(void) flush_tlb_kernel_range(__phys_to_virt(start), __phys_to_virt(end)); - iotable_init(&map, 1); + /* + * All the memory in CMA region will be on ZONE_MOVABLE. + * If that zone is considered as highmem, the memory in CMA + * region is also considered as highmem even if it's + * physical address belong to lowmem. In this case, + * re-mapping isn't required. + */ + if (!is_highmem_idx(ZONE_MOVABLE)) + iotable_init(&map, 1); } } From b7d349c741293b694c552593dbd7d38ea7eb7143 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 10 Apr 2018 16:30:27 -0700 Subject: [PATCH 046/140] mm/thp: don't count ZONE_MOVABLE as the target for freepage reserving There was a regression report for "mm/cma: manage the memory of the CMA area by using the ZONE_MOVABLE" [1] and I think that it is related to this problem. 
CMA patchset makes the system use one more zone (ZONE_MOVABLE) and then increases min_free_kbytes. It reduces usable memory and it could cause regression. ZONE_MOVABLE only has movable pages so we don't need to keep enough freepages to avoid or deal with fragmentation. So, don't count it. This changes min_free_kbytes and thus min_watermark greatly if ZONE_MOVABLE is used. It will let the user use more memory. System: 22GB ram, fakenuma, 2 nodes. 5 zones are used. Before: min_free_kbytes: 112640 zone_info (min_watermark): Node 0, zone DMA min 19 Node 0, zone DMA32 min 3778 Node 0, zone Normal min 10191 Node 0, zone Movable min 0 Node 0, zone Device min 0 Node 1, zone DMA min 0 Node 1, zone DMA32 min 0 Node 1, zone Normal min 14043 Node 1, zone Movable min 127 Node 1, zone Device min 0 After: min_free_kbytes: 90112 zone_info (min_watermark): Node 0, zone DMA min 15 Node 0, zone DMA32 min 3022 Node 0, zone Normal min 8152 Node 0, zone Movable min 0 Node 0, zone Device min 0 Node 1, zone DMA min 0 Node 1, zone DMA32 min 0 Node 1, zone Normal min 11234 Node 1, zone Movable min 102 Node 1, zone Device min 0 [1] (lkml.kernel.org/r/20180102063528.GG30397%20()%20yexl-desktop) Link: http://lkml.kernel.org/r/1522913236-15776-1-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Cc: Michal Hocko Cc: "Kirill A . Shutemov" Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c15da1ea7e63..eb32d0707c80 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1879,8 +1879,16 @@ static void set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; - for_each_populated_zone(zone) + for_each_populated_zone(zone) { + /* + * We don't need to worry about fragmentation of + * ZONE_MOVABLE since it only has movable pages.
+ */ + if (zone_idx(zone) > gfp_zone(GFP_USER)) + continue; + nr_zones++; + } /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ recommended_min = pageblock_nr_pages * nr_zones * 2; From c3895391df385c6628638f014c87e16f5e2efd45 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 10 Apr 2018 16:30:31 -0700 Subject: [PATCH 047/140] kasan, slub: fix handling of kasan_slab_free hook The kasan_slab_free hook's return value denotes whether the reuse of a slab object must be delayed (e.g. when the object is put into memory quarantine). The current way SLUB handles this hook is by ignoring its return value and hardcoding checks similar (but not exactly the same) to the ones performed in kasan_slab_free, which is prone to making mistakes. The main difference between the hardcoded checks and the ones in kasan_slab_free is whether we want to perform a free in case when an invalid-free or a double-free was detected (we don't). This patch changes the way SLUB handles this by: 1. taking into account the return value of kasan_slab_free for each of the objects, that are being freed; 2. reconstructing the freelist of objects to exclude the ones, whose reuse must be delayed.
[andreyknvl@google.com: eliminate unnecessary branch in slab_free] Link: http://lkml.kernel.org/r/a62759a2545fddf69b0c034547212ca1eb1b3ce2.1520359686.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/083f58501e54731203801d899632d76175868e97.1519400992.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Andrey Ryabinin Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Kostya Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 57 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 4fb037c98782..44aa7847324a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1363,10 +1363,8 @@ static __always_inline void kfree_hook(void *x) kasan_kfree_large(x, _RET_IP_); } -static __always_inline void *slab_free_hook(struct kmem_cache *s, void *x) +static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) { - void *freeptr; - kmemleak_free_recursive(x, s->flags); /* @@ -1386,17 +1384,12 @@ static __always_inline void *slab_free_hook(struct kmem_cache *s, void *x) if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); - freeptr = get_freepointer(s, x); - /* - * kasan_slab_free() may put x into memory quarantine, delaying its - * reuse. In this case the object's freelist pointer is changed. 
- */ - kasan_slab_free(s, x, _RET_IP_); - return freeptr; + /* KASAN might put x into memory quarantine, delaying its reuse */ + return kasan_slab_free(s, x, _RET_IP_); } -static inline void slab_free_freelist_hook(struct kmem_cache *s, - void *head, void *tail) +static inline bool slab_free_freelist_hook(struct kmem_cache *s, + void **head, void **tail) { /* * Compiler cannot detect this function can be removed if slab_free_hook() @@ -1407,13 +1400,33 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s, defined(CONFIG_DEBUG_OBJECTS_FREE) || \ defined(CONFIG_KASAN) - void *object = head; - void *tail_obj = tail ? : head; - void *freeptr; + void *object; + void *next = *head; + void *old_tail = *tail ? *tail : *head; + + /* Head and tail of the reconstructed freelist */ + *head = NULL; + *tail = NULL; do { - freeptr = slab_free_hook(s, object); - } while ((object != tail_obj) && (object = freeptr)); + object = next; + next = get_freepointer(s, object); + /* If object's reuse doesn't have to be delayed */ + if (!slab_free_hook(s, object)) { + /* Move object to the new freelist */ + set_freepointer(s, object, *head); + *head = object; + if (!*tail) + *tail = object; + } + } while (object != old_tail); + + if (*head == *tail) + *tail = NULL; + + return *head != NULL; +#else + return true; #endif } @@ -2968,14 +2981,12 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, void *head, void *tail, int cnt, unsigned long addr) { - slab_free_freelist_hook(s, head, tail); /* - * slab_free_freelist_hook() could have put the items into quarantine. - * If so, no need to free them. + * With KASAN enabled slab_free_freelist_hook modifies the freelist + * to remove objects, whose reuse must be delayed. 
*/ - if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU)) - return; - do_slab_free(s, page, head, tail, cnt, addr); + if (slab_free_freelist_hook(s, &head, &tail)) + do_slab_free(s, page, head, tail, cnt, addr); } #ifdef CONFIG_KASAN From 91c93ed07f04f5b32a30321d522d8ca9504745bf Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 10 Apr 2018 16:30:35 -0700 Subject: [PATCH 048/140] kasan: fix invalid-free test crashing the kernel When an invalid-free is triggered by one of the KASAN tests, the object doesn't actually get freed. This later leads to a BUG failure in kmem_cache_destroy that checks that there are no allocated objects in the cache that is being destroyed. Fix this by calling kmem_cache_free with the proper object address after the call that triggers invalid-free. Link: http://lkml.kernel.org/r/286eaefc0a6c3fa9b83b87e7d6dc0fbb5b5c9926.1519924383.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Geert Uytterhoeven Cc: Nick Terrell Cc: Chris Mason Cc: Yury Norov Cc: Al Viro Cc: "Luis R . Rodriguez" Cc: Palmer Dabbelt Cc: "Paul E . McKenney" Cc: Jeff Layton Cc: "Jason A . Donenfeld" Cc: Kostya Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 98854a64b014..ec657105edbf 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -567,7 +567,15 @@ static noinline void __init kmem_cache_invalid_free(void) return; } + /* Trigger invalid free, the object doesn't get freed */ kmem_cache_free(cache, p + 1); + + /* + * Properly free the object to prevent the "Objects remaining in + * test_cache on __kmem_cache_shutdown" BUG failure. 
+ */ + kmem_cache_free(cache, p); + kmem_cache_destroy(cache); } From 69ca372c100fba99c78ef826a1795aa86e4f01a8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 10 Apr 2018 16:30:39 -0700 Subject: [PATCH 049/140] kasan: prevent compiler from optimizing away memset in tests A compiler can optimize away memset calls by replacing them with mov instructions. There are KASAN tests that specifically test that KASAN correctly handles memset calls so we don't want this optimization to happen. The solution is to add -fno-builtin flag to test_kasan.ko Link: http://lkml.kernel.org/r/105ec9a308b2abedb1a0d1fdced0c22d765e4732.1519924383.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Geert Uytterhoeven Cc: Nick Terrell Cc: Chris Mason Cc: Yury Norov Cc: Al Viro Cc: "Luis R . Rodriguez" Cc: Palmer Dabbelt Cc: "Paul E . McKenney" Cc: Jeff Layton Cc: "Jason A . Donenfeld" Cc: Kostya Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Makefile b/lib/Makefile index 8fc0d3a9b34f..6200f978740d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o +CFLAGS_test_kasan.o += -fno-builtin obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_TEST_LKM) += test_module.o From 0e3dc019143104a6e676287b1e453cccd7add404 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 10 Apr 2018 16:30:44 -0700 Subject: [PATCH 050/140] procfs: add seq_put_hex_ll to speed up /proc/pid/maps seq_put_hex_ll() prints a number in hexadecimal notation and works faster than seq_printf(). 
== test.py num = 0 with open("/proc/1/maps") as f: while num < 10000 : data = f.read() f.seek(0, 0) num = num + 1 == == Before patch == $ time python test.py real 0m1.561s user 0m0.257s sys 0m1.302s == After patch == $ time python test.py real 0m0.986s user 0m0.279s sys 0m0.707s $ perf -g record python test.py: == Before patch == - 67.42% 2.82% python [kernel.kallsyms] [k] show_map_vma.isra.22 - 64.60% show_map_vma.isra.22 - 44.98% seq_printf - seq_vprintf - vsnprintf + 14.85% number + 12.22% format_decode 5.56% memcpy_erms + 15.06% seq_path + 4.42% seq_pad + 2.45% __GI___libc_read == After patch == - 47.35% 3.38% python [kernel.kallsyms] [k] show_map_vma.isra.23 - 43.97% show_map_vma.isra.23 + 20.84% seq_path - 15.73% show_vma_header_prefix 10.55% seq_put_hex_ll + 2.65% seq_put_decimal_ull 0.95% seq_putc + 6.96% seq_pad + 2.94% __GI___libc_read [avagin@openvz.org: use unsigned int instead of int where it is suitable] Link: http://lkml.kernel.org/r/20180214025619.4005-1-avagin@openvz.org [avagin@openvz.org: v2] Link: http://lkml.kernel.org/r/20180117082050.25406-1-avagin@openvz.org Link: http://lkml.kernel.org/r/20180112185812.7710-1-avagin@openvz.org Signed-off-by: Andrei Vagin Cc: Alexey Dobriyan Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 21 ++++++++++-------- fs/seq_file.c | 46 ++++++++++++++++++++++++++++++++++++++++ include/linux/seq_file.h | 3 +++ 3 files changed, 61 insertions(+), 9 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ec6d2983a5cb..b66fc8de7d34 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -287,15 +287,18 @@ static void show_vma_header_prefix(struct seq_file *m, dev_t dev, unsigned long ino) { seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); - seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", - start, - end, - flags & VM_READ ? 'r' : '-', - flags & VM_WRITE ? 'w' : '-', - flags & VM_EXEC ? 'x' : '-', - flags & VM_MAYSHARE ? 
's' : 'p', - pgoff, - MAJOR(dev), MINOR(dev), ino); + seq_put_hex_ll(m, NULL, start, 8); + seq_put_hex_ll(m, "-", end, 8); + seq_putc(m, ' '); + seq_putc(m, flags & VM_READ ? 'r' : '-'); + seq_putc(m, flags & VM_WRITE ? 'w' : '-'); + seq_putc(m, flags & VM_EXEC ? 'x' : '-'); + seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p'); + seq_put_hex_ll(m, " ", pgoff, 8); + seq_put_hex_ll(m, " ", MAJOR(dev), 2); + seq_put_hex_ll(m, ":", MINOR(dev), 2); + seq_put_decimal_ull(m, " ", ino); + seq_putc(m, ' '); } static void diff --git a/fs/seq_file.c b/fs/seq_file.c index eea09f6d8830..cde1bdbf7801 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -715,6 +715,52 @@ overflow: } EXPORT_SYMBOL(seq_put_decimal_ull); +/** + * seq_put_hex_ll - put a number in hexadecimal notation + * @m: seq_file identifying the buffer to which data should be written + * @delimiter: a string which is printed before the number + * @v: the number + * @width: a minimum field width + * + * seq_put_hex_ll(m, "", v, 8) is equal to seq_printf(m, "%08llx", v) + * + * This routine is very quick when you show lots of numbers. + * In usual cases, it will be better to use seq_printf(). It's easier to read. 
+ */ +void seq_put_hex_ll(struct seq_file *m, const char *delimiter, + unsigned long long v, unsigned int width) +{ + unsigned int len; + int i; + + if (delimiter && delimiter[0]) { + if (delimiter[1] == 0) + seq_putc(m, delimiter[0]); + else + seq_puts(m, delimiter); + } + + /* If x is 0, the result of __builtin_clzll is undefined */ + if (v == 0) + len = 1; + else + len = (sizeof(v) * 8 - __builtin_clzll(v) + 3) / 4; + + if (len < width) + len = width; + + if (m->count + len > m->size) { + seq_set_overflow(m); + return; + } + + for (i = len - 1; i >= 0; i--) { + m->buf[m->count + i] = hex_asc[0xf & v]; + v = v >> 4; + } + m->count += len; +} + void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num) { int len; diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index ab437dd2e3b9..599e145f4917 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -121,6 +121,9 @@ void seq_puts(struct seq_file *m, const char *s); void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num); void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num); +void seq_put_hex_ll(struct seq_file *m, const char *delimiter, + unsigned long long v, unsigned int width); + void seq_escape(struct seq_file *m, const char *s, const char *esc); void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, From 8cfa67b4d9a9d9a6061f3cfd0e0ed16e66e45984 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 10 Apr 2018 16:30:47 -0700 Subject: [PATCH 051/140] procfs: optimize seq_pad() to speed up /proc/pid/maps seq_printf() is slow and it can be replaced by memset() in this case. 
== test.py num = 0 with open("/proc/1/maps") as f: while num < 10000 : data = f.read() f.seek(0, 0) num = num + 1 == == Before patch == $ time python test.py real 0m0.986s user 0m0.279s sys 0m0.707s == After patch == $ time python test.py real 0m0.932s user 0m0.261s sys 0m0.669s $ perf record -g python test.py == Before patch == - 47.35% 3.38% python [kernel.kallsyms] [k] show_map_vma.isra.23 - 43.97% show_map_vma.isra.23 + 20.84% seq_path - 15.73% show_vma_header_prefix + 6.96% seq_pad + 2.94% __GI___libc_read == After patch == - 44.01% 0.34% python [kernel.kallsyms] [k] show_pid_map - 43.67% show_pid_map - 42.91% show_map_vma.isra.23 + 21.55% seq_path - 15.68% show_vma_header_prefix + 2.08% seq_pad 0.55% seq_putc Link: http://lkml.kernel.org/r/20180112185812.7710-2-avagin@openvz.org Signed-off-by: Andrei Vagin Cc: Alexey Dobriyan Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/seq_file.c b/fs/seq_file.c index cde1bdbf7801..3714ae1d5e1c 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -828,8 +828,14 @@ EXPORT_SYMBOL(seq_write); void seq_pad(struct seq_file *m, char c) { int size = m->pad_until - m->count; - if (size > 0) - seq_printf(m, "%*s", size, ""); + if (size > 0) { + if (size + m->count > m->size) { + seq_set_overflow(m); + return; + } + memset(m->buf + m->count, ' ', size); + m->count += size; + } if (c) seq_putc(m, c); } From 68c3411ff4a4cee53cc854c11ed191eaaf1956ba Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 10 Apr 2018 16:30:51 -0700 Subject: [PATCH 052/140] proc: get rid of task lock/unlock pair to read umask for the "status" file get_task_umask locks/unlocks the task on its own. The only caller does the same thing immediately after. Utilize the fact the task has to be locked anyway and just do it once. Since there are no other users and the code is short, fold it in. 
Link: http://lkml.kernel.org/r/1517995608-23683-1-git-send-email-mguzik@redhat.com Signed-off-by: Mateusz Guzik Reviewed-by: Alexey Dobriyan Cc: Konstantin Khlebnikov Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 598803576e4c..851ec0915e4c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -141,25 +141,12 @@ static inline const char *get_task_state(struct task_struct *tsk) return task_state_array[task_state_index(tsk)]; } -static inline int get_task_umask(struct task_struct *tsk) -{ - struct fs_struct *fs; - int umask = -ENOENT; - - task_lock(tsk); - fs = tsk->fs; - if (fs) - umask = fs->umask; - task_unlock(tsk); - return umask; -} - static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; - int g, umask; + int g, umask = -1; struct task_struct *tracer; const struct cred *cred; pid_t ppid, tpid = 0, tgid, ngid; @@ -177,16 +164,16 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, ngid = task_numa_group_id(p); cred = get_task_cred(p); - umask = get_task_umask(p); - if (umask >= 0) - seq_printf(m, "Umask:\t%#04o\n", umask); - task_lock(p); + if (p->fs) + umask = p->fs->umask; if (p->files) max_fds = files_fdtable(p->files)->max_fds; task_unlock(p); rcu_read_unlock(); + if (umask >= 0) + seq_printf(m, "Umask:\t%#04o\n", umask); seq_printf(m, "State:\t%s", get_task_state(p)); seq_put_decimal_ull(m, "\nTgid:\t", tgid); From 2f8974243507d9e5b0f214d7668a59a66b93f36c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:30:54 -0700 Subject: [PATCH 053/140] proc: do less stuff under ->pde_unload_lock Commit ca469f35a8e9ef ("deal with races between remove_proc_entry() and proc_reg_release()") moved too 
much stuff under ->pde_unload_lock making a problem described at series "[PATCH v5] procfs: Improve Scaling in proc" worse. While RCU is being figured out, move kfree() out of ->pde_unload_lock. On my potato, difference is only 0.5% speedup with concurrent open+read+close of /proc/cmdline, but the effect should be more noticeable on more capable machines. $ perf stat -r 16 -- ./proc-j 16 Performance counter stats for './proc-j 16' (16 runs): 130569.502377 task-clock (msec) # 15.872 CPUs utilized ( +- 0.05% ) 19,169 context-switches # 0.147 K/sec ( +- 0.18% ) 15 cpu-migrations # 0.000 K/sec ( +- 3.27% ) 437 page-faults # 0.003 K/sec ( +- 1.25% ) 300,172,097,675 cycles # 2.299 GHz ( +- 0.05% ) 96,793,267,308 instructions # 0.32 insn per cycle ( +- 0.04% ) 22,798,342,298 branches # 174.607 M/sec ( +- 0.04% ) 111,764,687 branch-misses # 0.49% of all branches ( +- 0.47% ) 8.226574400 seconds time elapsed ( +- 0.05% ) ^^^^^^^^^^^ $ perf stat -r 16 -- ./proc-j 16 Performance counter stats for './proc-j 16' (16 runs): 129866.777392 task-clock (msec) # 15.869 CPUs utilized ( +- 0.04% ) 19,154 context-switches # 0.147 K/sec ( +- 0.66% ) 14 cpu-migrations # 0.000 K/sec ( +- 1.73% ) 431 page-faults # 0.003 K/sec ( +- 1.09% ) 298,556,520,546 cycles # 2.299 GHz ( +- 0.04% ) 96,525,366,833 instructions # 0.32 insn per cycle ( +- 0.04% ) 22,730,194,043 branches # 175.027 M/sec ( +- 0.04% ) 111,506,074 branch-misses # 0.49% of all branches ( +- 0.18% ) 8.183629778 seconds time elapsed ( +- 0.04% ) ^^^^^^^^^^^ Link: http://lkml.kernel.org/r/20180213132911.GA24298@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6e8724958116..8118ce5df5c6 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -138,7 +138,7 @@ static void unuse_pde(struct proc_dir_entry *pde) 
complete(pde->pde_unload_completion); } -/* pde is locked */ +/* pde is locked on entry, unlocked on exit */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) { /* @@ -157,9 +157,10 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) pdeo->c = &c; spin_unlock(&pde->pde_unload_lock); wait_for_completion(&c); - spin_lock(&pde->pde_unload_lock); } else { struct file *file; + struct completion *c; + pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; @@ -167,8 +168,10 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) spin_lock(&pde->pde_unload_lock); /* After ->release. */ list_del(&pdeo->lh); - if (unlikely(pdeo->c)) - complete(pdeo->c); + c = pdeo->c; + spin_unlock(&pde->pde_unload_lock); + if (unlikely(c)) + complete(c); kfree(pdeo); } } @@ -188,6 +191,7 @@ void proc_entry_rundown(struct proc_dir_entry *de) struct pde_opener *pdeo; pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); close_pdeo(de, pdeo); + spin_lock(&de->pde_unload_lock); } spin_unlock(&de->pde_unload_lock); } @@ -375,7 +379,7 @@ static int proc_reg_release(struct inode *inode, struct file *file) list_for_each_entry(pdeo, &pde->pde_openers, lh) { if (pdeo->file == file) { close_pdeo(pde, pdeo); - break; + return 0; } } spin_unlock(&pde->pde_unload_lock); From e74a0effffbbea75fe2b6770948f84fcb0917cdd Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:30:58 -0700 Subject: [PATCH 054/140] proc: move /proc/sysvipc creation to where it belongs Move the proc_mkdir() call within the sysvipc subsystem such that we avoid polluting proc_root_init() with petty cpp. 
[dave@stgolabs.net: contributed changelog] Link: http://lkml.kernel.org/r/20180216161732.GA10297@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Acked-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/root.c | 4 ---- ipc/util.c | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/proc/root.c b/fs/proc/root.c index ede8e64974be..4a19e02c7ed0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -136,10 +136,6 @@ void __init proc_root_init(void) proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); - -#ifdef CONFIG_SYSVIPC - proc_mkdir("sysvipc", NULL); -#endif proc_mkdir("fs", NULL); proc_mkdir("driver", NULL); proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */ diff --git a/ipc/util.c b/ipc/util.c index 3783b7991cc7..4e81182fa0ac 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -89,6 +89,7 @@ static int __init ipc_init(void) { int err_sem, err_msg; + proc_mkdir("sysvipc", NULL); err_sem = sem_init(); WARN(err_sem, "ipc: sysv sem_init failed: %d\n", err_sem); err_msg = msg_init(); From e7a6e291e30a00061c356bbcba0d9380943a1671 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:01 -0700 Subject: [PATCH 055/140] proc: faster open/close of files without ->release hook The whole point of code in fs/proc/inode.c is to make sure ->release hook is called either at close() or at rmmod time. All of it is unnecessary if there is no ->release hook. Save allocation+list manipulations under spinlock in that case.
Link: http://lkml.kernel.org/r/20180214063033.GA15579@avx2 Signed-off-by: Alexey Dobriyan Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 8118ce5df5c6..0331ddbee4f6 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -342,31 +342,36 @@ static int proc_reg_open(struct inode *inode, struct file *file) * * Save every "struct file" with custom ->release hook. */ - pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); - if (!pdeo) - return -ENOMEM; - - if (!use_pde(pde)) { - kfree(pdeo); + if (!use_pde(pde)) return -ENOENT; - } - open = pde->proc_fops->open; - release = pde->proc_fops->release; + release = pde->proc_fops->release; + if (release) { + pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); + if (!pdeo) { + rv = -ENOMEM; + goto out_unuse; + } + } + + open = pde->proc_fops->open; if (open) rv = open(inode, file); - if (rv == 0 && release) { - /* To know what to release. */ - pdeo->file = file; - pdeo->closing = false; - pdeo->c = NULL; - spin_lock(&pde->pde_unload_lock); - list_add(&pdeo->lh, &pde->pde_openers); - spin_unlock(&pde->pde_unload_lock); - } else - kfree(pdeo); + if (release) { + if (rv == 0) { + /* To know what to release. */ + pdeo->file = file; + pdeo->closing = false; + pdeo->c = NULL; + spin_lock(&pde->pde_unload_lock); + list_add(&pdeo->lh, &pde->pde_openers); + spin_unlock(&pde->pde_unload_lock); + } else + kfree(pdeo); + } +out_unuse: unuse_pde(pde); return rv; } From a9fabc3df4c68e05f023c6a5189f0104e200beca Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:05 -0700 Subject: [PATCH 056/140] proc: randomize "struct pde_opener" The more the merrier. 
Link: http://lkml.kernel.org/r/20180214081935.GA17157@avx2 Signed-off-by: Alexey Dobriyan Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index d697c8ab0a14..713d5dfe3f05 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -177,7 +177,7 @@ struct pde_opener { struct list_head lh; bool closing; struct completion *c; -}; +} __randomize_layout; extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; From 195b8cf0689554db764f459730c81f741887aa5f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:09 -0700 Subject: [PATCH 057/140] proc: move "struct pde_opener" to kmem cache "struct pde_opener" is fixed size and we can have more granular approach to debugging. For those who don't know, per cache SLUB poisoning and red zoning don't work if there is at least one object allocated which is hopeless in case of kmalloc-64 but not in case of standalone cache. Although systemd opens 2 files from the get go, so it is hopeless after all. 
Link: http://lkml.kernel.org/r/20180214082306.GB17157@avx2 Signed-off-by: Alexey Dobriyan Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 12 ++++++++---- fs/proc/internal.h | 2 +- fs/proc/root.c | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 0331ddbee4f6..5349eb07ac29 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -54,6 +54,7 @@ static void proc_evict_inode(struct inode *inode) } static struct kmem_cache *proc_inode_cachep __ro_after_init; +static struct kmem_cache *pde_opener_cache __ro_after_init; static struct inode *proc_alloc_inode(struct super_block *sb) { @@ -92,7 +93,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -void __init proc_init_inodecache(void) +void __init proc_init_kmemcache(void) { proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), @@ -100,6 +101,9 @@ void __init proc_init_inodecache(void) SLAB_MEM_SPREAD|SLAB_ACCOUNT| SLAB_PANIC), init_once); + pde_opener_cache = + kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, + SLAB_PANIC, NULL); } static int proc_show_options(struct seq_file *seq, struct dentry *root) @@ -172,7 +176,7 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) spin_unlock(&pde->pde_unload_lock); if (unlikely(c)) complete(c); - kfree(pdeo); + kmem_cache_free(pde_opener_cache, pdeo); } } @@ -347,7 +351,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) release = pde->proc_fops->release; if (release) { - pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); + pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); if (!pdeo) { rv = -ENOMEM; goto out_unuse; @@ -368,7 +372,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) list_add(&pdeo->lh, &pde->pde_openers); spin_unlock(&pde->pde_unload_lock); } else - kfree(pdeo); + kmem_cache_free(pde_opener_cache, pdeo); } out_unuse: 
diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 713d5dfe3f05..dc00ef8538cb 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -182,7 +182,7 @@ extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; -extern void proc_init_inodecache(void); +void proc_init_kmemcache(void); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern int proc_fill_super(struct super_block *, void *data, int flags); diff --git a/fs/proc/root.c b/fs/proc/root.c index 4a19e02c7ed0..98797b762a71 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -125,7 +125,7 @@ void __init proc_root_init(void) { int err; - proc_init_inodecache(); + proc_init_kmemcache(); set_proc_pid_nlink(); err = register_filesystem(&proc_fs_type); if (err) From 2acddbe8168967adebf4623923242c9a4f9e1aee Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:12 -0700 Subject: [PATCH 058/140] proc: account "struct pde_opener" The allocation is persistent in fact as any fool can open a file in /proc and sit on it. 
Link: http://lkml.kernel.org/r/20180214082409.GC17157@avx2 Signed-off-by: Alexey Dobriyan Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 5349eb07ac29..89618836887d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -103,7 +103,7 @@ void __init proc_init_kmemcache(void) init_once); pde_opener_cache = kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, - SLAB_PANIC, NULL); + SLAB_ACCOUNT|SLAB_PANIC, NULL); } static int proc_show_options(struct seq_file *seq, struct dentry *root) From d1be35cb6f96975d792a1535d3fe9b75239065ee Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 10 Apr 2018 16:31:16 -0700 Subject: [PATCH 059/140] proc: add seq_put_decimal_ull_width to speed up /proc/pid/smaps seq_put_decimal_ull_width(m, str, val, width) prints a decimal number with a specified minimal field width. It is equivalent to seq_printf(m, "%s%*d", str, width, val), but it works much faster.
== test_smaps.py num = 0 with open("/proc/1/smaps") as f: for x in xrange(10000): data = f.read() f.seek(0, 0) == == Before patch == $ time python test_smaps.py real 0m4.593s user 0m0.398s sys 0m4.158s == After patch == $ time python test_smaps.py real 0m3.828s user 0m0.413s sys 0m3.408s $ perf -g record python test_smaps.py == Before patch == - 79.01% 3.36% python [kernel.kallsyms] [k] show_smap.isra.33 - 75.65% show_smap.isra.33 + 48.85% seq_printf + 15.75% __walk_page_range + 9.70% show_map_vma.isra.23 0.61% seq_puts == After patch == - 75.51% 4.62% python [kernel.kallsyms] [k] show_smap.isra.33 - 70.88% show_smap.isra.33 + 24.82% seq_put_decimal_ull_w + 19.78% __walk_page_range + 12.74% seq_printf + 11.08% show_map_vma.isra.23 + 1.68% seq_puts [akpm@linux-foundation.org: fix drivers/of/unittest.c build] Link: http://lkml.kernel.org/r/20180212074931.7227-1-avagin@openvz.org Signed-off-by: Andrei Vagin Cc: Alexey Dobriyan Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/of/unittest.c | 2 +- fs/proc/meminfo.c | 15 +---- fs/proc/task_mmu.c | 127 ++++++++++++++++----------------------- fs/seq_file.c | 28 ++++++--- include/linux/kernel.h | 3 +- include/linux/seq_file.h | 2 + lib/vsprintf.c | 18 ++++-- 7 files changed, 93 insertions(+), 102 deletions(-) diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index 02c5984ab09b..6bb37c18292a 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -295,7 +295,7 @@ static void __init of_unittest_printf(void) return; } - num_to_str(phandle_str, sizeof(phandle_str), np->phandle); + num_to_str(phandle_str, sizeof(phandle_str), np->phandle, 0); of_unittest_printf_one(np, "%pOF", full_name); of_unittest_printf_one(np, "%pOFf", full_name); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6bb20f864259..65a72ab57471 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -26,20 +26,7 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) static 
void show_val_kb(struct seq_file *m, const char *s, unsigned long num) { - char v[32]; - static const char blanks[7] = {' ', ' ', ' ', ' ',' ', ' ', ' '}; - int len; - - len = num_to_str(v, sizeof(v), num << (PAGE_SHIFT - 10)); - - seq_write(m, s, 16); - - if (len > 0) { - if (len < 8) - seq_write(m, blanks, 8 - len); - - seq_write(m, v, len); - } + seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); seq_write(m, " kB\n", 4); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b66fc8de7d34..3026feda0432 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -24,6 +24,8 @@ #include #include "internal.h" +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) void task_mem(struct seq_file *m, struct mm_struct *mm) { unsigned long text, lib, swap, anon, file, shmem; @@ -53,39 +55,28 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) lib = (mm->exec_vm << PAGE_SHIFT) - text; swap = get_mm_counter(mm, MM_SWAPENTS); - seq_printf(m, - "VmPeak:\t%8lu kB\n" - "VmSize:\t%8lu kB\n" - "VmLck:\t%8lu kB\n" - "VmPin:\t%8lu kB\n" - "VmHWM:\t%8lu kB\n" - "VmRSS:\t%8lu kB\n" - "RssAnon:\t%8lu kB\n" - "RssFile:\t%8lu kB\n" - "RssShmem:\t%8lu kB\n" - "VmData:\t%8lu kB\n" - "VmStk:\t%8lu kB\n" - "VmExe:\t%8lu kB\n" - "VmLib:\t%8lu kB\n" - "VmPTE:\t%8lu kB\n" - "VmSwap:\t%8lu kB\n", - hiwater_vm << (PAGE_SHIFT-10), - total_vm << (PAGE_SHIFT-10), - mm->locked_vm << (PAGE_SHIFT-10), - mm->pinned_vm << (PAGE_SHIFT-10), - hiwater_rss << (PAGE_SHIFT-10), - total_rss << (PAGE_SHIFT-10), - anon << (PAGE_SHIFT-10), - file << (PAGE_SHIFT-10), - shmem << (PAGE_SHIFT-10), - mm->data_vm << (PAGE_SHIFT-10), - mm->stack_vm << (PAGE_SHIFT-10), - text >> 10, - lib >> 10, - mm_pgtables_bytes(mm) >> 10, - swap << (PAGE_SHIFT-10)); + SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); + SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); + SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); + SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm); + SEQ_PUT_DEC(" 
kB\nVmHWM:\t", hiwater_rss); + SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); + SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); + SEQ_PUT_DEC(" kB\nRssFile:\t", file); + SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem); + SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm); + SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm); + seq_put_decimal_ull_width(m, + " kB\nVmExe:\t", text >> 10, 8); + seq_put_decimal_ull_width(m, + " kB\nVmLib:\t", lib >> 10, 8); + seq_put_decimal_ull_width(m, + " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8); + SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); + seq_puts(m, " kB\n"); hugetlb_report_usage(m, mm); } +#undef SEQ_PUT_DEC unsigned long task_vsize(struct mm_struct *mm) { @@ -739,6 +730,8 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) { } +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) >> 10, 8) static int show_smap(struct seq_file *m, void *v, int is_pid) { struct proc_maps_private *priv = m->private; @@ -812,51 +805,34 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) ret = SEQ_SKIP; } - if (!rollup_mode) - seq_printf(m, - "Size: %8lu kB\n" - "KernelPageSize: %8lu kB\n" - "MMUPageSize: %8lu kB\n", - (vma->vm_end - vma->vm_start) >> 10, - vma_kernel_pagesize(vma) >> 10, - vma_mmu_pagesize(vma) >> 10); - - - if (!rollup_mode || last_vma) - seq_printf(m, - "Rss: %8lu kB\n" - "Pss: %8lu kB\n" - "Shared_Clean: %8lu kB\n" - "Shared_Dirty: %8lu kB\n" - "Private_Clean: %8lu kB\n" - "Private_Dirty: %8lu kB\n" - "Referenced: %8lu kB\n" - "Anonymous: %8lu kB\n" - "LazyFree: %8lu kB\n" - "AnonHugePages: %8lu kB\n" - "ShmemPmdMapped: %8lu kB\n" - "Shared_Hugetlb: %8lu kB\n" - "Private_Hugetlb: %7lu kB\n" - "Swap: %8lu kB\n" - "SwapPss: %8lu kB\n" - "Locked: %8lu kB\n", - mss->resident >> 10, - (unsigned long)(mss->pss >> (10 + PSS_SHIFT)), - mss->shared_clean >> 10, - mss->shared_dirty >> 10, - mss->private_clean >> 10, - mss->private_dirty >> 10, - mss->referenced >> 10, - mss->anonymous >> 10, - mss->lazyfree >> 10, - 
mss->anonymous_thp >> 10, - mss->shmem_thp >> 10, - mss->shared_hugetlb >> 10, - mss->private_hugetlb >> 10, - mss->swap >> 10, - (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)), - (unsigned long)(mss->pss >> (10 + PSS_SHIFT))); + if (!rollup_mode) { + SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); + SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); + SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); + seq_puts(m, " kB\n"); + } + if (!rollup_mode || last_vma) { + SEQ_PUT_DEC("Rss: ", mss->resident); + SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); + SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); + SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); + SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); + SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); + SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); + SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); + SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); + SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); + SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); + seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", + mss->private_hugetlb >> 10, 7); + SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); + SEQ_PUT_DEC(" kB\nSwapPss: ", + mss->swap_pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nLocked: ", mss->pss >> PSS_SHIFT); + seq_puts(m, " kB\n"); + } if (!rollup_mode) { arch_show_smap(m, vma); show_smap_vma_flags(m, vma); @@ -864,6 +840,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) m_cache_vma(m, vma); return ret; } +#undef SEQ_PUT_DEC static int show_pid_smap(struct seq_file *m, void *v) { diff --git a/fs/seq_file.c b/fs/seq_file.c index 3714ae1d5e1c..84650ad3b1bf 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -673,15 +673,20 @@ void seq_puts(struct seq_file *m, const char *s) } EXPORT_SYMBOL(seq_puts); -/* +/** * A helper routine for putting decimal numbers without rich format of 
printf(). * only 'unsigned long long' is supported. - * This routine will put strlen(delimiter) + number into seq_file. + * @m: seq_file identifying the buffer to which data should be written + * @delimiter: a string which is printed before the number + * @num: the number + * @width: a minimum field width + * + * This routine will put strlen(delimiter) + number into seq_filed. * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. */ -void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, - unsigned long long num) +void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, + unsigned long long num, unsigned int width) { int len; @@ -695,7 +700,10 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, memcpy(m->buf + m->count, delimiter, len); m->count += len; - if (m->count + 1 >= m->size) + if (!width) + width = 1; + + if (m->count + width >= m->size) goto overflow; if (num < 10) { @@ -703,7 +711,7 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, return; } - len = num_to_str(m->buf + m->count, m->size - m->count, num); + len = num_to_str(m->buf + m->count, m->size - m->count, num, width); if (!len) goto overflow; @@ -713,6 +721,12 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, overflow: seq_set_overflow(m); } + +void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, + unsigned long long num) +{ + return seq_put_decimal_ull_width(m, delimiter, num, 0); +} EXPORT_SYMBOL(seq_put_decimal_ull); /** @@ -788,7 +802,7 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num return; } - len = num_to_str(m->buf + m->count, m->size - m->count, num); + len = num_to_str(m->buf + m->count, m->size - m->count, num, 0); if (!len) goto overflow; diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 52b70894eaa5..98273343bd45 100644 --- 
a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -439,7 +439,8 @@ extern long simple_strtol(const char *,char **,unsigned int); extern unsigned long long simple_strtoull(const char *,char **,unsigned int); extern long long simple_strtoll(const char *,char **,unsigned int); -extern int num_to_str(char *buf, int size, unsigned long long num); +extern int num_to_str(char *buf, int size, + unsigned long long num, unsigned int width); /* lib/printf utilities */ diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 599e145f4917..23d6a92cea9f 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -118,6 +118,8 @@ __printf(2, 3) void seq_printf(struct seq_file *m, const char *fmt, ...); void seq_putc(struct seq_file *m, char c); void seq_puts(struct seq_file *m, const char *s); +void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, + unsigned long long num, unsigned int width); void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num); void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 89f8a4a4b770..30c0cb8cc9bc 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -336,7 +336,7 @@ char *put_dec(char *buf, unsigned long long n) * * If speed is not important, use snprintf(). It's easy to read the code. */ -int num_to_str(char *buf, int size, unsigned long long num) +int num_to_str(char *buf, int size, unsigned long long num, unsigned int width) { /* put_dec requires 2-byte alignment of the buffer. 
*/ char tmp[sizeof(num) * 3] __aligned(2); @@ -350,11 +350,21 @@ int num_to_str(char *buf, int size, unsigned long long num) len = put_dec(tmp, num) - tmp; } - if (len > size) + if (len > size || width > size) return 0; + + if (width > len) { + width = width - len; + for (idx = 0; idx < width; idx++) + buf[idx] = ' '; + } else { + width = 0; + } + for (idx = 0; idx < len; ++idx) - buf[idx] = tmp[len - idx - 1]; - return len; + buf[idx + width] = tmp[len - idx - 1]; + + return len + width; } #define SIGN 1 /* unsigned/signed, must be 1 */ From f66406638fffe874c56e7e41106167c5235f251e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 10 Apr 2018 16:31:19 -0700 Subject: [PATCH 060/140] proc: replace seq_printf on seq_putc to speed up /proc/pid/smaps seq_putc() works much faster than seq_printf() == Before patch == $ time python test_smaps.py real 0m3.828s user 0m0.413s sys 0m3.408s == After patch == $ time python test_smaps.py real 0m3.405s user 0m0.401s sys 0m3.003s == Before patch == - 75.51% 4.62% python [kernel.kallsyms] [k] show_smap.isra.33 - 70.88% show_smap.isra.33 + 24.82% seq_put_decimal_ull_aligned + 19.78% __walk_page_range + 12.74% seq_printf + 11.08% show_map_vma.isra.23 + 1.68% seq_puts == After patch == - 69.16% 5.70% python [kernel.kallsyms] [k] show_smap.isra.33 - 63.46% show_smap.isra.33 + 25.98% seq_put_decimal_ull_aligned + 20.90% __walk_page_range + 12.60% show_map_vma.isra.23 1.56% seq_putc + 1.55% seq_puts Link: http://lkml.kernel.org/r/20180212074931.7227-2-avagin@openvz.org Signed-off-by: Andrei Vagin Reviewed-by: Alexey Dobriyan Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3026feda0432..65ae54659833 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -688,8 +688,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) if 
(!mnemonics[i][0]) continue; if (vma->vm_flags & (1UL << i)) { - seq_printf(m, "%c%c ", - mnemonics[i][0], mnemonics[i][1]); + seq_putc(m, mnemonics[i][0]); + seq_putc(m, mnemonics[i][1]); + seq_putc(m, ' '); } } seq_putc(m, '\n'); From 48dffbf82d2f17bc6dd3c2b7fd733738ea567914 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 10 Apr 2018 16:31:23 -0700 Subject: [PATCH 061/140] proc: optimize single-symbol delimiters to speed up seq_put_decimal_ull A delimiter is a string which is printed before a number. A single-symbol delimiter can be printed by seq_putc(), and this works faster than printing by seq_puts(). == test_proc.c int main(int argc, char **argv) { int n, i, fd; char buf[16384]; n = atoi(argv[1]); for (i = 0; i < n; i++) { fd = open(argv[2], O_RDONLY); if (fd < 0) return 1; if (read(fd, buf, sizeof(buf)) <= 0) return 1; close(fd); } return 0; } == $ time ./test_proc 1000000 /proc/1/stat == Before patch == real 0m3.820s user 0m0.337s sys 0m3.394s == After patch == real 0m3.110s user 0m0.324s sys 0m2.700s Link: http://lkml.kernel.org/r/20180212074931.7227-3-avagin@openvz.org Signed-off-by: Andrei Vagin Cc: Alexey Dobriyan Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/seq_file.c b/fs/seq_file.c index 84650ad3b1bf..0677e89f3c6f 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -693,12 +693,12 @@ void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ goto overflow; - len = strlen(delimiter); - if (m->count + len >= m->size) - goto overflow; - - memcpy(m->buf + m->count, delimiter, len); - m->count += len; + if (delimiter && delimiter[0]) { + if (delimiter[1] == 0) + seq_putc(m, delimiter[0]); + else + seq_puts(m, delimiter); + } if (!width) width = 1; @@ -782,12 +782,12 @@ void seq_put_decimal_ll(struct seq_file *m, const char 
*delimiter, long long num if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */ goto overflow; - len = strlen(delimiter); - if (m->count + len >= m->size) - goto overflow; - - memcpy(m->buf + m->count, delimiter, len); - m->count += len; + if (delimiter && delimiter[0]) { + if (delimiter[1] == 0) + seq_putc(m, delimiter[0]); + else + seq_puts(m, delimiter); + } if (m->count + 2 >= m->size) goto overflow; From d0f02231222b313d1b49278cd2e3c7e7406fea6d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 10 Apr 2018 16:31:26 -0700 Subject: [PATCH 062/140] proc: replace seq_printf by seq_put_smth to speed up /proc/pid/status seq_printf() works slower than seq_puts(), seq_putc(), etc. == test_proc.c int main(int argc, char **argv) { int n, i, fd; char buf[16384]; n = atoi(argv[1]); for (i = 0; i < n; i++) { fd = open(argv[2], O_RDONLY); if (fd < 0) return 1; if (read(fd, buf, sizeof(buf)) <= 0) return 1; close(fd); } return 0; } == $ time ./test_proc 1000000 /proc/1/status == Before patch == real 0m5.171s user 0m0.328s sys 0m4.783s == After patch == real 0m4.761s user 0m0.334s sys 0m4.366s Link: http://lkml.kernel.org/r/20180212074931.7227-4-avagin@openvz.org Signed-off-by: Andrei Vagin Cc: Alexey Dobriyan Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 851ec0915e4c..ae2c807fd719 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -174,7 +174,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, if (umask >= 0) seq_printf(m, "Umask:\t%#04o\n", umask); - seq_printf(m, "State:\t%s", get_task_state(p)); + seq_puts(m, "State:\t"); + seq_puts(m, get_task_state(p)); seq_put_decimal_ull(m, "\nTgid:\t", tgid); seq_put_decimal_ull(m, "\nNgid:\t", ngid); @@ -300,8 +301,8 @@ static void render_cap_t(struct seq_file *m, const char *header, seq_puts(m, header); 
CAP_FOR_EACH_U32(__capi) { - seq_printf(m, "%08x", - a->cap[CAP_LAST_U32 - __capi]); + seq_put_hex_ll(m, NULL, + a->cap[CAP_LAST_U32 - __capi], 8); } seq_putc(m, '\n'); } @@ -355,7 +356,8 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) { - seq_printf(m, "CoreDumping:\t%d\n", !!mm->core_state); + seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state); + seq_putc(m, '\n'); } int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, @@ -491,7 +493,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, /* convert nsec -> ticks */ start_time = nsec_to_clock_t(task->real_start_time); - seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); + seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns)); + seq_puts(m, " ("); + seq_puts(m, tcomm); + seq_puts(m, ") "); + seq_putc(m, state); seq_put_decimal_ll(m, " ", ppid); seq_put_decimal_ll(m, " ", pgid); seq_put_decimal_ll(m, " ", sid); From 24b2ec21192c963c17a1b687b6171e95e8b59c06 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:30 -0700 Subject: [PATCH 063/140] proc: check permissions earlier for /proc/*/wchan get_wchan() accesses stack page before permissions are checked, let's not play this game. 
Link: http://lkml.kernel.org/r/20180217071923.GA16074@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: Andy Shevchenko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index d53246863cfb..d8b5a1653444 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -388,14 +388,17 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, unsigned long wchan; char symname[KSYM_NAME_LEN]; + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + goto print0; + wchan = get_wchan(task); - - if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS) - && !lookup_symbol_name(wchan, symname)) + if (wchan && !lookup_symbol_name(wchan, symname)) { seq_printf(m, "%s", symname); - else - seq_putc(m, '0'); + return 0; + } +print0: + seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ From 21dae0ad07e6c4d3fa1bd9a91a8b51be316a5111 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:34 -0700 Subject: [PATCH 064/140] proc: use seq_puts() at /proc/*/wchan Link: http://lkml.kernel.org/r/20180217072011.GB16074@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: Andy Shevchenko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index d8b5a1653444..e9e7652b77da 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -393,7 +393,7 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, wchan = get_wchan(task); if (wchan && !lookup_symbol_name(wchan, symname)) { - seq_printf(m, "%s", symname); + seq_puts(m, symname); return 0; } From a0b0d1c345d0317efe594df268feb5ccc99f651e Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 10 Apr 2018 16:31:38 -0700 Subject: [PATCH 065/140] 
fs/proc/proc_sysctl.c: fix potential page fault while unregistering sysctl table proc_sys_link_fill_cache() does not take currently unregistering sysctl tables into account, which might result in a page fault in sysctl_follow_link() - add a check to fix it. This bug has been present since v3.4. Link: http://lkml.kernel.org/r/20180228013506.4915-1-danilokrummrich@dk-develop.de Fixes: 0e47c99d7fe25 ("sysctl: Replace root_list with links between sysctl_table_sets") Signed-off-by: Danilo Krummrich Acked-by: Kees Cook Reviewed-by: Andrew Morton Cc: "Luis R . Rodriguez" Cc: "Eric W. Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c41ab261397d..7da10e595297 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -707,7 +707,10 @@ static bool proc_sys_link_fill_cache(struct file *file, struct ctl_table *table) { bool ret = true; + head = sysctl_head_grab(head); + if (IS_ERR(head)) + return false; if (S_ISLNK(table->mode)) { /* It is not an error if we can not follow the link ignore it */ From 835b94e05c92e6e8df48112770e624cee192a057 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 10 Apr 2018 16:31:41 -0700 Subject: [PATCH 066/140] fs/proc/proc_sysctl.c: remove redundant link check in proc_sys_link_fill_cache() proc_sys_link_fill_cache() does not need to check whether we're called for a link - it's already done by scan(). Link: http://lkml.kernel.org/r/20180228013506.4915-2-danilokrummrich@dk-develop.de Signed-off-by: Danilo Krummrich Acked-by: Kees Cook Reviewed-by: Andrew Morton Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Cc: "Luis R . 
Rodriguez" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 7da10e595297..4654fc3c246f 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -712,12 +712,9 @@ static bool proc_sys_link_fill_cache(struct file *file, if (IS_ERR(head)) return false; - if (S_ISLNK(table->mode)) { - /* It is not an error if we can not follow the link ignore it */ - int err = sysctl_follow_link(&head, &table); - if (err) - goto out; - } + /* It is not an error if we can not follow the link ignore it */ + if (sysctl_follow_link(&head, &table)) + goto out; ret = proc_sys_fill_cache(file, ctx, head, table); out: From 9cd65655585523adecd65b750e1537cdea84718e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:45 -0700 Subject: [PATCH 067/140] proc: test /proc/self/wchan This patch starts testing /proc. Many more tests to come (I promise). Read from /proc/self/wchan should always return "0" as current is in TASK_RUNNING state while reading /proc/self/wchan. 
Link: http://lkml.kernel.org/r/20180226212006.GA742@avx2 Signed-off-by: Alexey Dobriyan Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/proc/.gitignore | 1 + tools/testing/selftests/proc/Makefile | 6 +++++ tools/testing/selftests/proc/config | 1 + .../testing/selftests/proc/proc-self-wchan.c | 25 +++++++++++++++++++ 5 files changed, 34 insertions(+) create mode 100644 tools/testing/selftests/proc/.gitignore create mode 100644 tools/testing/selftests/proc/Makefile create mode 100644 tools/testing/selftests/proc/config create mode 100644 tools/testing/selftests/proc/proc-self-wchan.c diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 2fc410bc4f33..32aafa92074c 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -25,6 +25,7 @@ TARGETS += mqueue TARGETS += net TARGETS += nsfs TARGETS += powerpc +TARGETS += proc TARGETS += pstore TARGETS += ptrace TARGETS += seccomp diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore new file mode 100644 index 000000000000..4c851db80bd5 --- /dev/null +++ b/tools/testing/selftests/proc/.gitignore @@ -0,0 +1 @@ +/proc-self-wchan diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile new file mode 100644 index 000000000000..592603a5d675 --- /dev/null +++ b/tools/testing/selftests/proc/Makefile @@ -0,0 +1,6 @@ +CFLAGS += -Wall -O2 + +TEST_GEN_PROGS := +TEST_GEN_PROGS += proc-self-wchan + +include ../lib.mk diff --git a/tools/testing/selftests/proc/config b/tools/testing/selftests/proc/config new file mode 100644 index 000000000000..68fbd2b35884 --- /dev/null +++ b/tools/testing/selftests/proc/config @@ -0,0 +1 @@ +CONFIG_PROC_FS=y diff --git a/tools/testing/selftests/proc/proc-self-wchan.c b/tools/testing/selftests/proc/proc-self-wchan.c new file mode 100644 index 000000000000..b8d8728a6869 --- 
/dev/null +++ b/tools/testing/selftests/proc/proc-self-wchan.c @@ -0,0 +1,25 @@ +#include +#include +#include +#include +#include + +int main(void) +{ + char buf[64]; + int fd; + + fd = open("/proc/self/wchan", O_RDONLY); + if (fd == -1) { + if (errno == ENOENT) + return 2; + return 1; + } + + buf[0] = '\0'; + if (read(fd, buf, sizeof(buf)) != 1) + return 1; + if (buf[0] != '0') + return 1; + return 0; +} From c4219edf1de2af44fd98903f72f6e1ceb7f3c701 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:48 -0700 Subject: [PATCH 068/140] proc: test /proc/self/syscall Read from /proc/self/syscall should yield read system call and correct args in the output as current is reading /proc/self/syscall. Link: http://lkml.kernel.org/r/20180226212145.GB742@avx2 Signed-off-by: Alexey Dobriyan Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/proc/.gitignore | 3 +- tools/testing/selftests/proc/Makefile | 1 + .../selftests/proc/proc-self-syscall.c | 45 +++++++++++++++++++ 3 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/proc/proc-self-syscall.c diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 4c851db80bd5..c648b27af2e7 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -1 +1,2 @@ -/proc-self-wchan +/proc-self-mem +/proc-self-syscall diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 592603a5d675..ad20520910e2 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -1,6 +1,7 @@ CFLAGS += -Wall -O2 TEST_GEN_PROGS := +TEST_GEN_PROGS += proc-self-syscall TEST_GEN_PROGS += proc-self-wchan include ../lib.mk diff --git a/tools/testing/selftests/proc/proc-self-syscall.c b/tools/testing/selftests/proc/proc-self-syscall.c new file mode 100644 index 000000000000..05eb6f91f1e9 --- 
/dev/null +++ b/tools/testing/selftests/proc/proc-self-syscall.c @@ -0,0 +1,45 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline ssize_t sys_read(int fd, void *buf, size_t len) +{ + return syscall(SYS_read, fd, buf, len); +} + +int main(void) +{ + char buf1[64]; + char buf2[64]; + int fd; + ssize_t rv; + + fd = open("/proc/self/syscall", O_RDONLY); + if (fd == -1) { + if (errno == ENOENT) + return 2; + return 1; + } + + /* Do direct system call as libc can wrap anything. */ + snprintf(buf1, sizeof(buf1), "%ld 0x%lx 0x%lx 0x%lx", + (long)SYS_read, (long)fd, (long)buf2, (long)sizeof(buf2)); + + memset(buf2, 0, sizeof(buf2)); + rv = sys_read(fd, buf2, sizeof(buf2)); + if (rv < 0) + return 1; + if (rv < strlen(buf1)) + return 1; + if (strncmp(buf1, buf2, strlen(buf1)) != 0) + return 1; + + return 0; +} From b4884f23331ae31e9ecb617956986c3b76ab9a91 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:52 -0700 Subject: [PATCH 069/140] proc: move "struct proc_dir_entry" into kmem cache "struct proc_dir_entry" is variable sized because of 0-length trailing array for name, however, because of SLAB padding allocations it is possible to make "struct proc_dir_entry" fixed sized and allocate same amount of memory. It buys fine-grained debugging with poisoning and usercopy protection which is not possible with kmalloc-* caches. Currently, on 32-bit 91+ byte allocations go into kmalloc-128 and on 64-bit 147+ byte allocations go to kmalloc-192 anyway. Additional memory is allocated only for 38/46+ byte long names which are rare or may not even exist in the wild. 
Link: http://lkml.kernel.org/r/20180223205504.GA17139@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 50 +++++++++++++++++++++++++++++----------------- fs/proc/inode.c | 4 ++++ fs/proc/internal.h | 11 +++++++++- fs/proc/proc_net.c | 7 ++++--- fs/proc/root.c | 3 ++- 5 files changed, 52 insertions(+), 23 deletions(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 5d709fa8f3a2..800247a256c9 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -8,6 +8,7 @@ * Copyright (C) 1997 Theodore Ts'o */ +#include #include #include #include @@ -28,6 +29,17 @@ static DEFINE_RWLOCK(proc_subdir_lock); +struct kmem_cache *proc_dir_entry_cache __ro_after_init; + +void pde_free(struct proc_dir_entry *pde) +{ + if (S_ISLNK(pde->mode)) + kfree(pde->data); + if (pde->name != pde->inline_name) + kfree(pde->name); + kmem_cache_free(proc_dir_entry_cache, pde); +} + static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len) { if (len < de->namelen) @@ -363,10 +375,20 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, return NULL; } - ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL); + ent = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!ent) goto out; + if (qstr.len + 1 <= sizeof(ent->inline_name)) { + ent->name = ent->inline_name; + } else { + ent->name = kmalloc(qstr.len + 1, GFP_KERNEL); + if (!ent->name) { + pde_free(ent); + return NULL; + } + } + memcpy(ent->name, fn, qstr.len + 1); ent->namelen = qstr.len; ent->mode = mode; @@ -395,12 +417,11 @@ struct proc_dir_entry *proc_symlink(const char *name, strcpy((char*)ent->data,dest); ent->proc_iops = &proc_link_inode_operations; if (proc_register(parent, ent) < 0) { - kfree(ent->data); - kfree(ent); + pde_free(ent); ent = NULL; } } else { - kfree(ent); + pde_free(ent); ent = NULL; } } @@ -423,7 +444,7 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t 
mode, ent->proc_iops = &proc_dir_inode_operations; parent->nlink++; if (proc_register(parent, ent) < 0) { - kfree(ent); + pde_free(ent); parent->nlink--; ent = NULL; } @@ -458,7 +479,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent->proc_iops = NULL; parent->nlink++; if (proc_register(parent, ent) < 0) { - kfree(ent); + pde_free(ent); parent->nlink--; ent = NULL; } @@ -495,7 +516,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, goto out_free; return pde; out_free: - kfree(pde); + pde_free(pde); out: return NULL; } @@ -522,19 +543,12 @@ void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) } EXPORT_SYMBOL(proc_set_user); -static void free_proc_entry(struct proc_dir_entry *de) -{ - proc_free_inum(de->low_ino); - - if (S_ISLNK(de->mode)) - kfree(de->data); - kfree(de); -} - void pde_put(struct proc_dir_entry *pde) { - if (atomic_dec_and_test(&pde->count)) - free_proc_entry(pde); + if (atomic_dec_and_test(&pde->count)) { + proc_free_inum(pde->low_ino); + pde_free(pde); + } } /* diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 89618836887d..2cf3b74391ca 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -104,6 +104,10 @@ void __init proc_init_kmemcache(void) pde_opener_cache = kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, SLAB_ACCOUNT|SLAB_PANIC, NULL); + proc_dir_entry_cache = kmem_cache_create_usercopy( + "proc_dir_entry", sizeof(struct proc_dir_entry), 0, SLAB_PANIC, + offsetof(struct proc_dir_entry, inline_name), + sizeof_field(struct proc_dir_entry, inline_name), NULL); } static int proc_show_options(struct seq_file *seq, struct dentry *root) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index dc00ef8538cb..0ead00771384 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -52,11 +52,20 @@ struct proc_dir_entry { struct proc_dir_entry *parent; struct rb_root_cached subdir; struct rb_node subdir_node; + char *name; umode_t mode; u8 namelen; - char name[]; +#ifdef 
CONFIG_64BIT +#define SIZEOF_PDE_INLINE_NAME (192-147) +#else +#define SIZEOF_PDE_INLINE_NAME (128-91) +#endif + char inline_name[SIZEOF_PDE_INLINE_NAME]; } __randomize_layout; +extern struct kmem_cache *proc_dir_entry_cache; +void pde_free(struct proc_dir_entry *pde); + union proc_op { int (*proc_get_link)(struct dentry *, struct path *); int (*proc_show)(struct seq_file *m, diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 68c06ae7888c..e5fe3d400737 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -192,7 +192,7 @@ static __net_init int proc_net_ns_init(struct net *net) int err; err = -ENOMEM; - netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL); + netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!netd) goto out; @@ -201,6 +201,7 @@ static __net_init int proc_net_ns_init(struct net *net) netd->nlink = 2; netd->namelen = 3; netd->parent = &proc_root; + netd->name = netd->inline_name; memcpy(netd->name, "net", 4); uid = make_kuid(net->user_ns, 0); @@ -223,7 +224,7 @@ static __net_init int proc_net_ns_init(struct net *net) return 0; free_net: - kfree(netd); + pde_free(netd); out: return err; } @@ -231,7 +232,7 @@ out: static __net_exit void proc_net_ns_exit(struct net *net) { remove_proc_entry("stat", net->proc_net); - kfree(net->proc_net); + pde_free(net->proc_net); } static struct pernet_operations __net_initdata proc_net_ns_ops = { diff --git a/fs/proc/root.c b/fs/proc/root.c index 98797b762a71..cd45abfbb6cc 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -208,7 +208,8 @@ struct proc_dir_entry proc_root = { .proc_fops = &proc_root_operations, .parent = &proc_root, .subdir = RB_ROOT_CACHED, - .name = "/proc", + .name = proc_root.inline_name, + .inline_name = "/proc", }; int pid_ns_prepare_proc(struct pid_namespace *ns) From 35318db566e18ee3ada7e2d62192e5e87b1b5e4b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:41:14 -0700 Subject: [PATCH 070/140] proc: fix /proc/*/map_files lookup some more I totally 
forgot that _parse_integer() accepts arbitrary amount of leading zeroes leading to the following lookups: OK # readlink /proc/1/map_files/56427ecba000-56427eddc000 /lib/systemd/systemd bogus # readlink /proc/1/map_files/00000000000056427ecba000-56427eddc000 /lib/systemd/systemd # readlink /proc/1/map_files/56427ecba000-00000000000056427eddc000 /lib/systemd/systemd Link: http://lkml.kernel.org/r/20180303215130.GA23480@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Cyrill Gorcunov Reviewed-by: Andrew Morton Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 4 + tools/testing/selftests/proc/.gitignore | 4 +- tools/testing/selftests/proc/Makefile | 2 + .../selftests/proc/proc-self-map-files-001.c | 82 ++++++++++++++++++ .../selftests/proc/proc-self-map-files-002.c | 85 +++++++++++++++++++ 5 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/proc/proc-self-map-files-001.c create mode 100644 tools/testing/selftests/proc/proc-self-map-files-002.c diff --git a/fs/proc/base.c b/fs/proc/base.c index e9e7652b77da..d413a138dc30 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1913,6 +1913,8 @@ static int dname_to_vma_addr(struct dentry *dentry, unsigned long long sval, eval; unsigned int len; + if (str[0] == '0' && str[1] != '-') + return -EINVAL; len = _parse_integer(str, 16, &sval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; @@ -1924,6 +1926,8 @@ static int dname_to_vma_addr(struct dentry *dentry, return -EINVAL; str++; + if (str[0] == '0' && str[1]) + return -EINVAL; len = _parse_integer(str, 16, &eval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index c648b27af2e7..e3ceb19ae99a 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -1,2 +1,4 @@ -/proc-self-mem +/proc-self-map-files-001 +/proc-self-map-files-002 /proc-self-syscall 
+/proc-self-wchan diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index ad20520910e2..1a0ce32a9786 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -1,6 +1,8 @@ CFLAGS += -Wall -O2 TEST_GEN_PROGS := +TEST_GEN_PROGS += proc-self-map-files-001 +TEST_GEN_PROGS += proc-self-map-files-002 TEST_GEN_PROGS += proc-self-syscall TEST_GEN_PROGS += proc-self-wchan diff --git a/tools/testing/selftests/proc/proc-self-map-files-001.c b/tools/testing/selftests/proc/proc-self-map-files-001.c new file mode 100644 index 000000000000..af1d0a6af810 --- /dev/null +++ b/tools/testing/selftests/proc/proc-self-map-files-001.c @@ -0,0 +1,82 @@ +/* + * Copyright _ 2018 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* Test readlink /proc/self/map_files/... 
*/ +#include +#include +#include +#include +#include +#include +#include +#include + +static void pass(const char *fmt, unsigned long a, unsigned long b) +{ + char name[64]; + char buf[64]; + + snprintf(name, sizeof(name), fmt, a, b); + if (readlink(name, buf, sizeof(buf)) == -1) + exit(1); +} + +static void fail(const char *fmt, unsigned long a, unsigned long b) +{ + char name[64]; + char buf[64]; + + snprintf(name, sizeof(name), fmt, a, b); + if (readlink(name, buf, sizeof(buf)) == -1 && errno == ENOENT) + return; + exit(1); +} + +int main(void) +{ + const unsigned int PAGE_SIZE = sysconf(_SC_PAGESIZE); + void *p; + int fd; + unsigned long a, b; + + fd = open("/dev/zero", O_RDONLY); + if (fd == -1) + return 1; + + p = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE, fd, 0); + if (p == MAP_FAILED) + return 1; + + a = (unsigned long)p; + b = (unsigned long)p + PAGE_SIZE; + + pass("/proc/self/map_files/%lx-%lx", a, b); + fail("/proc/self/map_files/ %lx-%lx", a, b); + fail("/proc/self/map_files/%lx -%lx", a, b); + fail("/proc/self/map_files/%lx- %lx", a, b); + fail("/proc/self/map_files/%lx-%lx ", a, b); + fail("/proc/self/map_files/0%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-0%lx", a, b); + if (sizeof(long) == 4) { + fail("/proc/self/map_files/100000000%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-100000000%lx", a, b); + } else if (sizeof(long) == 8) { + fail("/proc/self/map_files/10000000000000000%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-10000000000000000%lx", a, b); + } else + return 1; + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-self-map-files-002.c b/tools/testing/selftests/proc/proc-self-map-files-002.c new file mode 100644 index 000000000000..aebf4be56111 --- /dev/null +++ b/tools/testing/selftests/proc/proc-self-map-files-002.c @@ -0,0 +1,85 @@ +/* + * Copyright _ 2018 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, 
provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* Test readlink /proc/self/map_files/... with address 0. */ +#include +#include +#include +#include +#include +#include +#include +#include + +static void pass(const char *fmt, unsigned long a, unsigned long b) +{ + char name[64]; + char buf[64]; + + snprintf(name, sizeof(name), fmt, a, b); + if (readlink(name, buf, sizeof(buf)) == -1) + exit(1); +} + +static void fail(const char *fmt, unsigned long a, unsigned long b) +{ + char name[64]; + char buf[64]; + + snprintf(name, sizeof(name), fmt, a, b); + if (readlink(name, buf, sizeof(buf)) == -1 && errno == ENOENT) + return; + exit(1); +} + +int main(void) +{ + const unsigned int PAGE_SIZE = sysconf(_SC_PAGESIZE); + void *p; + int fd; + unsigned long a, b; + + fd = open("/dev/zero", O_RDONLY); + if (fd == -1) + return 1; + + p = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, fd, 0); + if (p == MAP_FAILED) { + if (errno == EPERM) + return 2; + return 1; + } + + a = (unsigned long)p; + b = (unsigned long)p + PAGE_SIZE; + + pass("/proc/self/map_files/%lx-%lx", a, b); + fail("/proc/self/map_files/ %lx-%lx", a, b); + fail("/proc/self/map_files/%lx -%lx", a, b); + fail("/proc/self/map_files/%lx- %lx", a, b); + fail("/proc/self/map_files/%lx-%lx ", a, b); + fail("/proc/self/map_files/0%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-0%lx", a, b); + if (sizeof(long) == 4) { + 
fail("/proc/self/map_files/100000000%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-100000000%lx", a, b); + } else if (sizeof(long) == 8) { + fail("/proc/self/map_files/10000000000000000%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-10000000000000000%lx", a, b); + } else + return 1; + + return 0; +} From 1539d584e488538451526da039fa554fdeea1177 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:57 -0700 Subject: [PATCH 071/140] proc: register filesystem last As soon as register_filesystem() exits, filesystem can be mounted. It is better to present fully operational /proc. Of course it doesn't matter because /proc is not modular but do it anyway. Drop error check, it should be handled by panicking. Link: http://lkml.kernel.org/r/20180309222709.GA3843@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/root.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/proc/root.c b/fs/proc/root.c index cd45abfbb6cc..9e99204a0704 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -123,14 +123,8 @@ static struct file_system_type proc_fs_type = { void __init proc_root_init(void) { - int err; - proc_init_kmemcache(); set_proc_pid_nlink(); - err = register_filesystem(&proc_fs_type); - if (err) - return; - proc_self_init(); proc_thread_self_init(); proc_symlink("mounts", NULL, "self/mounts"); @@ -146,6 +140,8 @@ void __init proc_root_init(void) proc_tty_init(); proc_mkdir("bus", NULL); proc_sys_init(); + + register_filesystem(&proc_fs_type); } static int proc_root_getattr(const struct path *path, struct kstat *stat, From 58c501aab3e54b99eac632a2f5ab5f53e0c27948 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:32:01 -0700 Subject: [PATCH 072/140] proc: faster /proc/cmdline Use seq_puts() and skip format string processing. 
Link: http://lkml.kernel.org/r/20180309222948.GB3843@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/cmdline.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 403cbb12a6e9..8233e7af9389 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -6,7 +6,8 @@ static int cmdline_proc_show(struct seq_file *m, void *v) { - seq_printf(m, "%s\n", saved_command_line); + seq_puts(m, saved_command_line); + seq_putc(m, '\n'); return 0; } From fe079a5e102cc59b6c2b66a41e39c624ce284519 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:32:05 -0700 Subject: [PATCH 073/140] proc: do mmput ASAP for /proc/*/map_files mm_struct is not needed while printing as all the data was already extracted. Link: http://lkml.kernel.org/r/20180309223120.GC3843@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index d413a138dc30..eafa39a3a88c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2211,6 +2211,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) } } up_read(&mm->mmap_sem); + mmput(mm); for (i = 0; i < nr_files; i++) { char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ @@ -2228,7 +2229,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) } if (fa) flex_array_free(fa); - mmput(mm); out_put_task: put_task_struct(task); From 5de3d401b79486b9323e1be30e3a34c2437b8800 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:42:23 -0700 Subject: [PATCH 074/140] proc: add selftest for last field of /proc/loadavg Test fork counter formerly known as ->last_pid, the only part of /proc/loadavg which can be tested. 
Testing in init pid namespace is not reliable because of background activity. Link: http://lkml.kernel.org/r/20180311152241.GA26247@avx2 Signed-off-by: Alexey Dobriyan Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/proc/.gitignore | 1 + tools/testing/selftests/proc/Makefile | 1 + .../testing/selftests/proc/proc-loadavg-001.c | 83 +++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 tools/testing/selftests/proc/proc-loadavg-001.c diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index e3ceb19ae99a..78c40f5aa706 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -1,3 +1,4 @@ +/proc-loadavg-001 /proc-self-map-files-001 /proc-self-map-files-002 /proc-self-syscall diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 1a0ce32a9786..d7f70583005d 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -1,6 +1,7 @@ CFLAGS += -Wall -O2 TEST_GEN_PROGS := +TEST_GEN_PROGS += proc-loadavg-001 TEST_GEN_PROGS += proc-self-map-files-001 TEST_GEN_PROGS += proc-self-map-files-002 TEST_GEN_PROGS += proc-self-syscall diff --git a/tools/testing/selftests/proc/proc-loadavg-001.c b/tools/testing/selftests/proc/proc-loadavg-001.c new file mode 100644 index 000000000000..e38ad6d94d4b --- /dev/null +++ b/tools/testing/selftests/proc/proc-loadavg-001.c @@ -0,0 +1,83 @@ +/* + * Copyright _ 2018 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* Test that /proc/loadavg correctly reports last pid in pid namespace. */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +int main(void) +{ + pid_t pid; + int wstatus; + + if (unshare(CLONE_NEWPID) == -1) { + if (errno == ENOSYS || errno == EPERM) + return 2; + return 1; + } + + pid = fork(); + if (pid == -1) + return 1; + if (pid == 0) { + char buf[128], *p; + int fd; + ssize_t rv; + + fd = open("/proc/loadavg" , O_RDONLY); + if (fd == -1) + return 1; + rv = read(fd, buf, sizeof(buf)); + if (rv < 3) + return 1; + p = buf + rv; + + /* pid 1 */ + if (!(p[-3] == ' ' && p[-2] == '1' && p[-1] == '\n')) + return 1; + + pid = fork(); + if (pid == -1) + return 1; + if (pid == 0) + return 0; + if (waitpid(pid, NULL, 0) == -1) + return 1; + + lseek(fd, 0, SEEK_SET); + rv = read(fd, buf, sizeof(buf)); + if (rv < 3) + return 1; + p = buf + rv; + + /* pid 2 */ + if (!(p[-3] == ' ' && p[-2] == '2' && p[-1] == '\n')) + return 1; + + return 0; + } + + if (waitpid(pid, &wstatus, 0) == -1) + return 1; + if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) + return 0; + return 1; +} From b77d70db659ad3aa662c80cff4475e773a531fbe Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:32:11 -0700 Subject: [PATCH 075/140] proc: reject "." and ".." as filenames Various subsystems can create files and directories in /proc with names directly controlled by userspace. Which means "/", "." and ".." are no-no. "/" split is already taken care of, do the other 2 prohibited names. 
Link: http://lkml.kernel.org/r/20180310001223.GB12443@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Florian Westphal Cc: Eric Dumazet Cc: Cong Wang Cc: Pavel Machek Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 800247a256c9..5dad2e89007b 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -366,6 +366,14 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, WARN(1, "name len %u\n", qstr.len); return NULL; } + if (qstr.len == 1 && fn[0] == '.') { + WARN(1, "name '.'\n"); + return NULL; + } + if (qstr.len == 2 && fn[0] == '.' && fn[1] == '.') { + WARN(1, "name '..'\n"); + return NULL; + } if (*parent == &proc_root && name_to_int(&qstr) != ~0U) { WARN(1, "create '/proc/%s' by hand\n", qstr.name); return NULL; From 9cdd83e3100651af41631fb66838adcd24032f2a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:32:14 -0700 Subject: [PATCH 076/140] proc: switch struct proc_dir_entry::count to refcount ->count is honest reference count unlike ->in_use. 
Link: http://lkml.kernel.org/r/20180313174550.GA4332@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 4 ++-- fs/proc/internal.h | 5 +++-- fs/proc/root.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 5dad2e89007b..fc0333fd5676 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -402,7 +402,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, ent->mode = mode; ent->nlink = nlink; ent->subdir = RB_ROOT_CACHED; - atomic_set(&ent->count, 1); + refcount_set(&ent->refcnt, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); proc_set_user(ent, (*parent)->uid, (*parent)->gid); @@ -553,7 +553,7 @@ EXPORT_SYMBOL(proc_set_user); void pde_put(struct proc_dir_entry *pde) { - if (atomic_dec_and_test(&pde->count)) { + if (refcount_dec_and_test(&pde->refcnt)) { proc_free_inum(pde->low_ino); pde_free(pde); } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 0ead00771384..b7024f174778 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -36,7 +37,7 @@ struct proc_dir_entry { * negative -> it's going away RSN */ atomic_t in_use; - atomic_t count; /* use count */ + refcount_t refcnt; struct list_head pde_openers; /* who did ->open, but not ->release */ /* protects ->pde_openers and all struct pde_opener instances */ spinlock_t pde_unload_lock; @@ -168,7 +169,7 @@ int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry * static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) { - atomic_inc(&pde->count); + refcount_inc(&pde->refcnt); return pde; } extern void pde_put(struct proc_dir_entry *); diff --git a/fs/proc/root.c b/fs/proc/root.c index 9e99204a0704..76c996457ff9 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -199,7 +199,7 @@ struct proc_dir_entry proc_root 
= { .namelen = 5, .mode = S_IFDIR | S_IRUGO | S_IXUGO, .nlink = 2, - .count = ATOMIC_INIT(1), + .refcnt = REFCOUNT_INIT(1), .proc_iops = &proc_root_inode_operations, .proc_fops = &proc_root_operations, .parent = &proc_root, From 05c3f29283af9e3da0ab7414f666cb37f530950a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:46:19 -0700 Subject: [PATCH 077/140] proc: selftests: shotgun testing of read/readdir/readlink/write Perform reads with nearly everything in /proc, and some writing as well. Hopefully memleak checkers and KASAN will find something. [adobriyan@gmail.com: /proc/kmsg can and will block if read under root] Link: http://lkml.kernel.org/r/20180316232147.GA20146@avx2 Signed-off-by: Alexey Dobriyan [adobriyan@gmail.com: /proc/sysrq-trigger lives on the ground floor] Link: http://lkml.kernel.org/r/20180317164911.GA3445@avx2 Link: http://lkml.kernel.org/r/20180315201251.GA12396@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/proc/.gitignore | 1 + tools/testing/selftests/proc/Makefile | 1 + tools/testing/selftests/proc/read.c | 147 ++++++++++++++++++++++++ 3 files changed, 149 insertions(+) create mode 100644 tools/testing/selftests/proc/read.c diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 78c40f5aa706..5627df81ade9 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -3,3 +3,4 @@ /proc-self-map-files-002 /proc-self-syscall /proc-self-wchan +/read diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index d7f70583005d..312a3989820c 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -6,5 +6,6 @@ TEST_GEN_PROGS += proc-self-map-files-001 TEST_GEN_PROGS += proc-self-map-files-002 TEST_GEN_PROGS += proc-self-syscall TEST_GEN_PROGS += proc-self-wchan +TEST_GEN_PROGS += read include 
../lib.mk diff --git a/tools/testing/selftests/proc/read.c b/tools/testing/selftests/proc/read.c new file mode 100644 index 000000000000..12e397f78592 --- /dev/null +++ b/tools/testing/selftests/proc/read.c @@ -0,0 +1,147 @@ +/* + * Copyright _ 2018 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ +// Test +// 1) read of every file in /proc +// 2) readlink of every symlink in /proc +// 3) recursively (1) + (2) for every directory in /proc +// 4) write to /proc/*/clear_refs and /proc/*/task/*/clear_refs +// 5) write to /proc/sysrq-trigger +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline bool streq(const char *s1, const char *s2) +{ + return strcmp(s1, s2) == 0; +} + +static struct dirent *xreaddir(DIR *d) +{ + struct dirent *de; + + errno = 0; + de = readdir(d); + if (!de && errno != 0) { + exit(1); + } + return de; +} + +static void f_reg(DIR *d, const char *filename) +{ + char buf[4096]; + int fd; + ssize_t rv; + + /* read from /proc/kmsg can block */ + fd = openat(dirfd(d), filename, O_RDONLY|O_NONBLOCK); + if (fd == -1) + return; + rv = read(fd, buf, sizeof(buf)); + assert((0 <= rv && rv <= sizeof(buf)) || rv == -1); + close(fd); +} + +static void f_reg_write(DIR *d, const char *filename, const char *buf, size_t len) +{ + int fd; + ssize_t rv; + + fd = openat(dirfd(d), filename, O_WRONLY); + if (fd == -1) + return; + rv = write(fd, buf, len); + assert((0 <= rv && rv <= len) || rv == -1); + close(fd); +} + +static void f_lnk(DIR *d, const char *filename) +{ + char buf[4096]; + ssize_t rv; + + rv = readlinkat(dirfd(d), filename, buf, sizeof(buf)); + assert((0 <= rv && rv <= sizeof(buf)) || rv == -1); +} + +static void f(DIR *d, unsigned int level) +{ + struct dirent *de; + + de = xreaddir(d); + assert(de->d_type == DT_DIR); + assert(streq(de->d_name, ".")); + + de = xreaddir(d); + assert(de->d_type == DT_DIR); + assert(streq(de->d_name, "..")); + + while ((de = xreaddir(d))) { + assert(!streq(de->d_name, ".")); + assert(!streq(de->d_name, "..")); + + switch (de->d_type) { + DIR *dd; + int fd; + + case DT_REG: + if (level == 0 && streq(de->d_name, "sysrq-trigger")) { + f_reg_write(d, de->d_name, "h", 1); + } else if (level == 1 && streq(de->d_name, "clear_refs")) { + 
f_reg_write(d, de->d_name, "1", 1); + } else if (level == 3 && streq(de->d_name, "clear_refs")) { + f_reg_write(d, de->d_name, "1", 1); + } else { + f_reg(d, de->d_name); + } + break; + case DT_DIR: + fd = openat(dirfd(d), de->d_name, O_DIRECTORY|O_RDONLY); + if (fd == -1) + continue; + dd = fdopendir(fd); + if (!dd) + continue; + f(dd, level + 1); + closedir(dd); + break; + case DT_LNK: + f_lnk(d, de->d_name); + break; + default: + assert(0); + } + } +} + +int main(void) +{ + DIR *d; + + d = opendir("/proc"); + if (!d) + return 2; + f(d, 0); + return 0; +} From 4f1134370a29a5f2d0f4b4be4c5e2fddd38f0f9d Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:32:20 -0700 Subject: [PATCH 078/140] proc: use slower rb_first() In the "open+read+close" usecase typical for /proc, dentry is looked up successfully on open only to be killed in dput() on close. In fact dentries which aren't /proc/*/... and /proc/sys/* were almost NEVER CACHED. Simple printk in proc_lookup_de() shows that. Now that ->delete hook intelligently picks which dentries should live in dcache and which should not, rbtree caching is not necessary as dcache does its job, at last! As a side effect, struct proc_dir_entry shrinks by one pointer which can go into inline name.
Link: http://lkml.kernel.org/r/20180314231032.GA15854@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Davidlohr Bueso Cc: Peter Zijlstra Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 26 ++++++++++++-------------- fs/proc/internal.h | 6 +++--- fs/proc/proc_net.c | 2 +- fs/proc/root.c | 2 +- 4 files changed, 17 insertions(+), 19 deletions(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index fc0333fd5676..04c4804cbdef 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -52,8 +52,8 @@ static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) { - return rb_entry_safe(rb_first_cached(&dir->subdir), - struct proc_dir_entry, subdir_node); + return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry, + subdir_node); } static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) @@ -66,7 +66,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, const char *name, unsigned int len) { - struct rb_node *node = dir->subdir.rb_root.rb_node; + struct rb_node *node = dir->subdir.rb_node; while (node) { struct proc_dir_entry *de = rb_entry(node, @@ -87,9 +87,8 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, static bool pde_subdir_insert(struct proc_dir_entry *dir, struct proc_dir_entry *de) { - struct rb_root_cached *root = &dir->subdir; - struct rb_node **new = &root->rb_root.rb_node, *parent = NULL; - bool leftmost = true; + struct rb_root *root = &dir->subdir; + struct rb_node **new = &root->rb_node, *parent = NULL; /* Figure out where to put new node */ while (*new) { @@ -101,16 +100,15 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, parent = *new; if (result < 0) new = &(*new)->rb_left; - else if (result > 0) { + else if (result > 0) new = &(*new)->rb_right; - leftmost = false; - } else + else return false; } /* Add new node and rebalance 
tree. */ rb_link_node(&de->subdir_node, parent, new); - rb_insert_color_cached(&de->subdir_node, root, leftmost); + rb_insert_color(&de->subdir_node, root); return true; } @@ -401,7 +399,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, ent->namelen = qstr.len; ent->mode = mode; ent->nlink = nlink; - ent->subdir = RB_ROOT_CACHED; + ent->subdir = RB_ROOT; refcount_set(&ent->refcnt, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); @@ -577,7 +575,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = pde_subdir_find(parent, fn, len); if (de) - rb_erase_cached(&de->subdir_node, &parent->subdir); + rb_erase(&de->subdir_node, &parent->subdir); write_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); @@ -614,13 +612,13 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) write_unlock(&proc_subdir_lock); return -ENOENT; } - rb_erase_cached(&root->subdir_node, &parent->subdir); + rb_erase(&root->subdir_node, &parent->subdir); de = root; while (1) { next = pde_subdir_first(de); if (next) { - rb_erase_cached(&next->subdir_node, &de->subdir); + rb_erase(&next->subdir_node, &de->subdir); de = next; continue; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index b7024f174778..0f1692e63cb6 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -51,15 +51,15 @@ struct proc_dir_entry { kgid_t gid; loff_t size; struct proc_dir_entry *parent; - struct rb_root_cached subdir; + struct rb_root subdir; struct rb_node subdir_node; char *name; umode_t mode; u8 namelen; #ifdef CONFIG_64BIT -#define SIZEOF_PDE_INLINE_NAME (192-147) +#define SIZEOF_PDE_INLINE_NAME (192-139) #else -#define SIZEOF_PDE_INLINE_NAME (128-91) +#define SIZEOF_PDE_INLINE_NAME (128-87) #endif char inline_name[SIZEOF_PDE_INLINE_NAME]; } __randomize_layout; diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index e5fe3d400737..1763f370489d 100644 --- a/fs/proc/proc_net.c +++ 
b/fs/proc/proc_net.c @@ -196,7 +196,7 @@ static __net_init int proc_net_ns_init(struct net *net) if (!netd) goto out; - netd->subdir = RB_ROOT_CACHED; + netd->subdir = RB_ROOT; netd->data = net; netd->nlink = 2; netd->namelen = 3; diff --git a/fs/proc/root.c b/fs/proc/root.c index 76c996457ff9..61b7340b357a 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -203,7 +203,7 @@ struct proc_dir_entry proc_root = { .proc_iops = &proc_root_inode_operations, .proc_fops = &proc_root_operations, .parent = &proc_root, - .subdir = RB_ROOT_CACHED, + .subdir = RB_ROOT, .name = proc_root.inline_name, .inline_name = "/proc", }; From 1f5bd0547654ada423b184e22f320d76c0fac49e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:43:28 -0700 Subject: [PATCH 079/140] proc: selftests: test /proc/uptime The only tests I could come up with for /proc/uptime are: - test that values increase monotonically for 1 second, - bounce around CPUs and test the same thing. Avoid glibc like plague for affinity given patches like this: https://marc.info/?l=linux-kernel&m=152130031912594&w=4 Link: http://lkml.kernel.org/r/20180317165235.GB3445@avx2 Signed-off-by: Alexey Dobriyan Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/proc/.gitignore | 2 + tools/testing/selftests/proc/Makefile | 2 + .../testing/selftests/proc/proc-uptime-001.c | 45 +++++++++++ .../testing/selftests/proc/proc-uptime-002.c | 79 +++++++++++++++++++ tools/testing/selftests/proc/proc-uptime.h | 74 +++++++++++++++++ 5 files changed, 202 insertions(+) create mode 100644 tools/testing/selftests/proc/proc-uptime-001.c create mode 100644 tools/testing/selftests/proc/proc-uptime-002.c create mode 100644 tools/testing/selftests/proc/proc-uptime.h diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 5627df81ade9..6c16f77c722c 100644 --- a/tools/testing/selftests/proc/.gitignore +++ 
b/tools/testing/selftests/proc/.gitignore @@ -3,4 +3,6 @@ /proc-self-map-files-002 /proc-self-syscall /proc-self-wchan +/proc-uptime-001 +/proc-uptime-002 /read diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 312a3989820c..dbb87e56264c 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -6,6 +6,8 @@ TEST_GEN_PROGS += proc-self-map-files-001 TEST_GEN_PROGS += proc-self-map-files-002 TEST_GEN_PROGS += proc-self-syscall TEST_GEN_PROGS += proc-self-wchan +TEST_GEN_PROGS += proc-uptime-001 +TEST_GEN_PROGS += proc-uptime-002 TEST_GEN_PROGS += read include ../lib.mk diff --git a/tools/testing/selftests/proc/proc-uptime-001.c b/tools/testing/selftests/proc/proc-uptime-001.c new file mode 100644 index 000000000000..303f26092306 --- /dev/null +++ b/tools/testing/selftests/proc/proc-uptime-001.c @@ -0,0 +1,45 @@ +/* + * Copyright _ 2018 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +// Test that values in /proc/uptime increment monotonically. 
+#undef NDEBUG +#include +#include +#include +#include +#include + +#include "proc-uptime.h" + +int main(void) +{ + uint64_t start, u0, u1, i0, i1; + int fd; + + fd = open("/proc/uptime", O_RDONLY); + assert(fd >= 0); + + proc_uptime(fd, &u0, &i0); + start = u0; + do { + proc_uptime(fd, &u1, &i1); + assert(u1 >= u0); + assert(i1 >= i0); + u0 = u1; + i0 = i1; + } while (u1 - start < 100); + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-uptime-002.c b/tools/testing/selftests/proc/proc-uptime-002.c new file mode 100644 index 000000000000..0cb79e1f1674 --- /dev/null +++ b/tools/testing/selftests/proc/proc-uptime-002.c @@ -0,0 +1,79 @@ +/* + * Copyright © 2018 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +// Test that values in /proc/uptime increment monotonically +// while shifting across CPUs.
+#define _GNU_SOURCE +#undef NDEBUG +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "proc-uptime.h" + +static inline int sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned long *m) +{ + return syscall(SYS_sched_getaffinity, pid, len, m); +} + +static inline int sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long *m) +{ + return syscall(SYS_sched_setaffinity, pid, len, m); +} + +int main(void) +{ + unsigned int len; + unsigned long *m; + unsigned int cpu; + uint64_t u0, u1, i0, i1; + int fd; + + /* find out "nr_cpu_ids" */ + m = NULL; + len = 0; + do { + len += sizeof(unsigned long); + free(m); + m = malloc(len); + } while (sys_sched_getaffinity(0, len, m) == -EINVAL); + + fd = open("/proc/uptime", O_RDONLY); + assert(fd >= 0); + + proc_uptime(fd, &u0, &i0); + for (cpu = 0; cpu < len * 8; cpu++) { + memset(m, 0, len); + m[cpu / (8 * sizeof(unsigned long))] |= 1UL << (cpu % (8 * sizeof(unsigned long))); + + /* CPU might not exist, ignore error */ + sys_sched_setaffinity(0, len, m); + + proc_uptime(fd, &u1, &i1); + assert(u1 >= u0); + assert(i1 >= i0); + u0 = u1; + i0 = i1; + } + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-uptime.h b/tools/testing/selftests/proc/proc-uptime.h new file mode 100644 index 000000000000..d584419f50a7 --- /dev/null +++ b/tools/testing/selftests/proc/proc-uptime.h @@ -0,0 +1,74 @@ +/* + * Copyright © 2018 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#undef NDEBUG +#include +#include +#include +#include +#include + +static unsigned long long xstrtoull(const char *p, char **end) +{ + if (*p == '0') { + *end = (char *)p + 1; + return 0; + } else if ('1' <= *p && *p <= '9') { + unsigned long long val; + + errno = 0; + val = strtoull(p, end, 10); + assert(errno == 0); + return val; + } else + assert(0); +} + +static void proc_uptime(int fd, uint64_t *uptime, uint64_t *idle) +{ + uint64_t val1, val2; + char buf[64], *p; + ssize_t rv; + + /* save "p < end" checks */ + memset(buf, 0, sizeof(buf)); + rv = pread(fd, buf, sizeof(buf), 0); + assert(0 <= rv && rv <= sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + + p = buf; + + val1 = xstrtoull(p, &p); + assert(p[0] == '.'); + assert('0' <= p[1] && p[1] <= '9'); + assert('0' <= p[2] && p[2] <= '9'); + assert(p[3] == ' '); + + val2 = (p[1] - '0') * 10 + p[2] - '0'; + *uptime = val1 * 100 + val2; + + p += 4; + + val1 = xstrtoull(p, &p); + assert(p[0] == '.'); + assert('0' <= p[1] && p[1] <= '9'); + assert('0' <= p[2] && p[2] <= '9'); + assert(p[3] == '\n'); + + val2 = (p[1] - '0') * 10 + p[2] - '0'; + *idle = val1 * 100 + val2; + + assert(p + 4 == buf + rv); +} From 47d4b263a2f7324fb3cb641ca00b2725dd12dea0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Apr 2018 16:32:26 -0700 Subject: [PATCH 080/140] taint: convert to indexed initialization This converts to using indexed initializers instead of comments, adds a comment on why the taint flags can't be an enum, and makes sure that no one forgets to update the taint_flags when adding new bits.
Link: http://lkml.kernel.org/r/1519084390-43867-2-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Reviewed-by: Andrew Morton Cc: Al Viro Cc: Alexey Dobriyan Cc: Jonathan Corbet Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 + kernel/panic.c | 36 +++++++++++++++++++----------------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 98273343bd45..086e8e80f765 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -544,6 +544,7 @@ extern enum system_states { SYSTEM_RESTART, } system_state; +/* This cannot be an enum because some may be used in assembly source. */ #define TAINT_PROPRIETARY_MODULE 0 #define TAINT_FORCED_MODULE 1 #define TAINT_CPU_OUT_OF_SPEC 2 diff --git a/kernel/panic.c b/kernel/panic.c index 6c3b08cd1139..af4cfa8eda22 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -308,23 +308,23 @@ EXPORT_SYMBOL(panic); * is being removed anyway. 
*/ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { - { 'P', 'G', true }, /* TAINT_PROPRIETARY_MODULE */ - { 'F', ' ', true }, /* TAINT_FORCED_MODULE */ - { 'S', ' ', false }, /* TAINT_CPU_OUT_OF_SPEC */ - { 'R', ' ', false }, /* TAINT_FORCED_RMMOD */ - { 'M', ' ', false }, /* TAINT_MACHINE_CHECK */ - { 'B', ' ', false }, /* TAINT_BAD_PAGE */ - { 'U', ' ', false }, /* TAINT_USER */ - { 'D', ' ', false }, /* TAINT_DIE */ - { 'A', ' ', false }, /* TAINT_OVERRIDDEN_ACPI_TABLE */ - { 'W', ' ', false }, /* TAINT_WARN */ - { 'C', ' ', true }, /* TAINT_CRAP */ - { 'I', ' ', false }, /* TAINT_FIRMWARE_WORKAROUND */ - { 'O', ' ', true }, /* TAINT_OOT_MODULE */ - { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */ - { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */ - { 'K', ' ', true }, /* TAINT_LIVEPATCH */ - { 'X', ' ', true }, /* TAINT_AUX */ + [ TAINT_PROPRIETARY_MODULE ] = { 'P', 'G', true }, + [ TAINT_FORCED_MODULE ] = { 'F', ' ', true }, + [ TAINT_CPU_OUT_OF_SPEC ] = { 'S', ' ', false }, + [ TAINT_FORCED_RMMOD ] = { 'R', ' ', false }, + [ TAINT_MACHINE_CHECK ] = { 'M', ' ', false }, + [ TAINT_BAD_PAGE ] = { 'B', ' ', false }, + [ TAINT_USER ] = { 'U', ' ', false }, + [ TAINT_DIE ] = { 'D', ' ', false }, + [ TAINT_OVERRIDDEN_ACPI_TABLE ] = { 'A', ' ', false }, + [ TAINT_WARN ] = { 'W', ' ', false }, + [ TAINT_CRAP ] = { 'C', ' ', true }, + [ TAINT_FIRMWARE_WORKAROUND ] = { 'I', ' ', false }, + [ TAINT_OOT_MODULE ] = { 'O', ' ', true }, + [ TAINT_UNSIGNED_MODULE ] = { 'E', ' ', true }, + [ TAINT_SOFTLOCKUP ] = { 'L', ' ', false }, + [ TAINT_LIVEPATCH ] = { 'K', ' ', true }, + [ TAINT_AUX ] = { 'X', ' ', true }, }; /** @@ -354,6 +354,8 @@ const char *print_tainted(void) { static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")]; + BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT); + if (tainted_mask) { char *s; int i; From 9c4560e5bbd8c839c8986f79ef536aa07bd77ec7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Apr 2018 16:32:29 -0700 Subject: [PATCH 
081/140] taint: consolidate documentation This consolidates the taint bit documentation into a single place with both numeric and letter values. Additionally adds the missing TAINT_AUX documentation. Link: http://lkml.kernel.org/r/1519084390-43867-3-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Reviewed-by: Andrew Morton Cc: Al Viro Cc: Alexey Dobriyan Cc: Jonathan Corbet Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 51 +++++++++++++++++---------------- kernel/panic.c | 23 +++------------ 2 files changed, 30 insertions(+), 44 deletions(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 412314eebda6..4a890c7fb6c3 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -964,32 +964,33 @@ detect a hard lockup condition. tainted: -Non-zero if the kernel has been tainted. Numeric values, which -can be ORed together: +Non-zero if the kernel has been tainted. Numeric values, which can be +ORed together. The letters are seen in "Tainted" line of Oops reports. - 1 - A module with a non-GPL license has been loaded, this - includes modules with no license. - Set by modutils >= 2.4.9 and module-init-tools. - 2 - A module was force loaded by insmod -f. - Set by modutils >= 2.4.9 and module-init-tools. - 4 - Unsafe SMP processors: SMP with CPUs not designed for SMP. - 8 - A module was forcibly unloaded from the system by rmmod -f. - 16 - A hardware machine check error occurred on the system. - 32 - A bad page was discovered on the system. - 64 - The user has asked that the system be marked "tainted". This - could be because they are running software that directly modifies - the hardware, or for other reasons. - 128 - The system has died. - 256 - The ACPI DSDT has been overridden with one supplied by the user - instead of using the one provided by the hardware. - 512 - A kernel warning has occurred. 
-1024 - A module from drivers/staging was loaded. -2048 - The system is working around a severe firmware bug. -4096 - An out-of-tree module has been loaded. -8192 - An unsigned module has been loaded in a kernel supporting module - signature. -16384 - A soft lockup has previously occurred on the system. -32768 - The kernel has been live patched. + 1 (P): A module with a non-GPL license has been loaded, this + includes modules with no license. + Set by modutils >= 2.4.9 and module-init-tools. + 2 (F): A module was force loaded by insmod -f. + Set by modutils >= 2.4.9 and module-init-tools. + 4 (S): Unsafe SMP processors: SMP with CPUs not designed for SMP. + 8 (R): A module was forcibly unloaded from the system by rmmod -f. + 16 (M): A hardware machine check error occurred on the system. + 32 (B): A bad page was discovered on the system. + 64 (U): The user has asked that the system be marked "tainted". This + could be because they are running software that directly modifies + the hardware, or for other reasons. + 128 (D): The system has died. + 256 (A): The ACPI DSDT has been overridden with one supplied by the user + instead of using the one provided by the hardware. + 512 (W): A kernel warning has occurred. + 1024 (C): A module from drivers/staging was loaded. + 2048 (I): The system is working around a severe firmware bug. + 4096 (O): An out-of-tree module has been loaded. + 8192 (E): An unsigned module has been loaded in a kernel supporting module + signature. + 16384 (L): A soft lockup has previously occurred on the system. + 32768 (K): The kernel has been live patched. + 65536 (X): Auxiliary taint, defined and used by for distros. 
============================================================== diff --git a/kernel/panic.c b/kernel/panic.c index af4cfa8eda22..5ceb9cbec4a2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -328,27 +328,12 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { }; /** - * print_tainted - return a string to represent the kernel taint state. + * print_tainted - return a string to represent the kernel taint state. * - * 'P' - Proprietary module has been loaded. - * 'F' - Module has been forcibly loaded. - * 'S' - SMP with CPUs not designed for SMP. - * 'R' - User forced a module unload. - * 'M' - System experienced a machine check exception. - * 'B' - System has hit bad_page. - * 'U' - Userspace-defined naughtiness. - * 'D' - Kernel has oopsed before - * 'A' - ACPI table overridden. - * 'W' - Taint on warning. - * 'C' - modules from drivers/staging are loaded. - * 'I' - Working around severe firmware bug. - * 'O' - Out-of-tree module has been loaded. - * 'E' - Unsigned module has been loaded. - * 'L' - A soft lockup has previously occurred. - * 'K' - Kernel has been live patched. - * 'X' - Auxiliary taint, for distros' use. + * For individual taint flag meanings, see Documentation/sysctl/kernel.txt * - * The string is overwritten by the next call to print_tainted(). + * The string is overwritten by the next call to print_tainted(), + * but is always NULL terminated. */ const char *print_tainted(void) { From bc4f2f5469ac2a52affadc4c00c1276d76151a39 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Apr 2018 16:32:33 -0700 Subject: [PATCH 082/140] taint: add taint for randstruct Since the randstruct plugin can intentionally produce extremely unusual kernel structure layouts (even performance pathological ones), some maintainers want to be able to trivially determine if an Oops is coming from a randstruct-built kernel, so as to keep their sanity when debugging. This adds the new flag and initializes tainted_mask immediately when built with randstruct.
Link: http://lkml.kernel.org/r/1519084390-43867-4-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Reviewed-by: Andrew Morton Cc: Al Viro Cc: Alexey Dobriyan Cc: Jonathan Corbet Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 1 + include/linux/kernel.h | 3 ++- kernel/panic.c | 4 +++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 4a890c7fb6c3..eded671d55eb 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -991,6 +991,7 @@ ORed together. The letters are seen in "Tainted" line of Oops reports. 16384 (L): A soft lockup has previously occurred on the system. 32768 (K): The kernel has been live patched. 65536 (X): Auxiliary taint, defined and used by for distros. +131072 (T): The kernel was built with the struct randomization plugin. ============================================================== diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 086e8e80f765..6a1eb0b0aad9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -562,7 +562,8 @@ extern enum system_states { #define TAINT_SOFTLOCKUP 14 #define TAINT_LIVEPATCH 15 #define TAINT_AUX 16 -#define TAINT_FLAGS_COUNT 17 +#define TAINT_RANDSTRUCT 17 +#define TAINT_FLAGS_COUNT 18 struct taint_flag { char c_true; /* character printed when tainted */ diff --git a/kernel/panic.c b/kernel/panic.c index 5ceb9cbec4a2..42e487488554 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,7 +34,8 @@ #define PANIC_BLINK_SPD 18 int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; -static unsigned long tainted_mask; +static unsigned long tainted_mask = + IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? 
(1 << TAINT_RANDSTRUCT) : 0; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); @@ -325,6 +326,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { [ TAINT_SOFTLOCKUP ] = { 'L', ' ', false }, [ TAINT_LIVEPATCH ] = { 'K', ' ', true }, [ TAINT_AUX ] = { 'X', ' ', true }, + [ TAINT_RANDSTRUCT ] = { 'T', ' ', true }, }; /** From 3ea056c50476f877f8bceb560ab69871098cb3a9 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:32:36 -0700 Subject: [PATCH 083/140] uts: create "struct uts_namespace" from kmem_cache So "struct uts_namespace" can enjoy fine-grained SLAB debugging and usercopy protection. I'd prefer shorter name "utsns" but there is "user_namespace" already. Link: http://lkml.kernel.org/r/20180228215158.GA23146@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: "Eric W. Biederman" Cc: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/utsname.h | 6 ++++++ init/main.c | 2 ++ kernel/utsname.c | 20 ++++++++++++++++---- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/include/linux/utsname.h b/include/linux/utsname.h index c8060c2ecd04..44429d9142ca 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -44,6 +44,8 @@ static inline void put_uts_ns(struct uts_namespace *ns) { kref_put(&ns->kref, free_uts_ns); } + +void uts_ns_init(void); #else static inline void get_uts_ns(struct uts_namespace *ns) { @@ -61,6 +63,10 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags, return old_ns; } + +static inline void uts_ns_init(void) +{ +} #endif #ifdef CONFIG_PROC_SYSCTL diff --git a/init/main.c b/init/main.c index d499f4a80e0b..50359a3162d0 100644 --- a/init/main.c +++ b/init/main.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -706,6 +707,7 @@ asmlinkage __visible void __init start_kernel(void) cred_init(); fork_init(); proc_caches_init(); + 
uts_ns_init(); buffer_init(); key_init(); security_init(); diff --git a/kernel/utsname.c b/kernel/utsname.c index 913fe4336d2b..dcd6be1996fe 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -19,6 +19,8 @@ #include #include +static struct kmem_cache *uts_ns_cache __ro_after_init; + static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES); @@ -33,7 +35,7 @@ static struct uts_namespace *create_uts_ns(void) { struct uts_namespace *uts_ns; - uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); + uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); if (uts_ns) kref_init(&uts_ns->kref); return uts_ns; @@ -42,7 +44,7 @@ static struct uts_namespace *create_uts_ns(void) /* * Clone a new ns copying an original utsname, setting refcount to 1 * @old_ns: namespace to clone - * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise + * Return ERR_PTR(-ENOMEM) on error (failure to allocate), new ns otherwise */ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) @@ -75,7 +77,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, return ns; fail_free: - kfree(ns); + kmem_cache_free(uts_ns_cache, ns); fail_dec: dec_uts_namespaces(ucounts); fail: @@ -113,7 +115,7 @@ void free_uts_ns(struct kref *kref) dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); - kfree(ns); + kmem_cache_free(uts_ns_cache, ns); } static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) @@ -169,3 +171,13 @@ const struct proc_ns_operations utsns_operations = { .install = utsns_install, .owner = utsns_owner, }; + +void __init uts_ns_init(void) +{ + uts_ns_cache = kmem_cache_create_usercopy( + "uts_namespace", sizeof(struct uts_namespace), 0, + SLAB_PANIC|SLAB_ACCOUNT, + offsetof(struct uts_namespace, name), + sizeof_field(struct uts_namespace, name), + NULL); +} From 
d4ef8d3ff005c70f6c9e2ffea14cc65fc8fe328d Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 10 Apr 2018 16:32:40 -0700 Subject: [PATCH 084/140] clang-format: add configuration file clang-format is a tool to format C/C++/... code according to a set of rules and heuristics. Like most tools, it is not perfect nor covers every single case, but it is good enough to be helpful. In particular, it is useful for quickly re-formatting blocks of code automatically, for reviewing full files in order to spot coding style mistakes, typos and possible improvements. It is also handy for sorting ``#includes``, for aligning variables and macros, for reflowing text and other similar tasks. It also serves as a teaching tool/guide for newcomers. The tool itself has been already included in the repositories of popular Linux distributions for a long time. The rules in this file are intended for clang-format >= 4, which is easily available in most distributions. This commit adds the configuration file that contains the rules that the tool uses to know how to format the code according to the kernel coding style. This gives us several advantages: * clang-format works out of the box with reasonable defaults; avoiding that everyone has to re-do the configuration. * Everyone agrees (eventually) on what is the most useful default configuration for most of the kernel. * If it becomes commonplace among kernel developers, clang-format may feel compelled to support us better. They already recognize the Linux kernel and its style in their documentation and in one of the style sub-options. Some of clang-format's features relevant for the kernel are: * Uses clang's tooling support behind the scenes to parse and rewrite the code. It is not based on ad-hoc regexps. * Supports reasonably well the Linux kernel coding style. * Fast enough to be used at the press of a key. * There are already integrations (either built-in or third-party) for many common editors used by kernel developers (e.g. 
vim, emacs, Sublime, Atom...) that allow you to format an entire file or, more usefully, just your selection. * Able to parse unified diffs -- you can, for instance, reformat only the lines changed by a git commit. * Able to reflow text comments as well. * Widely supported and used by hundreds of developers in highly complex projects and organizations (e.g. the LLVM project itself, Chromium, WebKit, Google, Mozilla...). Therefore, it will be supported for a long time. See more information about the tool at: https://clang.llvm.org/docs/ClangFormat.html https://clang.llvm.org/docs/ClangFormatStyleOptions.html Link: http://lkml.kernel.org/r/20180318171632.qfkemw3mwbcukth6@gmail.com Signed-off-by: Miguel Ojeda Cc: Randy Dunlap Cc: Andy Whitcroft Cc: Joe Perches Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .clang-format | 428 +++++++++++++++++++++++++ .gitignore | 1 + Documentation/process/4.Coding.rst | 8 + Documentation/process/clang-format.rst | 184 +++++++++++ Documentation/process/coding-style.rst | 8 + 5 files changed, 629 insertions(+) create mode 100644 .clang-format create mode 100644 Documentation/process/clang-format.rst diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000000..faffc0d5af4e --- /dev/null +++ b/.clang-format @@ -0,0 +1,428 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# clang-format configuration file. Intended for clang-format >= 4. 
+# +# For more information, see: +# +# Documentation/process/clang-format.rst +# https://clang.llvm.org/docs/ClangFormat.html +# https://clang.llvm.org/docs/ClangFormatStyleOptions.html +# +--- +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +#AlignEscapedNewlines: Left # Unknown to clang-format-4.0 +AlignOperands: true +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: false +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + #AfterExternBlock: false # Unknown to clang-format-5.0 + BeforeCatch: false + BeforeElse: false + IndentBraces: false + #SplitEmptyFunction: true # Unknown to clang-format-4.0 + #SplitEmptyRecord: true # Unknown to clang-format-4.0 + #SplitEmptyNamespace: true # Unknown to clang-format-4.0 +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +#BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0 +BreakBeforeTernaryOperators: false +BreakConstructorInitializersBeforeComma: false +#BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0 +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +#CompactNamespaces: false # Unknown to clang-format-4.0 +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 8 +ContinuationIndentWidth: 8 +Cpp11BracedListStyle: false 
+DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +#FixNamespaceComments: false # Unknown to clang-format-4.0 + +# Taken from: +# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \ +# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \ +# | sort | uniq +ForEachMacros: + - 'apei_estatus_for_each_section' + - 'ata_for_each_dev' + - 'ata_for_each_link' + - 'ax25_for_each' + - 'ax25_uid_for_each' + - 'bio_for_each_integrity_vec' + - '__bio_for_each_segment' + - 'bio_for_each_segment' + - 'bio_for_each_segment_all' + - 'bio_list_for_each' + - 'bip_for_each_vec' + - 'blkg_for_each_descendant_post' + - 'blkg_for_each_descendant_pre' + - 'blk_queue_for_each_rl' + - 'bond_for_each_slave' + - 'bond_for_each_slave_rcu' + - 'btree_for_each_safe128' + - 'btree_for_each_safe32' + - 'btree_for_each_safe64' + - 'btree_for_each_safel' + - 'card_for_each_dev' + - 'cgroup_taskset_for_each' + - 'cgroup_taskset_for_each_leader' + - 'cpufreq_for_each_entry' + - 'cpufreq_for_each_entry_idx' + - 'cpufreq_for_each_valid_entry' + - 'cpufreq_for_each_valid_entry_idx' + - 'css_for_each_child' + - 'css_for_each_descendant_post' + - 'css_for_each_descendant_pre' + - 'device_for_each_child_node' + - 'drm_atomic_crtc_for_each_plane' + - 'drm_atomic_crtc_state_for_each_plane' + - 'drm_atomic_crtc_state_for_each_plane_state' + - 'drm_for_each_connector_iter' + - 'drm_for_each_crtc' + - 'drm_for_each_encoder' + - 'drm_for_each_encoder_mask' + - 'drm_for_each_fb' + - 'drm_for_each_legacy_plane' + - 'drm_for_each_plane' + - 'drm_for_each_plane_mask' + - 'drm_mm_for_each_hole' + - 'drm_mm_for_each_node' + - 'drm_mm_for_each_node_in_range' + - 'drm_mm_for_each_node_safe' + - 'for_each_active_drhd_unit' + - 'for_each_active_iommu' + - 'for_each_available_child_of_node' + - 'for_each_bio' + - 'for_each_board_func_rsrc' + - 'for_each_bvec' + - 'for_each_child_of_node' + - 'for_each_clear_bit' + - 'for_each_clear_bit_from' + 
- 'for_each_cmsghdr' + - 'for_each_compatible_node' + - 'for_each_console' + - 'for_each_cpu' + - 'for_each_cpu_and' + - 'for_each_cpu_not' + - 'for_each_cpu_wrap' + - 'for_each_dev_addr' + - 'for_each_dma_cap_mask' + - 'for_each_drhd_unit' + - 'for_each_dss_dev' + - 'for_each_efi_memory_desc' + - 'for_each_efi_memory_desc_in_map' + - 'for_each_endpoint_of_node' + - 'for_each_evictable_lru' + - 'for_each_fib6_node_rt_rcu' + - 'for_each_fib6_walker_rt' + - 'for_each_free_mem_range' + - 'for_each_free_mem_range_reverse' + - 'for_each_func_rsrc' + - 'for_each_hstate' + - 'for_each_if' + - 'for_each_iommu' + - 'for_each_ip_tunnel_rcu' + - 'for_each_irq_nr' + - 'for_each_lru' + - 'for_each_matching_node' + - 'for_each_matching_node_and_match' + - 'for_each_memblock' + - 'for_each_memblock_type' + - 'for_each_memcg_cache_index' + - 'for_each_mem_pfn_range' + - 'for_each_mem_range' + - 'for_each_mem_range_rev' + - 'for_each_migratetype_order' + - 'for_each_msi_entry' + - 'for_each_net' + - 'for_each_netdev' + - 'for_each_netdev_continue' + - 'for_each_netdev_continue_rcu' + - 'for_each_netdev_feature' + - 'for_each_netdev_in_bond_rcu' + - 'for_each_netdev_rcu' + - 'for_each_netdev_reverse' + - 'for_each_netdev_safe' + - 'for_each_net_rcu' + - 'for_each_new_connector_in_state' + - 'for_each_new_crtc_in_state' + - 'for_each_new_plane_in_state' + - 'for_each_new_private_obj_in_state' + - 'for_each_node' + - 'for_each_node_by_name' + - 'for_each_node_by_type' + - 'for_each_node_mask' + - 'for_each_node_state' + - 'for_each_node_with_cpus' + - 'for_each_node_with_property' + - 'for_each_of_allnodes' + - 'for_each_of_allnodes_from' + - 'for_each_of_pci_range' + - 'for_each_old_connector_in_state' + - 'for_each_old_crtc_in_state' + - 'for_each_oldnew_connector_in_state' + - 'for_each_oldnew_crtc_in_state' + - 'for_each_oldnew_plane_in_state' + - 'for_each_oldnew_private_obj_in_state' + - 'for_each_old_plane_in_state' + - 'for_each_old_private_obj_in_state' + - 
'for_each_online_cpu' + - 'for_each_online_node' + - 'for_each_online_pgdat' + - 'for_each_pci_bridge' + - 'for_each_pci_dev' + - 'for_each_pci_msi_entry' + - 'for_each_populated_zone' + - 'for_each_possible_cpu' + - 'for_each_present_cpu' + - 'for_each_prime_number' + - 'for_each_prime_number_from' + - 'for_each_process' + - 'for_each_process_thread' + - 'for_each_property_of_node' + - 'for_each_reserved_mem_region' + - 'for_each_resv_unavail_range' + - 'for_each_rtdcom' + - 'for_each_rtdcom_safe' + - 'for_each_set_bit' + - 'for_each_set_bit_from' + - 'for_each_sg' + - 'for_each_sg_page' + - '__for_each_thread' + - 'for_each_thread' + - 'for_each_zone' + - 'for_each_zone_zonelist' + - 'for_each_zone_zonelist_nodemask' + - 'fwnode_for_each_available_child_node' + - 'fwnode_for_each_child_node' + - 'fwnode_graph_for_each_endpoint' + - 'gadget_for_each_ep' + - 'hash_for_each' + - 'hash_for_each_possible' + - 'hash_for_each_possible_rcu' + - 'hash_for_each_possible_rcu_notrace' + - 'hash_for_each_possible_safe' + - 'hash_for_each_rcu' + - 'hash_for_each_safe' + - 'hctx_for_each_ctx' + - 'hlist_bl_for_each_entry' + - 'hlist_bl_for_each_entry_rcu' + - 'hlist_bl_for_each_entry_safe' + - 'hlist_for_each' + - 'hlist_for_each_entry' + - 'hlist_for_each_entry_continue' + - 'hlist_for_each_entry_continue_rcu' + - 'hlist_for_each_entry_continue_rcu_bh' + - 'hlist_for_each_entry_from' + - 'hlist_for_each_entry_from_rcu' + - 'hlist_for_each_entry_rcu' + - 'hlist_for_each_entry_rcu_bh' + - 'hlist_for_each_entry_rcu_notrace' + - 'hlist_for_each_entry_safe' + - '__hlist_for_each_rcu' + - 'hlist_for_each_safe' + - 'hlist_nulls_for_each_entry' + - 'hlist_nulls_for_each_entry_from' + - 'hlist_nulls_for_each_entry_rcu' + - 'hlist_nulls_for_each_entry_safe' + - 'ide_host_for_each_port' + - 'ide_port_for_each_dev' + - 'ide_port_for_each_present_dev' + - 'idr_for_each_entry' + - 'idr_for_each_entry_continue' + - 'idr_for_each_entry_ul' + - 'inet_bind_bucket_for_each' + - 
'inet_lhash2_for_each_icsk_rcu' + - 'iov_for_each' + - 'key_for_each' + - 'key_for_each_safe' + - 'klp_for_each_func' + - 'klp_for_each_object' + - 'kvm_for_each_memslot' + - 'kvm_for_each_vcpu' + - 'list_for_each' + - 'list_for_each_entry' + - 'list_for_each_entry_continue' + - 'list_for_each_entry_continue_rcu' + - 'list_for_each_entry_continue_reverse' + - 'list_for_each_entry_from' + - 'list_for_each_entry_from_reverse' + - 'list_for_each_entry_lockless' + - 'list_for_each_entry_rcu' + - 'list_for_each_entry_reverse' + - 'list_for_each_entry_safe' + - 'list_for_each_entry_safe_continue' + - 'list_for_each_entry_safe_from' + - 'list_for_each_entry_safe_reverse' + - 'list_for_each_prev' + - 'list_for_each_prev_safe' + - 'list_for_each_safe' + - 'llist_for_each' + - 'llist_for_each_entry' + - 'llist_for_each_entry_safe' + - 'llist_for_each_safe' + - 'media_device_for_each_entity' + - 'media_device_for_each_intf' + - 'media_device_for_each_link' + - 'media_device_for_each_pad' + - 'netdev_for_each_lower_dev' + - 'netdev_for_each_lower_private' + - 'netdev_for_each_lower_private_rcu' + - 'netdev_for_each_mc_addr' + - 'netdev_for_each_uc_addr' + - 'netdev_for_each_upper_dev_rcu' + - 'netdev_hw_addr_list_for_each' + - 'nft_rule_for_each_expr' + - 'nla_for_each_attr' + - 'nla_for_each_nested' + - 'nlmsg_for_each_attr' + - 'nlmsg_for_each_msg' + - 'nr_neigh_for_each' + - 'nr_neigh_for_each_safe' + - 'nr_node_for_each' + - 'nr_node_for_each_safe' + - 'of_for_each_phandle' + - 'of_property_for_each_string' + - 'of_property_for_each_u32' + - 'pci_bus_for_each_resource' + - 'ping_portaddr_for_each_entry' + - 'plist_for_each' + - 'plist_for_each_continue' + - 'plist_for_each_entry' + - 'plist_for_each_entry_continue' + - 'plist_for_each_entry_safe' + - 'plist_for_each_safe' + - 'pnp_for_each_card' + - 'pnp_for_each_dev' + - 'protocol_for_each_card' + - 'protocol_for_each_dev' + - 'queue_for_each_hw_ctx' + - 'radix_tree_for_each_contig' + - 'radix_tree_for_each_slot' + - 
'radix_tree_for_each_tagged' + - 'rbtree_postorder_for_each_entry_safe' + - 'resource_list_for_each_entry' + - 'resource_list_for_each_entry_safe' + - 'rhl_for_each_entry_rcu' + - 'rhl_for_each_rcu' + - 'rht_for_each' + - 'rht_for_each_continue' + - 'rht_for_each_entry' + - 'rht_for_each_entry_continue' + - 'rht_for_each_entry_rcu' + - 'rht_for_each_entry_rcu_continue' + - 'rht_for_each_entry_safe' + - 'rht_for_each_rcu' + - 'rht_for_each_rcu_continue' + - '__rq_for_each_bio' + - 'rq_for_each_segment' + - 'scsi_for_each_prot_sg' + - 'scsi_for_each_sg' + - 'sctp_for_each_hentry' + - 'sctp_skb_for_each' + - 'shdma_for_each_chan' + - '__shost_for_each_device' + - 'shost_for_each_device' + - 'sk_for_each' + - 'sk_for_each_bound' + - 'sk_for_each_entry_offset_rcu' + - 'sk_for_each_from' + - 'sk_for_each_rcu' + - 'sk_for_each_safe' + - 'sk_nulls_for_each' + - 'sk_nulls_for_each_from' + - 'sk_nulls_for_each_rcu' + - 'snd_pcm_group_for_each_entry' + - 'snd_soc_dapm_widget_for_each_path' + - 'snd_soc_dapm_widget_for_each_path_safe' + - 'snd_soc_dapm_widget_for_each_sink_path' + - 'snd_soc_dapm_widget_for_each_source_path' + - 'tb_property_for_each' + - 'udp_portaddr_for_each_entry' + - 'udp_portaddr_for_each_entry_rcu' + - 'usb_hub_for_each_child' + - 'v4l2_device_for_each_subdev' + - 'v4l2_m2m_for_each_dst_buf' + - 'v4l2_m2m_for_each_dst_buf_safe' + - 'v4l2_m2m_for_each_src_buf' + - 'v4l2_m2m_for_each_src_buf_safe' + - 'zorro_for_each_dev' + +#IncludeBlocks: Preserve # Unknown to clang-format-5.0 +IncludeCategories: + - Regex: '.*' + Priority: 1 +IncludeIsMainRegex: '(Test)?$' +IndentCaseLabels: false +#IndentPPDirectives: None # Unknown to clang-format-5.0 +IndentWidth: 8 +IndentWrappedFunctionNames: true +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: Inner +#ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0 
+ObjCBlockIndentWidth: 8 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true + +# Taken from git's rules +#PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0 +PenaltyBreakBeforeFirstCallParameter: 30 +PenaltyBreakComment: 10 +PenaltyBreakFirstLessLess: 0 +PenaltyBreakString: 10 +PenaltyExcessCharacter: 100 +PenaltyReturnTypeOnItsOwnLine: 60 + +PointerAlignment: Right +ReflowComments: false +SortIncludes: false +#SortUsingDeclarations: false # Unknown to clang-format-4.0 +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +#SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0 +#SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0 +SpaceBeforeParens: ControlStatements +#SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0 +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp03 +TabWidth: 8 +UseTab: Always +... diff --git a/.gitignore b/.gitignore index 85bcc2696442..a1dfd2acd9c3 100644 --- a/.gitignore +++ b/.gitignore @@ -81,6 +81,7 @@ modules.builtin !.gitignore !.mailmap !.cocciconfig +!.clang-format # # Generated include files diff --git a/Documentation/process/4.Coding.rst b/Documentation/process/4.Coding.rst index 26b106071364..eb4b185d168c 100644 --- a/Documentation/process/4.Coding.rst +++ b/Documentation/process/4.Coding.rst @@ -58,6 +58,14 @@ can never be transgressed. If there is a good reason to go against the style (a line which becomes far less readable if split to fit within the 80-column limit, for example), just do it. +Note that you can also use the ``clang-format`` tool to help you with +these rules, to quickly re-format parts of your code automatically, +and to review full files in order to spot coding style mistakes, +typos and possible improvements. 
It is also handy for sorting ``#includes``, +for aligning variables/macros, for reflowing text and other similar tasks. +See the file :ref:`Documentation/process/clang-format.rst ` +for more details. + Abstraction layers ****************** diff --git a/Documentation/process/clang-format.rst b/Documentation/process/clang-format.rst new file mode 100644 index 000000000000..6710c0707721 --- /dev/null +++ b/Documentation/process/clang-format.rst @@ -0,0 +1,184 @@ +.. _clangformat: + +clang-format +============ + +``clang-format`` is a tool to format C/C++/... code according to +a set of rules and heuristics. Like most tools, it is not perfect +nor covers every single case, but it is good enough to be helpful. + +``clang-format`` can be used for several purposes: + + - Quickly reformat a block of code to the kernel style. Specially useful + when moving code around and aligning/sorting. See clangformatreformat_. + + - Spot style mistakes, typos and possible improvements in files + you maintain, patches you review, diffs, etc. See clangformatreview_. + + - Help you follow the coding style rules, specially useful for those + new to kernel development or working at the same time in several + projects with different coding styles. + +Its configuration file is ``.clang-format`` in the root of the kernel tree. +The rules contained there try to approximate the most common kernel +coding style. They also try to follow :ref:`Documentation/process/coding-style.rst ` +as much as possible. Since not all the kernel follows the same style, +it is possible that you may want to tweak the defaults for a particular +subsystem or folder. To do so, you can override the defaults by writing +another ``.clang-format`` file in a subfolder. + +The tool itself has already been included in the repositories of popular +Linux distributions for a long time. Search for ``clang-format`` in +your repositories. 
Otherwise, you can either download pre-built +LLVM/clang binaries or build the source code from: + + http://releases.llvm.org/download.html + +See more information about the tool at: + + https://clang.llvm.org/docs/ClangFormat.html + + https://clang.llvm.org/docs/ClangFormatStyleOptions.html + + +.. _clangformatreview: + +Review files and patches for coding style +----------------------------------------- + +By running the tool in its inline mode, you can review full subsystems, +folders or individual files for code style mistakes, typos or improvements. + +To do so, you can run something like:: + + # Make sure your working directory is clean! + clang-format -i kernel/*.[ch] + +And then take a look at the git diff. + +Counting the lines of such a diff is also useful for improving/tweaking +the style options in the configuration file; as well as testing new +``clang-format`` features/versions. + +``clang-format`` also supports reading unified diffs, so you can review +patches and git diffs easily. See the documentation at: + + https://clang.llvm.org/docs/ClangFormat.html#script-for-patch-reformatting + +To avoid ``clang-format`` formatting some portion of a file, you can do:: + + int formatted_code; + // clang-format off + void unformatted_code ; + // clang-format on + void formatted_code_again; + +While it might be tempting to use this to keep a file always in sync with +``clang-format``, specially if you are writing new files or if you are +a maintainer, please note that people might be running different +``clang-format`` versions or not have it available at all. Therefore, +you should probably refrain yourself from using this in kernel sources; +at least until we see if ``clang-format`` becomes commonplace. + + +.. _clangformatreformat: + +Reformatting blocks of code +--------------------------- + +By using an integration with your text editor, you can reformat arbitrary +blocks (selections) of code with a single keystroke. 
This is especially +useful when moving code around, for complex code that is deeply indented, +for multi-line macros (and aligning their backslashes), etc. + +Remember that you can always tweak the changes afterwards in those cases +where the tool did not do an optimal job. But as a first approximation, +it can be very useful. + +There are integrations for many popular text editors. For some of them, +like vim, emacs, BBEdit and Visual Studio you can find support built-in. +For instructions, read the appropriate section at: + + https://clang.llvm.org/docs/ClangFormat.html + +For Atom, Eclipse, Sublime Text, Visual Studio Code, XCode and other +editors and IDEs you should be able to find ready-to-use plugins. + +For this use case, consider using a secondary ``.clang-format`` +so that you can tweak a few options. See clangformatextra_. + + +.. _clangformatmissing: + +Missing support +--------------- + +``clang-format`` is missing support for some things that are common +in kernel code. They are easy to remember, so if you use the tool +regularly, you will quickly learn to avoid/ignore those. + +In particular, some very common ones you will notice are: + + - Aligned blocks of one-line ``#defines``, e.g.:: + + #define TRACING_MAP_BITS_DEFAULT 11 + #define TRACING_MAP_BITS_MAX 17 + #define TRACING_MAP_BITS_MIN 7 + + vs.:: + + #define TRACING_MAP_BITS_DEFAULT 11 + #define TRACING_MAP_BITS_MAX 17 + #define TRACING_MAP_BITS_MIN 7 + + - Aligned designated initializers, e.g.:: + + static const struct file_operations uprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, + }; + + vs.:: + + static const struct file_operations uprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, + }; + + +.. 
_clangformatextra: + +Extra features/options +---------------------- + +Some features/style options are not enabled by default in the configuration +file in order to minimize the differences between the output and the current +code. In other words, to make the difference as small as possible, +which makes reviewing full-file style, as well as diffs and patches as easy +as possible. + +In other cases (e.g. particular subsystems/folders/files), the kernel style +might be different and enabling some of these options may approximate +better the style there. + +For instance: + + - Aligning assignments (``AlignConsecutiveAssignments``). + + - Aligning declarations (``AlignConsecutiveDeclarations``). + + - Reflowing text in comments (``ReflowComments``). + + - Sorting ``#includes`` (``SortIncludes``). + +They are typically useful for block re-formatting, rather than full-file. +You might want to create another ``.clang-format`` file and use that one +from your editor/IDE instead. diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst index d98deb62c400..4e7c0a1c427a 100644 --- a/Documentation/process/coding-style.rst +++ b/Documentation/process/coding-style.rst @@ -631,6 +631,14 @@ options ``-kr -i8`` (stands for ``K&R, 8 character indents``), or use re-formatting you may want to take a look at the man page. But remember: ``indent`` is not a fix for bad programming. +Note that you can also use the ``clang-format`` tool to help you with +these rules, to quickly re-format parts of your code automatically, +and to review full files in order to spot coding style mistakes, +typos and possible improvements. It is also handy for sorting ``#includes``, +for aligning variables/macros, for reflowing text and other similar tasks. +See the file :ref:`Documentation/process/clang-format.rst <clangformat>` +for more details. 
+ 10) Kconfig configuration files ------------------------------- From 2cfe0d3009418a132b93d78642a8059a38fe5944 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Apr 2018 16:32:44 -0700 Subject: [PATCH 085/140] task_struct: only use anon struct under randstruct plugin The original intent for always adding the anonymous struct in task_struct was to make sure we had compiler coverage. However, this caused pathological padding of 40 bytes at the start of task_struct. Instead, move the anonymous struct to being only used when struct layout randomization is enabled. Link: http://lkml.kernel.org/r/20180327213609.GA2964@beast Fixes: 29e48ce87f1e ("task_struct: Allow randomized") Signed-off-by: Kees Cook Reported-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 3 --- include/linux/compiler-gcc.h | 12 +++--------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index d3f264a5b04d..ceb96ecab96e 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -17,9 +17,6 @@ */ #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) -#define randomized_struct_fields_start struct { -#define randomized_struct_fields_end }; - /* all clang versions usable with the kernel support KASAN ABI version 5 */ #define KASAN_ABI_VERSION 5 diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index e2c7f4369eff..b4bf73f5e38f 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -242,6 +242,9 @@ #if defined(RANDSTRUCT_PLUGIN) && !defined(__CHECKER__) #define __randomize_layout __attribute__((randomize_layout)) #define __no_randomize_layout __attribute__((no_randomize_layout)) +/* This anon struct can add padding, so only enable it under randstruct. 
*/ +#define randomized_struct_fields_start struct { +#define randomized_struct_fields_end } __randomize_layout; #endif #endif /* GCC_VERSION >= 40500 */ @@ -256,15 +259,6 @@ */ #define __visible __attribute__((externally_visible)) -/* - * RANDSTRUCT_PLUGIN wants to use an anonymous struct, but it is only - * possible since GCC 4.6. To provide as much build testing coverage - * as possible, this is used for all GCC 4.6+ builds, and not just on - * RANDSTRUCT_PLUGIN builds. - */ -#define randomized_struct_fields_start struct { -#define randomized_struct_fields_end } __randomize_layout; - #endif /* GCC_VERSION >= 40600 */ From d387a31d75496e076046013f1670d17b8694213e Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 10 Apr 2018 16:32:48 -0700 Subject: [PATCH 086/140] MAINTAINERS: update email address for Alexandre Bounine Link: http://lkml.kernel.org/r/1522958149-6157-1-git-send-email-alex.bou9@gmail.com Signed-off-by: Alexandre Bounine Reviewed-by: Andrew Morton Cc: Barry Wood Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3e2c01faf53d..acba38738dc1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11754,7 +11754,7 @@ F: drivers/char/random.c RAPIDIO SUBSYSTEM M: Matt Porter -M: Alexandre Bounine +M: Alexandre Bounine S: Maintained F: drivers/rapidio/ From 5f00ae0d3ef8d36041d8d40ec71ab31b22764cba Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 10 Apr 2018 16:32:51 -0700 Subject: [PATCH 087/140] lib/Kconfig.debug: Debug Lockups and Hangs: keep SOFTLOCKUP options together Keep all of the SOFTLOCKUP kconfig symbols together (instead of injecting the HARDLOCKUP symbols in the midst of them) so that the config tools display them with their dependencies. Tested with 'make {menuconfig/nconfig/gconfig/xconfig}'. 
Link: http://lkml.kernel.org/r/6be2d9ed-4656-5b94-460d-7f051e2c7570@infradead.org Fixes: 05a4a9527931 ("kernel/watchdog: split up config options") Signed-off-by: Randy Dunlap Cc: Nicholas Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 48 +++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 51c6bf0d93c6..c40c7b734cd1 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -800,6 +800,30 @@ config SOFTLOCKUP_DETECTOR chance to run. The current stack trace is displayed upon detection and the system will stay locked up. +config BOOTPARAM_SOFTLOCKUP_PANIC + bool "Panic (Reboot) On Soft Lockups" + depends on SOFTLOCKUP_DETECTOR + help + Say Y here to enable the kernel to panic on "soft lockups", + which are bugs that cause the kernel to loop in kernel + mode for more than 20 seconds (configurable using the watchdog_thresh + sysctl), without giving other tasks a chance to run. + + The panic can be used in combination with panic_timeout, + to cause the system to reboot automatically after a + lockup has been detected. This feature is useful for + high-availability systems that have uptime guarantees and + where a lockup must be resolved ASAP. + + Say N if unsure. 
+ +config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE + int + depends on SOFTLOCKUP_DETECTOR + range 0 1 + default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC + default 1 if BOOTPARAM_SOFTLOCKUP_PANIC + config HARDLOCKUP_DETECTOR_PERF bool select SOFTLOCKUP_DETECTOR @@ -849,30 +873,6 @@ config BOOTPARAM_HARDLOCKUP_PANIC_VALUE default 0 if !BOOTPARAM_HARDLOCKUP_PANIC default 1 if BOOTPARAM_HARDLOCKUP_PANIC -config BOOTPARAM_SOFTLOCKUP_PANIC - bool "Panic (Reboot) On Soft Lockups" - depends on SOFTLOCKUP_DETECTOR - help - Say Y here to enable the kernel to panic on "soft lockups", - which are bugs that cause the kernel to loop in kernel - mode for more than 20 seconds (configurable using the watchdog_thresh - sysctl), without giving other tasks a chance to run. - - The panic can be used in combination with panic_timeout, - to cause the system to reboot automatically after a - lockup has been detected. This feature is useful for - high-availability systems that have uptime guarantees and - where a lockup must be resolved ASAP. - - Say N if unsure. - -config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE - int - depends on SOFTLOCKUP_DETECTOR - range 0 1 - default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC - default 1 if BOOTPARAM_SOFTLOCKUP_PANIC - config DETECT_HUNG_TASK bool "Detect Hung Tasks" depends on DEBUG_KERNEL From f6f66c1bf53079ce1f0789c8b482fba35b81617d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Apr 2018 16:32:54 -0700 Subject: [PATCH 088/140] lib/test_bitmap.c: do not accidentally use stack VLA This avoids an accidental stack VLA (since the compiler thinks the value of "len" can change, even when marked "const"). This just replaces it with a #define so it will DTRT. Seen with -Wvla. 
Fixed as part of the directive to remove all VLAs from the kernel: https://lkml.org/lkml/2018/3/7/621 Link: http://lkml.kernel.org/r/20180307212555.GA17927@beast Signed-off-by: Kees Cook Reviewed-by: Andrew Morton Cc: Yury Norov Cc: Andy Shevchenko Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_bitmap.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 413367cf569e..de16f7869fb1 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -296,15 +296,17 @@ static void __init test_bitmap_parselist(void) } } +#define EXP_BYTES (sizeof(exp) * 8) + static void __init test_bitmap_arr32(void) { - unsigned int nbits, next_bit, len = sizeof(exp) * 8; + unsigned int nbits, next_bit; u32 arr[sizeof(exp) / 4]; - DECLARE_BITMAP(bmap2, len); + DECLARE_BITMAP(bmap2, EXP_BYTES); memset(arr, 0xa5, sizeof(arr)); - for (nbits = 0; nbits < len; ++nbits) { + for (nbits = 0; nbits < EXP_BYTES; ++nbits) { bitmap_to_arr32(arr, exp, nbits); bitmap_from_arr32(bmap2, arr, nbits); expect_eq_bitmap(bmap2, exp, nbits); @@ -316,7 +318,7 @@ static void __init test_bitmap_arr32(void) " tail is not safely cleared: %d\n", nbits, next_bit); - if (nbits < len - 32) + if (nbits < EXP_BYTES - 32) expect_eq_uint(arr[DIV_ROUND_UP(nbits, 32)], 0xa5a5a5a5); } From 854686f4edf483db1e0d26d972bdb8fb65c8bfaa Mon Sep 17 00:00:00 2001 From: Jinbum Park Date: Tue, 10 Apr 2018 16:32:58 -0700 Subject: [PATCH 089/140] lib: add testing module for UBSAN This is a test module for UBSAN. It triggers all undefined behaviors that linux supports now, and detect them. All test-cases have passed by compiling with gcc-5.5.0. If use gcc-4.9.x, misaligned, out-of-bounds, object-size-mismatch will not be detected. Because gcc-4.9.x doesn't support them. 
Link: http://lkml.kernel.org/r/20180309102247.GA2944@pjb1027-Latitude-E5410 Signed-off-by: Jinbum Park Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.ubsan | 7 +++ lib/Makefile | 2 + lib/test_ubsan.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+) create mode 100644 lib/test_ubsan.c diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index a669c193b878..19d42ea75ec2 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -46,3 +46,10 @@ config UBSAN_NULL help This option enables detection of memory accesses via a null pointer. + +config TEST_UBSAN + tristate "Module for testing for undefined behavior detection" + depends on m && UBSAN + help + This is a test module for UBSAN. + It triggers various undefined behavior, and detect it. diff --git a/lib/Makefile b/lib/Makefile index 6200f978740d..ce20696d5a92 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -54,6 +54,8 @@ obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o CFLAGS_test_kasan.o += -fno-builtin +obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o +UBSAN_SANITIZE_test_ubsan.o := y obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_TEST_LKM) += test_module.o diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c new file mode 100644 index 000000000000..58dedff36b17 --- /dev/null +++ b/lib/test_ubsan.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +typedef void(*test_ubsan_fp)(void); + +static void test_ubsan_add_overflow(void) +{ + volatile int val = INT_MAX; + + val += 2; +} + +static void test_ubsan_sub_overflow(void) +{ + volatile int val = INT_MIN; + volatile int val2 = 2; + + val -= val2; +} + +static void test_ubsan_mul_overflow(void) +{ + volatile int val = INT_MAX / 2; + + val *= 3; +} + +static void 
test_ubsan_negate_overflow(void) +{ + volatile int val = INT_MIN; + + val = -val; +} + +static void test_ubsan_divrem_overflow(void) +{ + volatile int val = 16; + volatile int val2 = 0; + + val /= val2; +} + +static void test_ubsan_vla_bound_not_positive(void) +{ + volatile int size = -1; + char buf[size]; + + (void)buf; +} + +static void test_ubsan_shift_out_of_bounds(void) +{ + volatile int val = -1; + int val2 = 10; + + val2 <<= val; +} + +static void test_ubsan_out_of_bounds(void) +{ + volatile int i = 4, j = 5; + volatile int arr[i]; + + arr[j] = i; +} + +static void test_ubsan_load_invalid_value(void) +{ + volatile char *dst, *src; + bool val, val2, *ptr; + char c = 4; + + dst = (char *)&val; + src = &c; + *dst = *src; + + ptr = &val2; + val2 = val; +} + +static void test_ubsan_null_ptr_deref(void) +{ + volatile int *ptr = NULL; + int val; + + val = *ptr; +} + +void test_ubsan_misaligned_access(void) +{ + volatile char arr[5] __aligned(4) = {1, 2, 3, 4, 5}; + volatile int *ptr, val = 6; + + ptr = (int *)(arr + 1); + *ptr = val; +} + +static void test_ubsan_object_size_mismatch(void) +{ + /* "((aligned(8)))" helps this not to be misaligned for ptr-access. 
*/ + volatile int val __aligned(8) = 4; + volatile long long *ptr, val2; + + ptr = (long long *)&val; + val2 = *ptr; +} + +static const test_ubsan_fp test_ubsan_array[] = { + test_ubsan_add_overflow, + test_ubsan_sub_overflow, + test_ubsan_mul_overflow, + test_ubsan_negate_overflow, + test_ubsan_divrem_overflow, + test_ubsan_vla_bound_not_positive, + test_ubsan_shift_out_of_bounds, + test_ubsan_out_of_bounds, + test_ubsan_load_invalid_value, + //test_ubsan_null_ptr_deref, /* exclude it because there is a crash */ + test_ubsan_misaligned_access, + test_ubsan_object_size_mismatch, +}; + +static int __init test_ubsan_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(test_ubsan_array); i++) + test_ubsan_array[i](); + + (void)test_ubsan_null_ptr_deref; /* to avoid unused-function warning */ + return 0; +} +module_init(test_ubsan_init); + +static void __exit test_ubsan_exit(void) +{ + /* do nothing */ +} +module_exit(test_ubsan_exit); + +MODULE_AUTHOR("Jinbum Park "); +MODULE_LICENSE("GPL v2"); From 317506009216f5103e185fe626d61361a899909d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 10 Apr 2018 16:33:02 -0700 Subject: [PATCH 090/140] lib/test_ubsan.c: make test_ubsan_misaligned_access() static test_ubsan_misaligned_access() is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: lib/test_ubsan.c:91:6: warning: symbol 'test_ubsan_misaligned_access' was not declared. Should it be static? 
Link: http://lkml.kernel.org/r/20180313103048.28513-1-colin.king@canonical.com Signed-off-by: Colin Ian King Cc: Jinbum Park Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_ubsan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c index 58dedff36b17..280f4979d00e 100644 --- a/lib/test_ubsan.c +++ b/lib/test_ubsan.c @@ -88,7 +88,7 @@ static void test_ubsan_null_ptr_deref(void) val = *ptr; } -void test_ubsan_misaligned_access(void) +static void test_ubsan_misaligned_access(void) { volatile char arr[5] __aligned(4) = {1, 2, 3, 4, 5}; volatile int *ptr, val = 6; From 68c1f08203f2b06b3b888229b1524cfbfe51660d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:33:06 -0700 Subject: [PATCH 091/140] lib/list_debug.c: print unmangled addresses The entire point of printing the pointers in list_debug is to see if there's any useful information in them (eg poison values, ASCII, etc); obscuring them to see if they compare equal makes them much less useful. If an attacker can force this message to be printed, we've already lost. Link: http://lkml.kernel.org/r/20180401223237.GV13332@bombadil.infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Tobin C. Harding Reviewed-by: Andrew Morton Cc: Eric Biggers Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_debug.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/list_debug.c b/lib/list_debug.c index a34db8d27667..5d5424b51b74 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -21,13 +21,13 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { if (CHECK_DATA_CORRUPTION(next->prev != prev, - "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", + "list_add corruption. next->prev should be prev (%px), but was %px. 
(next=%px).\n", prev, next->prev, next) || CHECK_DATA_CORRUPTION(prev->next != next, - "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", + "list_add corruption. prev->next should be next (%px), but was %px. (prev=%px).\n", next, prev->next, prev) || CHECK_DATA_CORRUPTION(new == prev || new == next, - "list_add double add: new=%p, prev=%p, next=%p.\n", + "list_add double add: new=%px, prev=%px, next=%px.\n", new, prev, next)) return false; @@ -43,16 +43,16 @@ bool __list_del_entry_valid(struct list_head *entry) next = entry->next; if (CHECK_DATA_CORRUPTION(next == LIST_POISON1, - "list_del corruption, %p->next is LIST_POISON1 (%p)\n", + "list_del corruption, %px->next is LIST_POISON1 (%px)\n", entry, LIST_POISON1) || CHECK_DATA_CORRUPTION(prev == LIST_POISON2, - "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", + "list_del corruption, %px->prev is LIST_POISON2 (%px)\n", entry, LIST_POISON2) || CHECK_DATA_CORRUPTION(prev->next != entry, - "list_del corruption. prev->next should be %p, but was %p\n", + "list_del corruption. prev->next should be %px, but was %px\n", entry, prev->next) || CHECK_DATA_CORRUPTION(next->prev != entry, - "list_del corruption. next->prev should be %p, but was %p\n", + "list_del corruption. next->prev should be %px, but was %px\n", entry, next->prev)) return false; From 85e12066ea09bbee5c99ff2dbde9934291533b0d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Apr 2018 16:33:09 -0700 Subject: [PATCH 092/140] checkpatch: improve parse_email signature checking Bare email addresses with non alphanumeric characters require escape quoting before being substituted in the parse_email routine. e.g. Reported-by: syzbot+bbd8e9a06452cc48059b@syzkaller.appspotmail.com Do so. 
Link: http://lkml.kernel.org/r/1518631805.3678.12.camel@perches.com Signed-off-by: Joe Perches Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 764ffd1bb1c5..b464a4c3f863 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1075,7 +1075,7 @@ sub parse_email { } elsif ($formatted_email =~ /(\S+\@\S+)(.*)$/) { $address = $1; $comment = $2 if defined $2; - $formatted_email =~ s/$address.*$//; + $formatted_email =~ s/\Q$address\E.*$//; $name = $formatted_email; $name = trim($name); $name =~ s/^\"|\"$//g; From 9f3a89926d6dfc30a4fd1bbcb92cc7b218d3786d Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 10 Apr 2018 16:33:13 -0700 Subject: [PATCH 093/140] checkpatch.pl: add SPDX license tag check Add SPDX license tag check based on the rules defined in Documentation/process/license-rules.rst. To summarize, SPDX license tags should be on the 1st line (or 2nd line in scripts) using the appropriate comment style for the file type. 
Link: http://lkml.kernel.org/r/20180202154026.15298-1-robh@kernel.org Signed-off-by: Rob Herring Signed-off-by: Joe Perches Acked-by: Greg Kroah-Hartman Acked-by: Philippe Ombredanne Cc: Andy Whitcroft Cc: Joe Perches Cc: Thomas Gleixner Cc: Igor Stoppa Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index b464a4c3f863..0f022b56f117 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2257,6 +2257,8 @@ sub process { my $camelcase_file_seeded = 0; + my $checklicenseline = 1; + sanitise_line_reset(); my $line; foreach my $rawline (@rawlines) { @@ -2448,6 +2450,7 @@ sub process { } else { $check = $check_orig; } + $checklicenseline = 1; next; } @@ -2911,6 +2914,30 @@ sub process { } } +# check for using SPDX license tag at beginning of files + if ($realline == $checklicenseline) { + if ($rawline =~ /^[ \+]\s*\#\!\s*\//) { + $checklicenseline = 2; + } elsif ($rawline =~ /^\+/) { + my $comment = ""; + if ($realfile =~ /\.(h|s|S)$/) { + $comment = '/*'; + } elsif ($realfile =~ /\.(c|dts|dtsi)$/) { + $comment = '//'; + } elsif (($checklicenseline == 2) || $realfile =~ /\.(sh|pl|py|awk|tc)$/) { + $comment = '#'; + } elsif ($realfile =~ /\.rst$/) { + $comment = '..'; + } + + if ($comment !~ /^$/ && + $rawline !~ /^\+\Q$comment\E SPDX-License-Identifier: /) { + WARN("SPDX_LICENSE_TAG", + "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr); + } + } + } + # check we are in a valid source file if not then ignore this hunk next if ($realfile !~ /\.(h|c|s|S|sh|dtsi|dts)$/); From 3d102fc0e7b02d4e16752c15aa92923405b01388 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Tue, 10 Apr 2018 16:33:17 -0700 Subject: [PATCH 094/140] checkpatch: add Crypto ON_STACK to declaration_macros Add the crypto API *_ON_STACK to $declaration_macros. 
Resolves the following false warning: WARNING: Missing a blank line after declarations + int err; + SHASH_DESC_ON_STACK(desc, ctx_p->shash_tfm); Link: http://lkml.kernel.org/r/1518941636-4484-1-git-send-email-gilad@benyossef.com Signed-off-by: Gilad Ben-Yossef Signed-off-by: Joe Perches Acked-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 0f022b56f117..43322bd8f66a 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -791,7 +791,8 @@ our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant|$String)}; our $declaration_macros = qr{(?x: (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,6}\s*\(| (?:$Storage\s+)?[HLP]?LIST_HEAD\s*\(| - (?:$Storage\s+)?${Type}\s+uninitialized_var\s*\( + (?:$Storage\s+)?${Type}\s+uninitialized_var\s*\(| + (?:SKCIPHER_REQUEST|SHASH_DESC|AHASH_REQUEST)_ON_STACK\s*\( )}; sub deparenthesize { From 2a9f9d851c602b6ef0d0a52fb1996772edf218cb Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Tue, 10 Apr 2018 16:33:20 -0700 Subject: [PATCH 095/140] checkpatch: add sub routine get_stat_real() checkpatch currently contains duplicate code. We can define a sub routine and call that instead. This reduces code duplication and line count. Add subroutine get_stat_real() Link: http://lkml.kernel.org/r/1519700648-23108-2-git-send-email-me@tobin.cc Signed-off-by: Tobin C. 
Harding Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 43322bd8f66a..a0808e46c6fe 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1645,6 +1645,17 @@ sub raw_line { return $line; } +sub get_stat_real { + my ($linenr, $lc) = @_; + + my $stat_real = raw_line($linenr, 0); + for (my $count = $linenr + 1; $count <= $lc; $count++) { + $stat_real = $stat_real . "\n" . raw_line($count, 0); + } + + return $stat_real; +} + sub cat_vet { my ($vet) = @_; my ($res, $coded); @@ -5821,17 +5832,15 @@ sub process { } } if ($bad_extension ne "") { - my $stat_real = raw_line($linenr, 0); + my $stat_real = get_stat_real($linenr, $lc); my $ext_type = "Invalid"; my $use = ""; - for (my $count = $linenr + 1; $count <= $lc; $count++) { - $stat_real = $stat_real . "\n" . raw_line($count, 0); - } if ($bad_extension =~ /p[Ff]/) { $ext_type = "Deprecated"; $use = " - use %pS instead"; $use =~ s/pS/ps/ if ($bad_extension =~ /pf/); } + WARN("VSPRINTF_POINTER_EXTENSION", "$ext_type vsprintf pointer extension '$bad_extension'$use\n" . "$here\n$stat_real\n"); } @@ -5946,10 +5955,7 @@ sub process { $stat !~ /(?:$Compare)\s*\bsscanf\s*$balanced_parens/)) { my $lc = $stat =~ tr@\n@@; $lc = $lc + $linenr; - my $stat_real = raw_line($linenr, 0); - for (my $count = $linenr + 1; $count <= $lc; $count++) { - $stat_real = $stat_real . "\n" . raw_line($count, 0); - } + my $stat_real = get_stat_real($linenr, $lc); WARN("NAKED_SSCANF", "unchecked sscanf return value\n" . "$here\n$stat_real\n"); } @@ -5960,10 +5966,7 @@ sub process { $line =~ /\bsscanf\b/) { my $lc = $stat =~ tr@\n@@; $lc = $lc + $linenr; - my $stat_real = raw_line($linenr, 0); - for (my $count = $linenr + 1; $count <= $lc; $count++) { - $stat_real = $stat_real . "\n" . 
raw_line($count, 0); - } + my $stat_real = get_stat_real($linenr, $lc); if ($stat_real =~ /\bsscanf\b\s*\(\s*$FuncArg\s*,\s*("[^"]+")/) { my $format = $6; my $count = $format =~ tr@%@%@; @@ -6397,10 +6400,7 @@ sub process { my $lc = $stat =~ tr@\n@@; $lc = $lc + $linenr; - my $stat_real = raw_line($linenr, 0); - for (my $count = $linenr + 1; $count <= $lc; $count++) { - $stat_real = $stat_real . "\n" . raw_line($count, 0); - } + my $stat_real = get_stat_real($linenr, $lc); my $skip_args = ""; if ($arg_pos > 1) { From c2066ca350b32f1d3d69743c59099c6f91f7a559 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Tue, 10 Apr 2018 16:33:24 -0700 Subject: [PATCH 096/140] checkpatch: remove unused variable declarations Variables are declared and not used, we should remove them. Link: http://lkml.kernel.org/r/1519700648-23108-3-git-send-email-me@tobin.cc Signed-off-by: Tobin C. Harding Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index a0808e46c6fe..ea6d0f3fc057 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6096,7 +6096,6 @@ sub process { } if ($r1 !~ /^sizeof\b/ && $r2 =~ /^sizeof\s*\S/ && !($r1 =~ /^$Constant$/ || $r1 =~ /^[A-Z_][A-Z0-9_]*$/)) { - my $ctx = ''; my $herectx = $here . "\n"; my $cnt = statement_rawlines($stat); for (my $n = 0; $n < $cnt; $n++) { @@ -6184,7 +6183,6 @@ sub process { if ($^V && $^V ge 5.10.0 && defined $stat && $stat =~ /^\+[$;\s]*(?:case[$;\s]+\w+[$;\s]*:[$;\s]*|)*[$;\s]*\bdefault[$;\s]*:[$;\s]*;/g) { - my $ctx = ''; my $herectx = $here . "\n"; my $cnt = statement_rawlines($stat); for (my $n = 0; $n < $cnt; $n++) { From e3d95a2a05afd6b51ade4686291b0ed8581dd5e5 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Tue, 10 Apr 2018 16:33:27 -0700 Subject: [PATCH 097/140] checkpatch: add sub routine get_stat_here() checkpatch currently contains duplicate code. 
We can define a sub routine and call that instead. This reduces code duplication and line count. Add subroutine get_stat_here(). Link: http://lkml.kernel.org/r/1519700648-23108-4-git-send-email-me@tobin.cc Signed-off-by: Tobin C. Harding Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 52 +++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index ea6d0f3fc057..8be5297d18b6 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1656,6 +1656,17 @@ sub get_stat_real { return $stat_real; } +sub get_stat_here { + my ($linenr, $cnt, $here) = @_; + + my $herectx = $here . "\n"; + for (my $n = 0; $n < $cnt; $n++) { + $herectx .= raw_line($linenr, $n) . "\n"; + } + + return $herectx; +} + sub cat_vet { my ($vet) = @_; my ($res, $coded); @@ -4967,12 +4978,8 @@ sub process { #print "REST<$rest> dstat<$dstat> ctx<$ctx>\n"; $ctx =~ s/\n*$//; - my $herectx = $here . "\n"; my $stmt_cnt = statement_rawlines($ctx); - - for (my $n = 0; $n < $stmt_cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } + my $herectx = get_stat_here($linenr, $stmt_cnt, $here); if ($dstat ne '' && $dstat !~ /^(?:$Ident|-?$Constant),$/ && # 10, // foo(), @@ -5044,12 +5051,9 @@ sub process { # check for macros with flow control, but without ## concatenation # ## concatenation is commonly a macro that defines a function so ignore those if ($has_flow_statement && !$has_arg_concat) { - my $herectx = $here . "\n"; my $cnt = statement_rawlines($ctx); + my $herectx = get_stat_here($linenr, $cnt, $here); - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } WARN("MACRO_WITH_FLOW_CONTROL", "Macros with flow control statements should be avoided\n" . "$herectx"); } @@ -5089,11 +5093,7 @@ sub process { $ctx =~ s/\n*$//; my $cnt = statement_rawlines($ctx); - my $herectx = $here . 
"\n"; - - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } + my $herectx = get_stat_here($linenr, $cnt, $here); if (($stmts =~ tr/;/;/) == 1 && $stmts !~ /^\s*(if|while|for|switch)\b/) { @@ -5107,11 +5107,7 @@ sub process { } elsif ($dstat =~ /^\+\s*#\s*define\s+$Ident.*;\s*$/) { $ctx =~ s/\n*$//; my $cnt = statement_rawlines($ctx); - my $herectx = $here . "\n"; - - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } + my $herectx = get_stat_here($linenr, $cnt, $here); WARN("TRAILING_SEMICOLON", "macros should not use a trailing semicolon\n" . "$herectx"); @@ -5234,12 +5230,8 @@ sub process { } } if ($level == 0 && $block =~ /^\s*\{/ && !$allowed) { - my $herectx = $here . "\n"; my $cnt = statement_rawlines($block); - - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } + my $herectx = get_stat_here($linenr, $cnt, $here); WARN("BRACES", "braces {} are not necessary for single statement blocks\n" . $herectx); @@ -6096,11 +6088,9 @@ sub process { } if ($r1 !~ /^sizeof\b/ && $r2 =~ /^sizeof\s*\S/ && !($r1 =~ /^$Constant$/ || $r1 =~ /^[A-Z_][A-Z0-9_]*$/)) { - my $herectx = $here . "\n"; my $cnt = statement_rawlines($stat); - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } + my $herectx = get_stat_here($linenr, $cnt, $here); + if (WARN("ALLOC_WITH_MULTIPLY", "Prefer $newfunc over $oldfunc with multiply\n" . $herectx) && $cnt == 1 && @@ -6183,11 +6173,9 @@ sub process { if ($^V && $^V ge 5.10.0 && defined $stat && $stat =~ /^\+[$;\s]*(?:case[$;\s]+\w+[$;\s]*:[$;\s]*|)*[$;\s]*\bdefault[$;\s]*:[$;\s]*;/g) { - my $herectx = $here . "\n"; my $cnt = statement_rawlines($stat); - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } + my $herectx = get_stat_here($linenr, $cnt, $here); + WARN("DEFAULT_NO_BREAK", "switch default: should use break\n" . 
$herectx); } From e3c6bc95668b9b9fc1e74f221551dfa622ea9061 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Tue, 10 Apr 2018 16:33:31 -0700 Subject: [PATCH 098/140] checkpatch: warn for use of %px Usage of the new %px specifier potentially leaks sensitive information. Printing kernel addresses exposes the kernel layout in memory, this is potentially exploitable. We have tools in the kernel to help us do the right thing. We can have checkpatch warn developers of potential dangers of using %px. Have checkpatch emit a warning for usage of specifier %px. Link: http://lkml.kernel.org/r/1519700648-23108-5-git-send-email-me@tobin.cc Signed-off-by: Tobin C. Harding Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 50 ++++++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8be5297d18b6..788c90c1ae2a 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5812,29 +5812,45 @@ sub process { defined $stat && $stat =~ /^\+(?![^\{]*\{\s*).*\b(\w+)\s*\(.*$String\s*,/s && $1 !~ /^_*volatile_*$/) { - my $bad_extension = ""; + my $specifier; + my $extension; + my $bad_specifier = ""; + my $stat_real; + my $lc = $stat =~ tr@\n@@; $lc = $lc + $linenr; for (my $count = $linenr; $count <= $lc; $count++) { my $fmt = get_quoted_string($lines[$count - 1], raw_line($count, 0)); $fmt =~ s/%%//g; - if ($fmt =~ /(\%[\*\d\.]*p(?![\WSsBKRraEhMmIiUDdgVCbGNOx]).)/) { - $bad_extension = $1; - last; - } - } - if ($bad_extension ne "") { - my $stat_real = get_stat_real($linenr, $lc); - my $ext_type = "Invalid"; - my $use = ""; - if ($bad_extension =~ /p[Ff]/) { - $ext_type = "Deprecated"; - $use = " - use %pS instead"; - $use =~ s/pS/ps/ if ($bad_extension =~ /pf/); - } - WARN("VSPRINTF_POINTER_EXTENSION", - "$ext_type vsprintf pointer extension '$bad_extension'$use\n" . 
"$here\n$stat_real\n"); + while ($fmt =~ /(\%[\*\d\.]*p(\w))/g) { + $specifier = $1; + $extension = $2; + if ($extension !~ /[SsBKRraEhMmIiUDdgVCbGNOx]/) { + $bad_specifier = $specifier; + last; + } + if ($extension eq "x" && !defined($stat_real)) { + if (!defined($stat_real)) { + $stat_real = get_stat_real($linenr, $lc); + } + WARN("VSPRINTF_SPECIFIER_PX", + "Using vsprintf specifier '\%px' potentially exposes the kernel memory layout, if you don't really need the address please consider using '\%p'.\n" . "$here\n$stat_real\n"); + } + } + if ($bad_specifier ne "") { + my $stat_real = get_stat_real($linenr, $lc); + my $ext_type = "Invalid"; + my $use = ""; + if ($bad_specifier =~ /p[Ff]/) { + $ext_type = "Deprecated"; + $use = " - use %pS instead"; + $use =~ s/pS/ps/ if ($bad_specifier =~ /pf/); + } + + WARN("VSPRINTF_POINTER_EXTENSION", + "$ext_type vsprintf pointer extension '$bad_specifier'$use\n" . "$here\n$stat_real\n"); + } } } From 478b17998066a6a70a7f265f4feba09c6945ab62 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Apr 2018 16:33:34 -0700 Subject: [PATCH 099/140] checkpatch: improve get_quoted_string for TRACE_EVENT macros The get_quoted_string function does not expect invalid arguments. The $stat test can return non-statements for complicated macros like TRACE_EVENT. Allow the $stat block and test for vsprintf misuses to exceed the actual block length and possibly test invalid lines by validating the arguments of get_quoted_string. Return "" if either get_quoted_string argument is undefined. 
Miscellanea: o Properly align the comment for the vsprintf extension test Link: http://lkml.kernel.org/r/9e9725342ca3dfc0f5e3e0b8ca3c482b0e5712cc.1520356392.git.joe@perches.com Signed-off-by: Joe Perches Reported-by: Chuck Lever Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 788c90c1ae2a..18bf4bf1d0fc 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1298,6 +1298,7 @@ sub sanitise_line { sub get_quoted_string { my ($line, $rawline) = @_; + return "" if (!defined($line) || !defined($rawline)); return "" if ($line !~ m/($String)/g); return substr($rawline, $-[0], $+[0] - $-[0]); } @@ -5807,7 +5808,7 @@ sub process { } } - # check for vsprintf extension %p misuses +# check for vsprintf extension %p misuses if ($^V && $^V ge 5.10.0 && defined $stat && $stat =~ /^\+(?![^\{]*\{\s*).*\b(\w+)\s*\(.*$String\s*,/s && From 8d2e11b22d79053e832d17084bc912102c6cbb62 Mon Sep 17 00:00:00 2001 From: Claudio Fontana Date: Tue, 10 Apr 2018 16:33:42 -0700 Subject: [PATCH 100/140] checkpatch: two spelling fixes completly -> completely wacking -> whacking Link: http://lkml.kernel.org/r/1520405394-5586-1-git-send-email-claudio.fontana@gliwa.com Signed-off-by: Claudio Fontana Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 18bf4bf1d0fc..5fe361e1ed5e 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1218,7 +1218,7 @@ sub sanitise_line { for ($off = 1; $off < length($line); $off++) { $c = substr($line, $off, 1); - # Comments we are wacking completly including the begin + # Comments we are whacking completely including the begin # and end, all to $;. 
if ($sanitise_quote eq '' && substr($line, $off, 2) eq '/*') { $sanitise_quote = '*/'; From bc22d9a7d3aa76bc090d844e6aad18db9cc69237 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Apr 2018 16:33:53 -0700 Subject: [PATCH 101/140] checkpatch: test SYMBOLIC_PERMS multiple times per line There are occasions where symbolic perms are used in a ternary like return (channel == 0) ? S_IRUGO | S_IWUSR : S_IRUGO; The current test will find the first use "S_IRUGO | S_IWUSR" but not the second use "S_IRUGO" on the same line. Improve the test to look for all instances on a line. Link: http://lkml.kernel.org/r/1522127944.12357.49.camel@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 5fe361e1ed5e..d2464002bb40 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6429,7 +6429,7 @@ sub process { } # check for uses of S_ that could be octal for readability - if ($line =~ /\b($multi_mode_perms_string_search)\b/) { + while ($line =~ m{\b($multi_mode_perms_string_search)\b}g) { my $oval = $1; my $octal = perms_to_octal($oval); if (WARN("SYMBOLIC_PERMS", From 6a487211ec720658f3e3c39eecd0b6829eafa6d4 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Apr 2018 16:34:04 -0700 Subject: [PATCH 102/140] checkpatch: add test for assignment at start of line Kernel style seems to prefer line wrapping an assignment with the assignment operator on the previous line like: identifier = expression; over identifier = expression; somewhere around a 50:1 ratio $ git grep -P "[^=]=\s*$" -- "*.[ch]" | wc -l 52008 $ git grep -P "^\s+[\*\/\+\|\%\-]?=[^=>]" | wc -l 1161 So add a --strict test for that condition. 
Link: http://lkml.kernel.org/r/1522275726.2210.12.camel@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index d2464002bb40..5deee8bd0bae 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3062,6 +3062,12 @@ sub process { } } +# check for assignments on the start of a line + if ($sline =~ /^\+\s+($Assignment)[^=]/) { + CHK("ASSIGNMENT_CONTINUATIONS", + "Assignment operator '$1' should be on the previous line\n" . $hereprev); + } + # check for && or || at the start of a line if ($rawline =~ /^\+\s*(&&|\|\|)/) { CHK("LOGICAL_CONTINUATIONS", From 38dca988bb208e66d6fdb6346f7266f3d2d1a8a4 Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Tue, 10 Apr 2018 16:34:14 -0700 Subject: [PATCH 103/140] checkpatch: allow space between colon and bracket Allow a space between a colon and subsequent opening bracket. This sequence may occur in inline assembler statements like asm( "ldr %[out], [%[in]]\n\t" : [out] "=r" (ret) : [in] "r" (addr) ); Link: http://lkml.kernel.org/r/20180403191655.23700-1-xypron.glpk@gmx.de Signed-off-by: Heinrich Schuchardt Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 5deee8bd0bae..eb534d48140e 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4089,7 +4089,7 @@ sub process { my ($where, $prefix) = ($-[1], $1); if ($prefix !~ /$Type\s+$/ && ($where != 0 || $prefix !~ /^.\s+$/) && - $prefix !~ /[{,]\s+$/) { + $prefix !~ /[{,:]\s+$/) { if (ERROR("BRACKET_SPACE", "space prohibited before open square bracket '['\n" . 
$herecurr) && $fix) { From 5d430902615bdea9d9bc205ed06ffc40d86ea66f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Apr 2018 16:34:25 -0700 Subject: [PATCH 104/140] checkpatch: whinge about bool bitfields Using bool in a bitfield isn't a good idea as the alignment behavior is arch implementation defined. Suggest using unsigned int or u<8|16|32> instead. Link: http://lkml.kernel.org/r/e22fb871b1b7f2fda4b22f3a24e0d7f092eb612c.camel@perches.com Signed-off-by: Joe Perches Suggested-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index eb534d48140e..e16d6713f236 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6251,6 +6251,12 @@ sub process { } } +# check for bool bitfields + if ($sline =~ /^.\s+bool\s*$Ident\s*:\s*\d+\s*;/) { + WARN("BOOL_BITFIELD", + "Avoid using bool as bitfield. Prefer bool bitfields as unsigned int or u<8|16|32>\n" . $herecurr); + } + # check for semaphores initialized locked if ($line =~ /^.\s*sema_init.+,\W?0\W?\)/) { WARN("CONSIDER_COMPLETION", From 1a6a05a4fa862631df738dae76b4531ee15e5a0f Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Tue, 10 Apr 2018 16:34:34 -0700 Subject: [PATCH 105/140] init/ramdisk: use pr_cont() at the end of ramdisk loading Use pr_cont() at the end of ramdisk loading. This will avoid the rotator and an extra newline appearing in the dmesg. Before: RAMDISK: Loading 2436KiB [1 disk] into ram disk... | done. After: RAMDISK: Loading 2436KiB [1 disk] into ram disk... done. 
Link: http://lkml.kernel.org/r/20180302205552.16031-1-aaro.koskinen@iki.fi Signed-off-by: Aaro Koskinen Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/do_mounts_rd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index 12c159824c7b..035a5f0ab26b 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -255,7 +255,7 @@ int __init rd_load_image(char *from) nblocks, ((nblocks-1)/devblocks)+1, nblocks>devblocks ? "s" : ""); for (i = 0, disk = 1; i < nblocks; i++) { if (i && (i % devblocks == 0)) { - printk("done disk #%d.\n", disk++); + pr_cont("done disk #%d.\n", disk++); rotate = 0; if (ksys_close(in_fd)) { printk("Error closing the disk.\n"); @@ -278,7 +278,7 @@ int __init rd_load_image(char *from) } #endif } - printk("done.\n"); + pr_cont("done.\n"); successful_load: res = 1; From ad12c3a6ef1c78d0d0dbbe48dfcd416583f515ad Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:34:37 -0700 Subject: [PATCH 106/140] autofs4: use wait_event_killable This playing with signals to allow only fatal signals appears to predate the introduction of wait_event_killable(), and I'm fairly sure that wait_event_killable is what was meant to happen here. 
[avagin@openvz.org: use wake_up() instead of wake_up_interruptible] Link: http://lkml.kernel.org/r/20180331022839.21277-1-avagin@openvz.org Link: http://lkml.kernel.org/r/20180319191609.23880-1-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Ian Kent Cc: Matthew Wilcox Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index a0c57c37fa21..be9c3dc048ab 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -19,9 +19,6 @@ */ static autofs_wqt_t autofs4_next_wait_queue = 1; -/* These are the signals we allow interrupting a pending mount */ -#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT)) - void autofs4_catatonic_mode(struct autofs_sb_info *sbi) { struct autofs_wait_queue *wq, *nwq; @@ -486,29 +483,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, * wq->name.name is NULL iff the lock is already released * or the mount has been made catatonic. 
*/ - if (wq->name.name) { - /* Block all but "shutdown" signals while waiting */ - unsigned long shutdown_sigs_mask; - unsigned long irqflags; - sigset_t oldset; - - spin_lock_irqsave(&current->sighand->siglock, irqflags); - oldset = current->blocked; - shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0]; - siginitsetinv(&current->blocked, shutdown_sigs_mask); - recalc_sigpending(); - spin_unlock_irqrestore(&current->sighand->siglock, irqflags); - - wait_event_interruptible(wq->queue, wq->name.name == NULL); - - spin_lock_irqsave(&current->sighand->siglock, irqflags); - current->blocked = oldset; - recalc_sigpending(); - spin_unlock_irqrestore(&current->sighand->siglock, irqflags); - } else { - pr_debug("skipped sleeping\n"); - } - + wait_event_killable(wq->queue, wq->name.name == NULL); status = wq->status; /* @@ -574,7 +549,7 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok kfree(wq->name.name); wq->name.name = NULL; /* Do not wait on this queue */ wq->status = status; - wake_up_interruptible(&wq->queue); + wake_up(&wq->queue); if (!--wq->wait_ctr) kfree(wq); mutex_unlock(&sbi->wq_mutex); From 9ad553abe66f8be3f4755e9fa0a6ba137ce76341 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 10 Apr 2018 16:34:41 -0700 Subject: [PATCH 107/140] fs/reiserfs/journal.c: add missing reiserfs_warning() arg One use of the reiserfs_warning() macro in journal_init_dev() is missing a parameter, causing the following warning: REISERFS warning (device loop0): journal_init_dev: Cannot open '%s': %i journal_init_dev: This also causes a WARN_ONCE() warning in the vsprintf code, and then a panic if panic_on_warn is set. Please remove unsupported %/ in format string WARNING: CPU: 1 PID: 4480 at lib/vsprintf.c:2138 format_decode+0x77f/0x830 lib/vsprintf.c:2138 Kernel panic - not syncing: panic_on_warn set ... Just add another string argument to the macro invocation.
Addresses https://syzkaller.appspot.com/bug?id=0627d4551fdc39bf1ef5d82cd9eef587047f7718 Link: http://lkml.kernel.org/r/d678ebe1-6f54-8090-df4c-b9affad62293@infradead.org Signed-off-by: Randy Dunlap Reported-by: Tested-by: Randy Dunlap Acked-by: Jeff Mahoney Cc: Alexander Viro Cc: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 70057359fbaf..23148c3ed675 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2643,7 +2643,7 @@ static int journal_init_dev(struct super_block *super, if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; - reiserfs_warning(super, + reiserfs_warning(super, "sh-457", "journal_init_dev: Cannot open '%s': %i", jdev_name, result); return result; From 0965232035cfa59a64d197cf8a8ee0bc407bb3e4 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:34:45 -0700 Subject: [PATCH 108/140] seq_file: allocate seq_file from kmem_cache For fine-grained debugging and usercopy protection. Link: http://lkml.kernel.org/r/20180310085027.GA17121@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: Al Viro Cc: Glauber Costa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 12 ++++++++++-- include/linux/seq_file.h | 1 + init/main.c | 1 + 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/seq_file.c b/fs/seq_file.c index 0677e89f3c6f..3cb340583074 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -6,6 +6,7 @@ * initial implementation -- AV, Oct 2001. 
*/ +#include #include #include #include @@ -19,6 +20,8 @@ #include #include +static struct kmem_cache *seq_file_cache __ro_after_init; + static void seq_set_overflow(struct seq_file *m) { m->count = m->size; @@ -51,7 +54,7 @@ int seq_open(struct file *file, const struct seq_operations *op) WARN_ON(file->private_data); - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL); if (!p) return -ENOMEM; @@ -366,7 +369,7 @@ int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; kvfree(m->buf); - kfree(m); + kmem_cache_free(seq_file_cache, m); return 0; } EXPORT_SYMBOL(seq_release); @@ -1106,3 +1109,8 @@ seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, return NULL; } EXPORT_SYMBOL(seq_hlist_next_percpu); + +void __init seq_file_init(void) +{ + seq_file_cache = KMEM_CACHE(seq_file, SLAB_PANIC); +} diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 23d6a92cea9f..a121982af0f5 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -240,4 +240,5 @@ extern struct hlist_node *seq_hlist_start_percpu(struct hlist_head __percpu *hea extern struct hlist_node *seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, int *cpu, loff_t *pos); +void seq_file_init(void); #endif diff --git a/init/main.c b/init/main.c index 50359a3162d0..b795aa341a3a 100644 --- a/init/main.c +++ b/init/main.c @@ -715,6 +715,7 @@ asmlinkage __visible void __init start_kernel(void) vfs_caches_init(); pagecache_init(); signals_init(); + seq_file_init(); proc_root_init(); nsfs_init(); cpuset_init(); From d64d01a155f84850f7dc9795f464e3df9a5ddb10 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:34:49 -0700 Subject: [PATCH 109/140] seq_file: account everything to kmemcg All it takes to open a file and read 1 byte from it. seq_file will be allocated along with any private allocations, and more importantly seq file buffer which is 1 page by default. 
Link: http://lkml.kernel.org/r/20180310085252.GB17121@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Al Viro Cc: Glauber Costa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/seq_file.c b/fs/seq_file.c index 3cb340583074..c6c27f1f9c98 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -29,7 +29,7 @@ static void seq_set_overflow(struct seq_file *m) static void *seq_buf_alloc(unsigned long size) { - return kvmalloc(size, GFP_KERNEL); + return kvmalloc(size, GFP_KERNEL_ACCOUNT); } /** @@ -566,7 +566,7 @@ static void single_stop(struct seq_file *p, void *v) int single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data) { - struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL); + struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT); int res = -ENOMEM; if (op) { @@ -628,7 +628,7 @@ void *__seq_open_private(struct file *f, const struct seq_operations *ops, void *private; struct seq_file *seq; - private = kzalloc(psize, GFP_KERNEL); + private = kzalloc(psize, GFP_KERNEL_ACCOUNT); if (private == NULL) goto out; @@ -1112,5 +1112,5 @@ EXPORT_SYMBOL(seq_hlist_next_percpu); void __init seq_file_init(void) { - seq_file_cache = KMEM_CACHE(seq_file, SLAB_PANIC); + seq_file_cache = KMEM_CACHE(seq_file, SLAB_ACCOUNT|SLAB_PANIC); } From 8f2af155b513583e8b149a384551f13e1ac5dc72 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Apr 2018 16:34:53 -0700 Subject: [PATCH 110/140] exec: pass stack rlimit into mm layout functions Patch series "exec: Pin stack limit during exec". Attempts to solve problems with the stack limit changing during exec continue to be frustrated[1][2]. In addition to the specific issues around the Stack Clash family of flaws, Andy Lutomirski pointed out[3] other places during exec where the stack limit is used and is assumed to be unchanging. 
Given the many places it gets used and the fact that it can be manipulated/raced via setrlimit() and prlimit(), I think the only way to handle this is to move away from the "current" view of the stack limit and instead attach it to the bprm, and plumb this down into the functions that need to know the stack limits. This series implements the approach. [1] 04e35f4495dd ("exec: avoid RLIMIT_STACK races with prlimit()") [2] 779f4e1c6c7c ("Revert "exec: avoid RLIMIT_STACK races with prlimit()"") [3] to security@kernel.org, "Subject: existing rlimit races?" This patch (of 3): Since it is possible that the stack rlimit can change externally during exec (either via another thread calling setrlimit() or another process calling prlimit()), provide a way to pass the rlimit down into the per-architecture mm layout functions so that the rlimit can stay in the bprm structure instead of sitting in the signal structure until exec is finalized. Link: http://lkml.kernel.org/r/1518638796-20819-2-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Cc: Michal Hocko Cc: Ben Hutchings Cc: Willy Tarreau Cc: Hugh Dickins Cc: Oleg Nesterov Cc: "Jason A. 
Donenfeld" Cc: Rik van Riel Cc: Laura Abbott Cc: Greg KH Cc: Andy Lutomirski Cc: Ben Hutchings Cc: Brad Spengler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/mmap.c | 14 +++++++------- arch/arm64/mm/mmap.c | 14 +++++++------- arch/mips/mm/mmap.c | 14 +++++++------- arch/parisc/kernel/sys_parisc.c | 16 +++++++++++----- arch/powerpc/mm/mmap.c | 28 ++++++++++++++++------------ arch/s390/mm/mmap.c | 15 ++++++++------- arch/sparc/kernel/sys_sparc_64.c | 4 ++-- arch/x86/mm/mmap.c | 18 +++++++++++------- fs/exec.c | 8 +++++++- include/linux/sched/mm.h | 6 ++++-- mm/util.c | 2 +- 11 files changed, 81 insertions(+), 58 deletions(-) diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index eb1de66517d5..f866870db749 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -21,20 +21,20 @@ #define MIN_GAP (128*1024*1024UL) #define MAX_GAP ((TASK_SIZE)/6*5) -static int mmap_is_legacy(void) +static int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; } -static unsigned long mmap_base(unsigned long rnd) +static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; @@ -180,18 +180,18 @@ unsigned long arch_mmap_rnd(void) return rnd << PAGE_SHIFT; } -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, 
rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index decccffb03ca..842c8a5fcd53 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -38,12 +38,12 @@ #define MIN_GAP (SZ_128M) #define MAX_GAP (STACK_TOP/6*5) -static int mmap_is_legacy(void) +static int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; @@ -62,9 +62,9 @@ unsigned long arch_mmap_rnd(void) return rnd << PAGE_SHIFT; } -static unsigned long mmap_base(unsigned long rnd) +static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = (STACK_RND_MASK << PAGE_SHIFT) + stack_guard_gap; /* Values close to RLIM_INFINITY can overflow. */ @@ -83,7 +83,7 @@ static unsigned long mmap_base(unsigned long rnd) * This function, called very early during the creation of a new process VM * image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -94,11 +94,11 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * Fall back to the standard layout if the personality bit is set, or * if the expected stack growth is unlimited: */ - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index 33d3251ecd37..2f616ebeb7e0 100644 --- a/arch/mips/mm/mmap.c 
+++ b/arch/mips/mm/mmap.c @@ -24,20 +24,20 @@ EXPORT_SYMBOL(shm_align_mask); #define MIN_GAP (128*1024*1024UL) #define MAX_GAP ((TASK_SIZE)/6*5) -static int mmap_is_legacy(void) +static int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; } -static unsigned long mmap_base(unsigned long rnd) +static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; @@ -158,18 +158,18 @@ unsigned long arch_mmap_rnd(void) return rnd << PAGE_SHIFT; } -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index 8c99ebbe2bac..43b308cfdf53 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -70,12 +70,18 @@ static inline unsigned long COLOR_ALIGN(unsigned long addr, * Top of mmap area (just below the process stack). */ -static unsigned long mmap_upper_limit(void) +/* + * When called from arch_get_unmapped_area(), rlim_stack will be NULL, + * indicating that "current" should be used instead of a passed-in + * value from the exec bprm as done with arch_pick_mmap_layout(). 
+ */ +static unsigned long mmap_upper_limit(struct rlimit *rlim_stack) { unsigned long stack_base; /* Limit stack size - see setup_arg_pages() in fs/exec.c */ - stack_base = rlimit_max(RLIMIT_STACK); + stack_base = rlim_stack ? rlim_stack->rlim_max + : rlimit_max(RLIMIT_STACK); if (stack_base > STACK_SIZE_MAX) stack_base = STACK_SIZE_MAX; @@ -127,7 +133,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = mm->mmap_legacy_base; - info.high_limit = mmap_upper_limit(); + info.high_limit = mmap_upper_limit(NULL); info.align_mask = last_mmap ? (PAGE_MASK & (SHM_COLOUR - 1)) : 0; info.align_offset = shared_align_offset(last_mmap, pgoff); addr = vm_unmapped_area(&info); @@ -250,10 +256,10 @@ static unsigned long mmap_legacy_base(void) * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_legacy_base = mmap_legacy_base(); - mm->mmap_base = mmap_upper_limit(); + mm->mmap_base = mmap_upper_limit(rlim_stack); if (mmap_is_legacy()) { mm->mmap_base = mm->mmap_legacy_base; diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index d503f344e476..b24ce40acd47 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -39,12 +39,12 @@ #define MIN_GAP (128*1024*1024) #define MAX_GAP (TASK_SIZE/6*5) -static inline int mmap_is_legacy(void) +static inline int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; @@ -76,9 +76,10 @@ static inline unsigned long stack_maxrandom_size(void) return (1<<30); } -static inline unsigned long mmap_base(unsigned long rnd) +static inline unsigned long mmap_base(unsigned 
long rnd, + struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size() + stack_guard_gap; /* Values close to RLIM_INFINITY can overflow. */ @@ -196,26 +197,28 @@ radix__arch_get_unmapped_area_topdown(struct file *filp, } static void radix__arch_pick_mmap_layout(struct mm_struct *mm, - unsigned long random_factor) + unsigned long random_factor, + struct rlimit *rlim_stack) { - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = radix__arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown; } } #else /* dummy */ extern void radix__arch_pick_mmap_layout(struct mm_struct *mm, - unsigned long random_factor); + unsigned long random_factor, + struct rlimit *rlim_stack); #endif /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -223,16 +226,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm) random_factor = arch_mmap_rnd(); if (radix_enabled()) - return radix__arch_pick_mmap_layout(mm, random_factor); + return radix__arch_pick_mmap_layout(mm, random_factor, + rlim_stack); /* * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: */ - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git 
a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 831bdcf407bb..0a7627cdb34e 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -37,11 +37,11 @@ static unsigned long stack_maxrandom_size(void) #define MIN_GAP (32*1024*1024) #define MAX_GAP (STACK_TOP/6*5) -static inline int mmap_is_legacy(void) +static inline int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; } @@ -56,9 +56,10 @@ static unsigned long mmap_base_legacy(unsigned long rnd) return TASK_UNMAPPED_BASE + rnd; } -static inline unsigned long mmap_base(unsigned long rnd) +static inline unsigned long mmap_base(unsigned long rnd, + struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; @@ -184,7 +185,7 @@ check_asce_limit: * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -195,11 +196,11 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: */ - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = mmap_base_legacy(random_factor); mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 348a17ecdf66..9ef8de63f28b 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -276,7 
+276,7 @@ static unsigned long mmap_rnd(void) return rnd << PAGE_SHIFT; } -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = mmap_rnd(); unsigned long gap; @@ -285,7 +285,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: */ - gap = rlimit(RLIMIT_STACK); + gap = rlim_stack->rlim_cur; if (!test_thread_flag(TIF_32BIT) || (current->personality & ADDR_COMPAT_LAYOUT) || gap == RLIM_INFINITY || diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 155ecbac9e28..48c591251600 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -90,9 +90,10 @@ unsigned long arch_mmap_rnd(void) return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits); } -static unsigned long mmap_base(unsigned long rnd, unsigned long task_size) +static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, + struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; unsigned long gap_min, gap_max; @@ -126,16 +127,17 @@ static unsigned long mmap_legacy_base(unsigned long rnd, * process VM image, sets up which VM layout function to use: */ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, - unsigned long random_factor, unsigned long task_size) + unsigned long random_factor, unsigned long task_size, + struct rlimit *rlim_stack) { *legacy_base = mmap_legacy_base(random_factor, task_size); if (mmap_is_legacy()) *base = *legacy_base; else - *base = mmap_base(random_factor, task_size); + *base = mmap_base(random_factor, task_size, rlim_stack); } -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { if (mmap_is_legacy()) 
mm->get_unmapped_area = arch_get_unmapped_area; @@ -143,7 +145,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->get_unmapped_area = arch_get_unmapped_area_topdown; arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, - arch_rnd(mmap64_rnd_bits), task_size_64bit(0)); + arch_rnd(mmap64_rnd_bits), task_size_64bit(0), + rlim_stack); #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES /* @@ -153,7 +156,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * mmap_base, the compat syscall uses mmap_compat_base. */ arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, - arch_rnd(mmap32_rnd_bits), task_size_32bit()); + arch_rnd(mmap32_rnd_bits), task_size_32bit(), + rlim_stack); #endif } diff --git a/fs/exec.c b/fs/exec.c index a919a827d181..f4469ab88c7a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1323,6 +1323,8 @@ EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { + struct rlimit rlim_stack; + /* * Once here, prepare_binrpm() will not be called any more, so * the final state of setuid/setgid/fscaps can be merged into the @@ -1345,7 +1347,11 @@ void setup_new_exec(struct linux_binprm * bprm) current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; } - arch_pick_mmap_layout(current->mm); + task_lock(current->group_leader); + rlim_stack = current->signal->rlim[RLIMIT_STACK]; + task_unlock(current->group_leader); + + arch_pick_mmap_layout(current->mm, &rlim_stack); current->sas_ss_sp = current->sas_ss_size = 0; diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 9806184bb3d5..2c570cd934af 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -104,7 +104,8 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU -extern void arch_pick_mmap_layout(struct mm_struct *mm); +extern void arch_pick_mmap_layout(struct mm_struct *mm, + struct rlimit *rlim_stack); extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, 
unsigned long, unsigned long); @@ -113,7 +114,8 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); #else -static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} +static inline void arch_pick_mmap_layout(struct mm_struct *mm, + struct rlimit *rlim_stack) {} #endif static inline bool in_vfork(struct task_struct *tsk) diff --git a/mm/util.c b/mm/util.c index 73676f0f1b43..1fc4fa7576f7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -287,7 +287,7 @@ int vma_is_stack_for_current(struct vm_area_struct *vma) } #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; From b83838313386f617d6bd8201be7f5b532059bba1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Apr 2018 16:34:57 -0700 Subject: [PATCH 111/140] exec: introduce finalize_exec() before start_thread() Provide a final callback into fs/exec.c before start_thread() takes over, to handle any last-minute changes, like the coming restoration of the stack limit. Link: http://lkml.kernel.org/r/1518638796-20819-3-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Ben Hutchings Cc: Ben Hutchings Cc: Brad Spengler Cc: Greg KH Cc: Hugh Dickins Cc: "Jason A. 
Donenfeld" Cc: Laura Abbott Cc: Michal Hocko Cc: Oleg Nesterov Cc: Rik van Riel Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_aout.c | 1 + fs/binfmt_elf.c | 1 + fs/binfmt_elf_fdpic.c | 1 + fs/binfmt_flat.c | 1 + fs/exec.c | 6 ++++++ include/linux/binfmts.h | 1 + 6 files changed, 11 insertions(+) diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index ce1824f47ba6..c3deb2e35f20 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -330,6 +330,7 @@ beyond_if: #ifdef __alpha__ regs->gp = ex.a_gpvalue; #endif + finalize_exec(bprm); start_thread(regs, ex.a_entry, current->mm->start_stack); return 0; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index bdb201230bae..3edca6cb9a33 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1155,6 +1155,7 @@ static int load_elf_binary(struct linux_binprm *bprm) ELF_PLAT_INIT(regs, reloc_func_desc); #endif + finalize_exec(bprm); start_thread(regs, elf_entry, bprm->p); retval = 0; out: diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 429326b6e2e7..d90993adeffa 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -463,6 +463,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) dynaddr); #endif + finalize_exec(bprm); /* everything is now ready... 
get the userspace context ready to roll */ entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; start_thread(regs, entryaddr, current->mm->start_stack); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 5d6b94475f27..82a48e830018 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -994,6 +994,7 @@ static int load_flat_binary(struct linux_binprm *bprm) FLAT_PLAT_INIT(regs); #endif + finalize_exec(bprm); pr_debug("start_thread(regs=0x%p, entry=0x%lx, start_stack=0x%lx)\n", regs, start_addr, current->mm->start_stack); start_thread(regs, start_addr, current->mm->start_stack); diff --git a/fs/exec.c b/fs/exec.c index f4469ab88c7a..422ad79a7a03 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1384,6 +1384,12 @@ void setup_new_exec(struct linux_binprm * bprm) } EXPORT_SYMBOL(setup_new_exec); +/* Runs immediately before start_thread() takes over. */ +void finalize_exec(struct linux_binprm *bprm) +{ +} +EXPORT_SYMBOL(finalize_exec); + /* * Prepare credentials and lock ->cred_guard_mutex. * install_exec_creds() commits the new creds and drops the lock. 
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index b0abe21d6cc9..40e52afbb2b0 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -118,6 +118,7 @@ extern int __must_check remove_arg_zero(struct linux_binprm *); extern int search_binary_handler(struct linux_binprm *); extern int flush_old_exec(struct linux_binprm * bprm); extern void setup_new_exec(struct linux_binprm * bprm); +extern void finalize_exec(struct linux_binprm *bprm); extern void would_dump(struct linux_binprm *, struct file *); extern int suid_dumpable; From c31dbb146dd44af44bc60780ce8fa7a9f5f746df Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Apr 2018 16:35:01 -0700 Subject: [PATCH 112/140] exec: pin stack limit during exec Since the stack rlimit is used in multiple places during exec and it can be changed via other threads (via setrlimit()) or processes (via prlimit()), the assumption that the value doesn't change cannot be made. This leads to races with mm layout selection and argument size calculations. This changes the exec path to use the rlimit stored in bprm instead of in current. Before starting the thread, the bprm stack rlimit is stored back to current. Link: http://lkml.kernel.org/r/1518638796-20819-4-git-send-email-keescook@chromium.org Fixes: 64701dee4178e ("exec: Use sane stack rlimit under secureexec") Signed-off-by: Kees Cook Reported-by: Ben Hutchings Reported-by: Andy Lutomirski Reported-by: Brad Spengler Acked-by: Michal Hocko Cc: Ben Hutchings Cc: Greg KH Cc: Hugh Dickins Cc: "Jason A. 
Donenfeld" Cc: Laura Abbott Cc: Oleg Nesterov Cc: Rik van Riel Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 27 +++++++++++++++------------ include/linux/binfmts.h | 2 ++ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 422ad79a7a03..183059c427b9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -257,7 +257,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * to work from. */ limit = _STK_LIM / 4 * 3; - limit = min(limit, rlimit(RLIMIT_STACK) / 4); + limit = min(limit, bprm->rlim_stack.rlim_cur / 4); if (size > limit) goto fail; } @@ -411,6 +411,11 @@ static int bprm_mm_init(struct linux_binprm *bprm) if (!mm) goto err; + /* Save current stack limit for all calculations made during exec. */ + task_lock(current->group_leader); + bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; + task_unlock(current->group_leader); + err = __bprm_mm_init(bprm); if (err) goto err; @@ -697,7 +702,7 @@ int setup_arg_pages(struct linux_binprm *bprm, #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ - stack_base = rlimit_max(RLIMIT_STACK); + stack_base = bprm->rlim_stack.rlim_max; if (stack_base > STACK_SIZE_MAX) stack_base = STACK_SIZE_MAX; @@ -770,7 +775,7 @@ int setup_arg_pages(struct linux_binprm *bprm, * Align this down to a page boundary as expand_stack * will align it up. 
*/ - rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK; + rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; #ifdef CONFIG_STACK_GROWSUP if (stack_size + stack_expand > rlim_stack) stack_base = vma->vm_start + rlim_stack; @@ -1323,8 +1328,6 @@ EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { - struct rlimit rlim_stack; - /* * Once here, prepare_binrpm() will not be called any more, so * the final state of setuid/setgid/fscaps can be merged into the @@ -1343,15 +1346,11 @@ void setup_new_exec(struct linux_binprm * bprm) * RLIMIT_STACK, but after the point of no return to avoid * needing to clean up the change on failure. */ - if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM) - current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; + if (bprm->rlim_stack.rlim_cur > _STK_LIM) + bprm->rlim_stack.rlim_cur = _STK_LIM; } - task_lock(current->group_leader); - rlim_stack = current->signal->rlim[RLIMIT_STACK]; - task_unlock(current->group_leader); - - arch_pick_mmap_layout(current->mm, &rlim_stack); + arch_pick_mmap_layout(current->mm, &bprm->rlim_stack); current->sas_ss_sp = current->sas_ss_size = 0; @@ -1387,6 +1386,10 @@ EXPORT_SYMBOL(setup_new_exec); /* Runs immediately before start_thread() takes over. */ void finalize_exec(struct linux_binprm *bprm) { + /* Store any stack rlimit changes before starting thread. */ + task_lock(current->group_leader); + current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; + task_unlock(current->group_leader); } EXPORT_SYMBOL(finalize_exec); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 40e52afbb2b0..4955e0863b83 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -61,6 +61,8 @@ struct linux_binprm { unsigned interp_flags; unsigned interp_data; unsigned long loader, exec; + + struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. 
*/ } __randomize_layout; #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 From b94bb1f6104a1d66b8900d79eb0d28fd894c380e Mon Sep 17 00:00:00 2001 From: Vasyl Gomonovych Date: Tue, 10 Apr 2018 16:35:06 -0700 Subject: [PATCH 113/140] drivers/rapidio/rio-scan.c: fix typo in comment Fix typo in the words 'receiver', 'specified', 'during' Link: http://lkml.kernel.org/r/20180321211035.8904-1-gomonovych@gmail.com Signed-off-by: Vasyl Gomonovych Cc: Matt Porter Cc: Alexandre Bounine Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio-scan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index 23429bdaca84..161b927d9de1 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -76,7 +76,7 @@ static u16 rio_destid_alloc(struct rio_net *net) } /** - * rio_destid_reserve - Reserve the specivied destID + * rio_destid_reserve - Reserve the specified destID * @net: RIO network * @destid: destID to reserve * @@ -885,7 +885,7 @@ static struct rio_net *rio_scan_alloc_net(struct rio_mport *mport, * * For each enumerated device, ensure that each switch in a system * has correct routing entries. Add routes for devices that where - * unknown dirung the first enumeration pass through the switch. + * unknown during the first enumeration pass through the switch. 
*/ static void rio_update_route_tables(struct rio_net *net) { @@ -983,7 +983,7 @@ static int rio_enum_mport(struct rio_mport *mport, u32 flags) /* reserve mport destID in new net */ rio_destid_reserve(net, mport->host_deviceid); - /* Enable Input Output Port (transmitter reviever) */ + /* Enable Input Output Port (transmitter receiver) */ rio_enable_rx_tx_port(mport, 1, 0, 0, 0); /* Set component tag for host */ From bbd876adb8c7294ad86ef0542d6b528b7ab06f48 Mon Sep 17 00:00:00 2001 From: Ioan Nicu Date: Tue, 10 Apr 2018 16:35:10 -0700 Subject: [PATCH 114/140] rapidio: use a reference count for struct mport_dma_req Once the dma request is passed to the DMA engine, the DMA subsystem would hold a pointer to this structure and could call the completion callback after do_dma_request() has timed out. The current code deals with this by putting timed out SYNC requests to a pending list and freeing them later, when the mport cdev device is released. This still does not guarantee that the DMA subsystem is really done with those transfers, so in theory dma_xfer_callback/dma_req_free could be called after mport_cdev_release_dma and could potentially access already freed memory. This patch simplifies the current handling by using a kref in the mport dma request structure, so that it gets freed only when nobody uses it anymore. This also simplifies the code a bit, as FAF transfers are now handled in the same way as SYNC and ASYNC transfers. There is no need anymore for the pending list and for the dma workqueue which was used in case of FAF transfers, so we remove them both. 
Link: http://lkml.kernel.org/r/20180405203342.GA16191@nokia.com Signed-off-by: Ioan Nicu Acked-by: Alexandre Bounine Cc: Barry Wood Cc: Matt Porter Cc: Christophe JAILLET Cc: Al Viro Cc: Logan Gunthorpe Cc: Chris Wilson Cc: Tvrtko Ursulin Cc: Frank Kunz Cc: Alexander Sverdlin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/rio_mport_cdev.c | 122 ++++------------------- 1 file changed, 18 insertions(+), 104 deletions(-) diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index cfb54e01d758..9d27016c899e 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -212,7 +212,6 @@ struct mport_cdev_priv { #ifdef CONFIG_RAPIDIO_DMA_ENGINE struct dma_chan *dmach; struct list_head async_list; - struct list_head pend_list; spinlock_t req_lock; struct mutex dma_lock; struct kref dma_ref; @@ -258,8 +257,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mport_cdev_wait); static struct class *dev_class; static dev_t dev_number; -static struct workqueue_struct *dma_wq; - static void mport_release_mapping(struct kref *ref); static int rio_mport_maint_rd(struct mport_cdev_priv *priv, void __user *arg, @@ -539,6 +536,7 @@ static int maint_comptag_set(struct mport_cdev_priv *priv, void __user *arg) #ifdef CONFIG_RAPIDIO_DMA_ENGINE struct mport_dma_req { + struct kref refcount; struct list_head node; struct file *filp; struct mport_cdev_priv *priv; @@ -554,11 +552,6 @@ struct mport_dma_req { struct completion req_comp; }; -struct mport_faf_work { - struct work_struct work; - struct mport_dma_req *req; -}; - static void mport_release_def_dma(struct kref *dma_ref) { struct mport_dev *md = @@ -578,8 +571,10 @@ static void mport_release_dma(struct kref *dma_ref) complete(&priv->comp); } -static void dma_req_free(struct mport_dma_req *req) +static void dma_req_free(struct kref *ref) { + struct mport_dma_req *req = container_of(ref, struct mport_dma_req, + refcount); struct 
mport_cdev_priv *priv = req->priv; unsigned int i; @@ -611,30 +606,7 @@ static void dma_xfer_callback(void *param) req->status = dma_async_is_tx_complete(priv->dmach, req->cookie, NULL, NULL); complete(&req->req_comp); -} - -static void dma_faf_cleanup(struct work_struct *_work) -{ - struct mport_faf_work *work = container_of(_work, - struct mport_faf_work, work); - struct mport_dma_req *req = work->req; - - dma_req_free(req); - kfree(work); -} - -static void dma_faf_callback(void *param) -{ - struct mport_dma_req *req = (struct mport_dma_req *)param; - struct mport_faf_work *work; - - work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (!work) - return; - - INIT_WORK(&work->work, dma_faf_cleanup); - work->req = req; - queue_work(dma_wq, &work->work); + kref_put(&req->refcount, dma_req_free); } /* @@ -765,16 +737,14 @@ static int do_dma_request(struct mport_dma_req *req, goto err_out; } - if (sync == RIO_TRANSFER_FAF) - tx->callback = dma_faf_callback; - else - tx->callback = dma_xfer_callback; + tx->callback = dma_xfer_callback; tx->callback_param = req; req->dmach = chan; req->sync = sync; req->status = DMA_IN_PROGRESS; init_completion(&req->req_comp); + kref_get(&req->refcount); cookie = dmaengine_submit(tx); req->cookie = cookie; @@ -785,6 +755,7 @@ static int do_dma_request(struct mport_dma_req *req, if (dma_submit_error(cookie)) { rmcd_error("submit err=%d (addr:0x%llx len:0x%llx)", cookie, xfer->rio_addr, xfer->length); + kref_put(&req->refcount, dma_req_free); ret = -EIO; goto err_out; } @@ -860,6 +831,8 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, if (!req) return -ENOMEM; + kref_init(&req->refcount); + ret = get_dma_channel(priv); if (ret) { kfree(req); @@ -968,42 +941,20 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, ret = do_dma_request(req, xfer, sync, nents); if (ret >= 0) { - if (sync == RIO_TRANSFER_SYNC) - goto sync_out; - return ret; /* return ASYNC cookie */ + if (sync == RIO_TRANSFER_ASYNC) + return ret; /* return ASYNC 
cookie */ + } else { + rmcd_debug(DMA, "do_dma_request failed with err=%d", ret); } - if (ret == -ETIMEDOUT || ret == -EINTR) { - /* - * This can happen only in case of SYNC transfer. - * Do not free unfinished request structure immediately. - * Place it into pending list and deal with it later - */ - spin_lock(&priv->req_lock); - list_add_tail(&req->node, &priv->pend_list); - spin_unlock(&priv->req_lock); - return ret; - } - - - rmcd_debug(DMA, "do_dma_request failed with err=%d", ret); -sync_out: - dma_unmap_sg(chan->device->dev, req->sgt.sgl, req->sgt.nents, dir); - sg_free_table(&req->sgt); err_pg: - if (page_list) { + if (!req->page_list) { for (i = 0; i < nr_pages; i++) put_page(page_list[i]); kfree(page_list); } err_req: - if (req->map) { - mutex_lock(&md->buf_mutex); - kref_put(&req->map->ref, mport_release_mapping); - mutex_unlock(&md->buf_mutex); - } - put_dma_channel(priv); - kfree(req); + kref_put(&req->refcount, dma_req_free); return ret; } @@ -1121,7 +1072,7 @@ static int rio_mport_wait_for_async_dma(struct file *filp, void __user *arg) ret = 0; if (req->status != DMA_IN_PROGRESS && req->status != DMA_PAUSED) - dma_req_free(req); + kref_put(&req->refcount, dma_req_free); return ret; @@ -1966,7 +1917,6 @@ static int mport_cdev_open(struct inode *inode, struct file *filp) #ifdef CONFIG_RAPIDIO_DMA_ENGINE INIT_LIST_HEAD(&priv->async_list); - INIT_LIST_HEAD(&priv->pend_list); spin_lock_init(&priv->req_lock); mutex_init(&priv->dma_lock); #endif @@ -2006,8 +1956,6 @@ static void mport_cdev_release_dma(struct file *filp) md = priv->md; - flush_workqueue(dma_wq); - spin_lock(&priv->req_lock); if (!list_empty(&priv->async_list)) { rmcd_debug(EXIT, "async list not empty filp=%p %s(%d)", @@ -2023,20 +1971,7 @@ static void mport_cdev_release_dma(struct file *filp) req->filp, req->cookie, completion_done(&req->req_comp)?"yes":"no"); list_del(&req->node); - dma_req_free(req); - } - } - - if (!list_empty(&priv->pend_list)) { - rmcd_debug(EXIT, "Free pending DMA 
requests for filp=%p %s(%d)", - filp, current->comm, task_pid_nr(current)); - list_for_each_entry_safe(req, - req_next, &priv->pend_list, node) { - rmcd_debug(EXIT, "free req->filp=%p cookie=%d compl=%s", - req->filp, req->cookie, - completion_done(&req->req_comp)?"yes":"no"); - list_del(&req->node); - dma_req_free(req); + kref_put(&req->refcount, dma_req_free); } } @@ -2048,15 +1983,6 @@ static void mport_cdev_release_dma(struct file *filp) current->comm, task_pid_nr(current), wret); } - spin_lock(&priv->req_lock); - - if (!list_empty(&priv->pend_list)) { - rmcd_debug(EXIT, "ATTN: pending DMA requests, filp=%p %s(%d)", - filp, current->comm, task_pid_nr(current)); - } - - spin_unlock(&priv->req_lock); - if (priv->dmach != priv->md->dma_chan) { rmcd_debug(EXIT, "Release DMA channel for filp=%p %s(%d)", filp, current->comm, task_pid_nr(current)); @@ -2573,8 +2499,6 @@ static void mport_cdev_remove(struct mport_dev *md) cdev_device_del(&md->cdev, &md->dev); mport_cdev_kill_fasync(md); - flush_workqueue(dma_wq); - /* TODO: do we need to give clients some time to close file * descriptors? Simple wait for XX, or kref? 
*/ @@ -2691,17 +2615,8 @@ static int __init mport_init(void) goto err_cli; } - dma_wq = create_singlethread_workqueue("dma_wq"); - if (!dma_wq) { - rmcd_error("failed to create DMA work queue"); - ret = -ENOMEM; - goto err_wq; - } - return 0; -err_wq: - class_interface_unregister(&rio_mport_interface); err_cli: unregister_chrdev_region(dev_number, RIO_MAX_MPORTS); err_chr: @@ -2717,7 +2632,6 @@ static void __exit mport_exit(void) class_interface_unregister(&rio_mport_interface); class_destroy(dev_class); unregister_chrdev_region(dev_number, RIO_MAX_MPORTS); - destroy_workqueue(dma_wq); } module_init(mport_init); From 2d87b309a5d66c3ec0b4d985fe29b547282e7427 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 10 Apr 2018 16:35:14 -0700 Subject: [PATCH 115/140] kernel/sysctl.c: fix sizeof argument to match variable name Fix sizeof argument to be the same as the data variable name. Probably a copy/paste error. Mostly harmless since both variables are unsigned int. Fixes kernel bugzilla #197371: Possible access to unintended variable in "kernel/sysctl.c" line 1339 https://bugzilla.kernel.org/show_bug.cgi?id=197371 Link: http://lkml.kernel.org/r/e0d0531f-361e-ef5f-8499-32743ba907e1@infradead.org Signed-off-by: Randy Dunlap Reported-by: Petru Mihancea Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bdf7090b106d..a2854f6e0743 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1340,7 +1340,7 @@ static struct ctl_table vm_table[] = { { .procname = "dirtytime_expire_seconds", .data = &dirtytime_expire_interval, - .maxlen = sizeof(dirty_expire_interval), + .maxlen = sizeof(dirtytime_expire_interval), .mode = 0644, .proc_handler = dirtytime_interval_handler, .extra1 = &zero, From edc41b3c5489996e4c1ec820bf102660bf745c45 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 10 Apr 2018 16:35:18 -0700 Subject: [PATCH 116/140] 
kernel/params.c: downgrade warning for unsafe parameters As using an unsafe module parameter is, by its very definition, an expected user action, emitting a warning is overkill. Nothing has yet gone wrong, and we add a taint flag for any future oops should something actually go wrong. So instead of having a user controllable pr_warn, downgrade it to a pr_notice for "a normal, but significant condition". We make use of unsafe kernel parameters in igt (https://cgit.freedesktop.org/drm/igt-gpu-tools/) (we have not yet succeeded in removing all such debugging options), which generates a warning and taints the kernel. The warning is unhelpful as we then need to filter it out again as we check that every test themselves do not provoke any kernel warnings. Link: http://lkml.kernel.org/r/20180226151919.9674-1-chris@chris-wilson.co.uk Fixes: 91f9d330cc14 ("module: make it possible to have unsafe, tainting module params") Signed-off-by: Chris Wilson Acked-by: Jani Nikula Reviewed-by: Andrew Morton Cc: Rusty Russell Cc: Jean Delvare Cc: Li Zhong Cc: Petri Latvala Cc: Daniel Vetter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/params.c b/kernel/params.c index cc9108c2a1fd..ce89f757e6da 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -111,8 +111,8 @@ bool parameq(const char *a, const char *b) static void param_check_unsafe(const struct kernel_param *kp) { if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { - pr_warn("Setting dangerous option %s - tainting kernel\n", - kp->name); + pr_notice("Setting dangerous option %s - tainting kernel\n", + kp->name); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } } From c21a6970ae727839a2f300cd8dd957de0d0238c3 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 10 Apr 2018 16:35:23 -0700 Subject: [PATCH 117/140] ipc/shm: introduce shmctl(SHM_STAT_ANY) Patch series "sysvipc: introduce STAT_ANY commands", v2. 
The following patches add the discussed (see [1]) new command for shm as well as for sems and msq as they are subject to the same discrepancies for ipc object permission checks between the syscall and via procfs. These new commands are justified in that (1) we are stuck with this semantics as changing syscall and procfs can break userland; and (2) some users can benefit from performance (for large amounts of shm segments, for example) from not having to parse the procfs interface. Once merged, I will submit the necessary manpage updates.
: A successful : @@ -328,7 +341,7 @@ isn't accessible. : \fIshmid\fP is not a valid identifier, or \fIcmd\fP : is not a valid command. : Or: for a : -.B SHM_STAT : +.B SHM_STAT/SHM_STAT_ANY : operation, the index value specified in : .I shmid : referred to an array slot that is currently unused. This patch (of 3): There is a permission discrepancy when consulting shm ipc object metadata between /proc/sysvipc/shm (0444) and the SHM_STAT shmctl command. The later does permission checks for the object vs S_IRUGO. As such there can be cases where EACCESS is returned via syscall but the info is displayed anyways in the procfs files. While this might have security implications via info leaking (albeit no writing to the shm metadata), this behavior goes way back and showing all the objects regardless of the permissions was most likely an overlook - so we are stuck with it. Furthermore, modifying either the syscall or the procfs file can cause userspace programs to break (ie ipcs). Some applications require getting the procfs info (without root privileges) and can be rather slow in comparison with a syscall -- up to 500x in some reported cases. This patch introduces a new SHM_STAT_ANY command such that the shm ipc object permissions are ignored, and only audited instead. In addition, I've left the lsm security hook checks in place, as if some policy can block the call, then the user has no other choice than just parsing the procfs file. [1] https://lkml.org/lkml/2017/12/19/220 Link: http://lkml.kernel.org/r/20180215162458.10059-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Michal Hocko Cc: Michael Kerrisk Cc: Manfred Spraul Cc: Eric W. 
Biederman Cc: Kees Cook Cc: Robert Kettler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/shm.h | 5 +++-- ipc/shm.c | 23 ++++++++++++++++++----- security/selinux/hooks.c | 1 + security/smack/smack_lsm.c | 1 + 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h index 4de12a39b075..dde1344f047c 100644 --- a/include/uapi/linux/shm.h +++ b/include/uapi/linux/shm.h @@ -83,8 +83,9 @@ struct shmid_ds { #define SHM_UNLOCK 12 /* ipcs ctl commands */ -#define SHM_STAT 13 -#define SHM_INFO 14 +#define SHM_STAT 13 +#define SHM_INFO 14 +#define SHM_STAT_ANY 15 /* Obsolete, used only for backwards compatibility */ struct shminfo { diff --git a/ipc/shm.c b/ipc/shm.c index acefe44fefef..1a28b6a96449 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -947,14 +947,14 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid, memset(tbuf, 0, sizeof(*tbuf)); rcu_read_lock(); - if (cmd == SHM_STAT) { + if (cmd == SHM_STAT || cmd == SHM_STAT_ANY) { shp = shm_obtain_object(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); goto out_unlock; } id = shp->shm_perm.id; - } else { + } else { /* IPC_STAT */ shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); @@ -962,9 +962,20 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid, } } - err = -EACCES; - if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) - goto out_unlock; + /* + * Semantically SHM_STAT_ANY ought to be identical to + * that functionality provided by the /proc/sysvipc/ + * interface. As such, only audit these calls and + * do not do traditional S_IRUGO permission checks on + * the ipc object. 
+ */ + if (cmd == SHM_STAT_ANY) + audit_ipc_obj(&shp->shm_perm); + else { + err = -EACCES; + if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) + goto out_unlock; + } err = security_shm_shmctl(&shp->shm_perm, cmd); if (err) @@ -1104,6 +1115,7 @@ long ksys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) return err; } case SHM_STAT: + case SHM_STAT_ANY: case IPC_STAT: { err = shmctl_stat(ns, shmid, cmd, &sem64); if (err < 0) @@ -1282,6 +1294,7 @@ long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr) return err; } case IPC_STAT: + case SHM_STAT_ANY: case SHM_STAT: err = shmctl_stat(ns, shmid, cmd, &sem64); if (err < 0) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 1eeb70e439d7..1287013f747d 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6157,6 +6157,7 @@ static int selinux_shm_shmctl(struct kern_ipc_perm *shp, int cmd) SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case IPC_STAT: case SHM_STAT: + case SHM_STAT_ANY: perms = SHM__GETATTR | SHM__ASSOCIATE; break; case IPC_SET: diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 73549007bf9e..0daab3019023 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -3046,6 +3046,7 @@ static int smack_shm_shmctl(struct kern_ipc_perm *isp, int cmd) switch (cmd) { case IPC_STAT: case SHM_STAT: + case SHM_STAT_ANY: may = MAY_READ; break; case IPC_SET: From a280d6dc77eb6002f269d58cd47c7c7e69b617b6 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 10 Apr 2018 16:35:26 -0700 Subject: [PATCH 118/140] ipc/sem: introduce semctl(SEM_STAT_ANY) There is a permission discrepancy when consulting shm ipc object metadata between /proc/sysvipc/sem (0444) and the SEM_STAT semctl command. The later does permission checks for the object vs S_IRUGO. As such there can be cases where EACCESS is returned via syscall but the info is displayed anyways in the procfs files. 
While this might have security implications via info leaking (albeit no writing to the sma metadata), this behavior goes way back and showing all the objects regardless of the permissions was most likely an oversight - so we are stuck with it.
semid); if (IS_ERR(sma)) { err = PTR_ERR(sma); goto out_unlock; } id = sma->sem_perm.id; - } else { + } else { /* IPC_STAT */ sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { err = PTR_ERR(sma); @@ -1235,9 +1235,14 @@ static int semctl_stat(struct ipc_namespace *ns, int semid, } } - err = -EACCES; - if (ipcperms(ns, &sma->sem_perm, S_IRUGO)) - goto out_unlock; + /* see comment for SHM_STAT_ANY */ + if (cmd == SEM_STAT_ANY) + audit_ipc_obj(&sma->sem_perm); + else { + err = -EACCES; + if (ipcperms(ns, &sma->sem_perm, S_IRUGO)) + goto out_unlock; + } err = security_sem_semctl(&sma->sem_perm, cmd); if (err) @@ -1626,6 +1631,7 @@ long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg) return semctl_info(ns, semid, cmd, p); case IPC_STAT: case SEM_STAT: + case SEM_STAT_ANY: err = semctl_stat(ns, semid, cmd, &semid64); if (err < 0) return err; @@ -1732,6 +1738,7 @@ long compat_ksys_semctl(int semid, int semnum, int cmd, int arg) return semctl_info(ns, semid, cmd, p); case IPC_STAT: case SEM_STAT: + case SEM_STAT_ANY: err = semctl_stat(ns, semid, cmd, &semid64); if (err < 0) return err; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 1287013f747d..927904d0f115 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6273,6 +6273,7 @@ static int selinux_sem_semctl(struct kern_ipc_perm *sma, int cmd) break; case IPC_STAT: case SEM_STAT: + case SEM_STAT_ANY: perms = SEM__GETATTR | SEM__ASSOCIATE; break; default: diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 0daab3019023..cb36498a5076 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -3140,6 +3140,7 @@ static int smack_sem_semctl(struct kern_ipc_perm *isp, int cmd) case GETALL: case IPC_STAT: case SEM_STAT: + case SEM_STAT_ANY: may = MAY_READ; break; case SETVAL: From 23c8cec8cf679b10997a512abb1e86f0cedc42ba Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 10 Apr 2018 16:35:30 -0700 Subject: [PATCH 
119/140] ipc/msg: introduce msgctl(MSG_STAT_ANY) There is a permission discrepancy when consulting msq ipc object metadata between /proc/sysvipc/msg (0444) and the MSG_STAT shmctl command. The later does permission checks for the object vs S_IRUGO. As such there can be cases where EACCESS is returned via syscall but the info is displayed anyways in the procfs files. While this might have security implications via info leaking (albeit no writing to the msq metadata), this behavior goes way back and showing all the objects regardless of the permissions was most likely an overlook - so we are stuck with it. Furthermore, modifying either the syscall or the procfs file can cause userspace programs to break (ie ipcs). Some applications require getting the procfs info (without root privileges) and can be rather slow in comparison with a syscall -- up to 500x in some reported cases for shm. This patch introduces a new MSG_STAT_ANY command such that the msq ipc object permissions are ignored, and only audited instead. In addition, I've left the lsm security hook checks in place, as if some policy can block the call, then the user has no other choice than just parsing the procfs file. Link: http://lkml.kernel.org/r/20180215162458.10059-4-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reported-by: Robert Kettler Cc: Eric W. 
Biederman Cc: Kees Cook Cc: Manfred Spraul Cc: Michael Kerrisk Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/msg.h | 1 + ipc/msg.c | 17 ++++++++++++----- security/selinux/hooks.c | 1 + security/smack/smack_lsm.c | 1 + 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/msg.h b/include/uapi/linux/msg.h index 5d5ab81dc9be..e4a0d9a9a9e8 100644 --- a/include/uapi/linux/msg.h +++ b/include/uapi/linux/msg.h @@ -7,6 +7,7 @@ /* ipcs ctl commands */ #define MSG_STAT 11 #define MSG_INFO 12 +#define MSG_STAT_ANY 13 /* msgrcv options */ #define MSG_NOERROR 010000 /* no error if message is too big */ diff --git a/ipc/msg.c b/ipc/msg.c index 114a21189613..56fd1c73eedc 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -497,14 +497,14 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid, memset(p, 0, sizeof(*p)); rcu_read_lock(); - if (cmd == MSG_STAT) { + if (cmd == MSG_STAT || cmd == MSG_STAT_ANY) { msq = msq_obtain_object(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); goto out_unlock; } id = msq->q_perm.id; - } else { + } else { /* IPC_STAT */ msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); @@ -512,9 +512,14 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid, } } - err = -EACCES; - if (ipcperms(ns, &msq->q_perm, S_IRUGO)) - goto out_unlock; + /* see comment for SHM_STAT_ANY */ + if (cmd == MSG_STAT_ANY) + audit_ipc_obj(&msq->q_perm); + else { + err = -EACCES; + if (ipcperms(ns, &msq->q_perm, S_IRUGO)) + goto out_unlock; + } err = security_msg_queue_msgctl(&msq->q_perm, cmd); if (err) @@ -572,6 +577,7 @@ long ksys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) return err; } case MSG_STAT: /* msqid is an index rather than a msg queue id */ + case MSG_STAT_ANY: case IPC_STAT: err = msgctl_stat(ns, msqid, cmd, &msqid64); if (err < 0) @@ -690,6 +696,7 @@ long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr) } case IPC_STAT: case 
MSG_STAT: + case MSG_STAT_ANY: err = msgctl_stat(ns, msqid, cmd, &msqid64); if (err < 0) return err; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 927904d0f115..4cafe6a19167 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6006,6 +6006,7 @@ static int selinux_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd) SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case IPC_STAT: case MSG_STAT: + case MSG_STAT_ANY: perms = MSGQ__GETATTR | MSGQ__ASSOCIATE; break; case IPC_SET: diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index cb36498a5076..0b414836bebd 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -3230,6 +3230,7 @@ static int smack_msg_queue_msgctl(struct kern_ipc_perm *isp, int cmd) switch (cmd) { case IPC_STAT: case MSG_STAT: + case MSG_STAT_ANY: may = MAY_READ; break; case IPC_SET: From 64a11f3dc20b45fdc8c058296b4f6449e4b9f24c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 10 Apr 2018 16:35:35 -0700 Subject: [PATCH 120/140] fs/proc/proc_sysctl.c: fix typo in sysctl_check_table_array() Patch series "ipc: Clamp *mni to the real IPCMNI limit", v3. The sysctl parameters msgmni, shmmni and semmni have an inherent limit of IPC_MNI (32k). However, users may not be aware of that because they can write a value much higher than that without getting any error or notification. Reading the parameters back will show the newly written values which are not real. Enforcing the limit by failing sysctl parameter write, however, can break existing user applications. To address this delemma, a new flags field is introduced into the ctl_table. The value CTL_FLAGS_CLAMP_RANGE can be added to any ctl_table entries to enable a looser range clamping without returning any error. For example, .flags = CTL_FLAGS_CLAMP_RANGE, This flags value are now used for the range checking of shmmni, msgmni and semmni without breaking existing applications. 
If any out of range value is written to those sysctl parameters, the following warning will be printed instead. Kernel parameter "shmmni" was set out of range [0, 32768], clamped to 32768. Reading the values back will show 32768 instead of some fake values. This patch (of 6): Fix a typo. Link: http://lkml.kernel.org/r/1519926220-7453-2-git-send-email-longman@redhat.com Signed-off-by: Waiman Long Reviewed-by: Andrew Morton Acked-by: Luis R. Rodriguez Cc: Davidlohr Bueso Cc: Manfred Spraul Cc: Kees Cook Cc: Al Viro Cc: Matthew Wilcox Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 4654fc3c246f..8989936f2995 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1086,7 +1086,7 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) if ((table->proc_handler == proc_douintvec) || (table->proc_handler == proc_douintvec_minmax)) { if (table->maxlen != sizeof(unsigned int)) - err |= sysctl_err(path, table, "array now allowed"); + err |= sysctl_err(path, table, "array not allowed"); } return err; From 24704f36196ce79b48dd3921e782d15fd9c87959 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 10 Apr 2018 16:35:38 -0700 Subject: [PATCH 121/140] kernel/sysctl.c: add kdoc comments to do_proc_do{u}intvec_minmax_conv_param Kdoc comments are added to the do_proc_dointvec_minmax_conv_param and do_proc_douintvec_minmax_conv_param structures thare are used internally for range checking. The error codes returned by proc_dointvec_minmax() and proc_douintvec_minmax() are also documented. Link: http://lkml.kernel.org/r/1519926220-7453-3-git-send-email-longman@redhat.com Signed-off-by: Waiman Long Reviewed-by: Andrew Morton Acked-by: Luis R. 
Rodriguez Cc: Al Viro Cc: Davidlohr Bueso Cc: Kees Cook Cc: Manfred Spraul Cc: Matthew Wilcox Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a2854f6e0743..6a78cf70761d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2511,6 +2511,15 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, } #endif +/** + * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure + * @min: pointer to minimum allowable value + * @max: pointer to maximum allowable value + * + * The do_proc_dointvec_minmax_conv_param structure provides the + * minimum and maximum values for doing range checking for those sysctl + * parameters that use the proc_dointvec_minmax() handler. + */ struct do_proc_dointvec_minmax_conv_param { int *min; int *max; @@ -2554,7 +2563,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * - * Returns 0 on success. + * Returns 0 on success or -EINVAL on write when the range check fails. */ int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2567,6 +2576,15 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, do_proc_dointvec_minmax_conv, ¶m); } +/** + * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure + * @min: pointer to minimum allowable value + * @max: pointer to maximum allowable value + * + * The do_proc_douintvec_minmax_conv_param structure provides the + * minimum and maximum values for doing range checking for those sysctl + * parameters that use the proc_douintvec_minmax() handler. 
+ */ struct do_proc_douintvec_minmax_conv_param { unsigned int *min; unsigned int *max; @@ -2614,7 +2632,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, * check for UINT_MAX to avoid having to support wrap around uses from * userspace. * - * Returns 0 on success. + * Returns 0 on success or -ERANGE on write when the range check fails. */ int proc_douintvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) From a61fc2cbdf6a753b7000dd216c62285a68755147 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 10 Apr 2018 16:35:42 -0700 Subject: [PATCH 122/140] ipc/shm.c: shm_split(): remove unneeded test for NULL shm_file_data.vm_ops This was added by the recent "ipc/shm.c: add split function to shm_vm_ops", but it is not necessary. Reviewed-by: Mike Kravetz Cc: Laurent Dufour Cc: Dan Williams Cc: Michal Hocko Cc: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/shm.c b/ipc/shm.c index 1a28b6a96449..5639345dbec9 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -415,7 +415,7 @@ static int shm_split(struct vm_area_struct *vma, unsigned long addr) struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); - if (sfd->vm_ops && sfd->vm_ops->split) + if (sfd->vm_ops->split) return sfd->vm_ops->split(vma, addr); return 0; From de99626c2e89713cd29860ca26b584d1e6219da0 Mon Sep 17 00:00:00 2001 From: Valentin Vidic Date: Tue, 10 Apr 2018 16:35:46 -0700 Subject: [PATCH 123/140] include/linux/kfifo.h: fix comment Clean up unusual formatting in the note about locking. 
Link: http://lkml.kernel.org/r/20180324002630.13046-1-Valentin.Vidic@CARNet.hr Signed-off-by: Valentin Vidic Cc: Stefani Seibold Cc: Mauro Carvalho Chehab Cc: Christophe JAILLET Cc: Jiri Kosina Cc: Sean Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index e251533a5939..89fc8dc7bf38 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -41,11 +41,11 @@ */ /* - * Note about locking : There is no locking required until only * one reader - * and one writer is using the fifo and no kfifo_reset() will be * called - * kfifo_reset_out() can be safely used, until it will be only called + * Note about locking: There is no locking required until only one reader + * and one writer is using the fifo and no kfifo_reset() will be called. + * kfifo_reset_out() can be safely used, until it will be only called * in the reader thread. - * For multiple writer and one reader there is only a need to lock the writer. + * For multiple writer and one reader there is only a need to lock the writer. * And vice versa for only one writer and multiple reader there is only a need * to lock the reader. */ From 32785c0539b7e96f77a14a4f4ab225712665a5a4 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 10 Apr 2018 16:35:49 -0700 Subject: [PATCH 124/140] fs/dcache.c: add cond_resched() in shrink_dentry_list() As previously reported (https://patchwork.kernel.org/patch/8642031/) it's possible to call shrink_dentry_list with a large number of dentries (> 10000). This, in turn, could trigger the softlockup detector and possibly trigger a panic. In addition to the unmount path being vulnerable to this scenario, at SuSE we've observed similar situation happening during process exit on processes that touch a lot of dentries. Here is an excerpt from a crash dump. 
The number after the colon are the number of dentries on the list passed to shrink_dentry_list: PID 99760: 10722 PID 107530: 215 PID 108809: 24134 PID 108877: 21331 PID 141708: 16487 So we want to kill between 15k-25k dentries without yielding. And one possible call stack looks like: 4 [ffff8839ece41db0] _raw_spin_lock at ffffffff8152a5f8 5 [ffff8839ece41db0] evict at ffffffff811c3026 6 [ffff8839ece41dd0] __dentry_kill at ffffffff811bf258 7 [ffff8839ece41df0] shrink_dentry_list at ffffffff811bf593 8 [ffff8839ece41e18] shrink_dcache_parent at ffffffff811bf830 9 [ffff8839ece41e50] proc_flush_task at ffffffff8120dd61 10 [ffff8839ece41ec0] release_task at ffffffff81059ebd 11 [ffff8839ece41f08] do_exit at ffffffff8105b8ce 12 [ffff8839ece41f78] sys_exit at ffffffff8105bd53 13 [ffff8839ece41f80] system_call_fastpath at ffffffff81532909 While some of the callers of shrink_dentry_list do use cond_resched, this is not sufficient to prevent softlockups. So just move cond_resched into shrink_dentry_list from its callers. David said: I've found hundreds of occurrences of warnings that we emit when need_resched stays set for a prolonged period of time with the stack trace that is included in the change log. 
Link: http://lkml.kernel.org/r/1521718946-31521-1-git-send-email-nborisov@suse.com Signed-off-by: Nikolay Borisov Reviewed-by: Andrew Morton Acked-by: David Rientjes Cc: Alexander Viro Cc: Goldwyn Rodrigues Cc: Jeff Mahoney Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 915816e90049..86d2de63461e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1052,6 +1052,8 @@ static void shrink_dentry_list(struct list_head *list) while (!list_empty(list)) { struct dentry *dentry, *parent; + cond_resched(); + dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); @@ -1205,7 +1207,6 @@ void shrink_dcache_sb(struct super_block *sb) this_cpu_sub(nr_dentry_unused, freed); shrink_dentry_list(&dispose); - cond_resched(); } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); @@ -1487,7 +1488,6 @@ void shrink_dcache_parent(struct dentry *parent) break; shrink_dentry_list(&data.dispose); - cond_resched(); } } EXPORT_SYMBOL(shrink_dcache_parent); @@ -1614,7 +1614,6 @@ void d_invalidate(struct dentry *dentry) detach_mounts(data.mountpoint); dput(data.mountpoint); } - cond_resched(); } } EXPORT_SYMBOL(d_invalidate); From 721d8b41aba3d99a9e9abaa398ad908e94053a30 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Apr 2018 16:35:53 -0700 Subject: [PATCH 125/140] MAINTAINERS: update bouncing aacraid@adaptec.com addresses Adaptec is now part of Microsemi. Commit 2a81ffdd9da1 ("MAINTAINERS: Update email address for aacraid") updated only one of the driver maintainer addresses. Update the other two sections as the aacraid@adaptec.com address bounces. 
Link: http://lkml.kernel.org/r/1522103936.12357.27.camel@perches.com Signed-off-by: Joe Perches Cc: Dave Carroll Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index acba38738dc1..189b1bf2d7f0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4392,7 +4392,7 @@ S: Maintained F: drivers/staging/fsl-dpaa2/ethsw DPT_I2O SCSI RAID DRIVER -M: Adaptec OEM Raid Solutions +M: Adaptec OEM Raid Solutions L: linux-scsi@vger.kernel.org W: http://www.adaptec.com/ S: Maintained @@ -7345,7 +7345,7 @@ F: include/linux/ipmi* F: include/uapi/linux/ipmi* IPS SCSI RAID DRIVER -M: Adaptec OEM Raid Solutions +M: Adaptec OEM Raid Solutions L: linux-scsi@vger.kernel.org W: http://www.adaptec.com/ S: Maintained From a4ff8e8620d3f4f50ac4b41e8067b7d395056843 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Apr 2018 16:35:57 -0700 Subject: [PATCH 126/140] mm: introduce MAP_FIXED_NOREPLACE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: introduce MAP_FIXED_NOREPLACE", v2. This has started as a follow up discussion [3][4] resulting in the runtime failure caused by hardening patch [5] which removes MAP_FIXED from the elf loader because MAP_FIXED is inherently dangerous as it might silently clobber an existing underlying mapping (e.g. stack). The reason for the failure is that some architectures enforce an alignment for the given address hint without MAP_FIXED used (e.g. for shared or file backed mappings). One way around this would be excluding those archs which do alignment tricks from the hardening [6]. The patch is really trivial but it has been objected, rightfully so, that this screams for a more generic solution. We basically want a non-destructive MAP_FIXED. 
The first patch introduced MAP_FIXED_NOREPLACE which enforces the given address but unlike MAP_FIXED it fails with EEXIST if the given range conflicts with an existing one. The flag is introduced as a completely new one rather than a MAP_FIXED extension because of the backward compatibility. We really want a never-clobber semantic even on older kernels which do not recognize the flag. Unfortunately mmap sucks wrt flags evaluation because we do not EINVAL on unknown flags. On those kernels we would simply use the traditional hint based semantic so the caller can still get a different address (which sucks) but at least not silently corrupt an existing mapping. I do not see a good way around that. Except we won't export expose the new semantic to the userspace at all. It seems there are users who would like to have something like that. Jemalloc has been mentioned by Michael Ellerman [7] Florian Weimer has mentioned the following: : glibc ld.so currently maps DSOs without hints. This means that the kernel : will map right next to each other, and the offsets between them a completely : predictable. We would like to change that and supply a random address in a : window of the address space. If there is a conflict, we do not want the : kernel to pick a non-random address. Instead, we would try again with a : random address. John Hubbard has mentioned CUDA example : a) Searches /proc//maps for a "suitable" region of available : VA space. "Suitable" generally means it has to have a base address : within a certain limited range (a particular device model might : have odd limitations, for example), it has to be large enough, and : alignment has to be large enough (again, various devices may have : constraints that lead us to do this). : : This is of course subject to races with other threads in the process. : : Let's say it finds a region starting at va. : : b) Next it does: : p = mmap(va, ...) 
: : *without* setting MAP_FIXED, of course (so va is just a hint), to : attempt to safely reserve that region. If p != va, then in most cases, : this is a failure (almost certainly due to another thread getting a : mapping from that region before we did), and so this layer now has to : call munmap(), before returning a "failure: retry" to upper layers. : : IMPROVEMENT: --> if instead, we could call this: : : p = mmap(va, ... MAP_FIXED_NOREPLACE ...) : : , then we could skip the munmap() call upon failure. This : is a small thing, but it is useful here. (Thanks to Piotr : Jaroszynski and Mark Hairgrove for helping me get that detail : exactly right, btw.) : : c) After that, CUDA suballocates from p, via: : : q = mmap(sub_region_start, ... MAP_FIXED ...) : : Interestingly enough, "freeing" is also done via MAP_FIXED, and : setting PROT_NONE to the subregion. Anyway, I just included (c) for : general interest. Atomic address range probing in the multithreaded programs in general sounds like an interesting thing to me. The second patch simply replaces MAP_FIXED use in elf loader by MAP_FIXED_NOREPLACE. I believe other places which rely on MAP_FIXED should follow. Actually real MAP_FIXED usages should be documented properly and they should be more of an exception. [1] http://lkml.kernel.org/r/20171116101900.13621-1-mhocko@kernel.org [2] http://lkml.kernel.org/r/20171129144219.22867-1-mhocko@kernel.org [3] http://lkml.kernel.org/r/20171107162217.382cd754@canb.auug.org.au [4] http://lkml.kernel.org/r/1510048229.12079.7.camel@abdul.in.ibm.com [5] http://lkml.kernel.org/r/20171023082608.6167-1-mhocko@kernel.org [6] http://lkml.kernel.org/r/20171113094203.aofz2e7kueitk55y@dhcp22.suse.cz [7] http://lkml.kernel.org/r/87efp1w7vy.fsf@concordia.ellerman.id.au This patch (of 2): MAP_FIXED is used quite often to enforce mapping at the particular range.
The main problem of this flag is, however, that it is inherently dangerous because it unmaps existing mappings covered by the requested range. This can cause silent memory corruptions. Some of them even with serious security implications. While the current semantic might be really desirable in many cases there are others which would want to enforce the given range but rather see a failure than a silent memory corruption on a clashing range. Please note that there is no guarantee that a given range is obeyed by the mmap even when it is free - e.g. arch specific code is allowed to apply an alignment. Introduce a new MAP_FIXED_NOREPLACE flag for mmap to achieve this behavior. It has the same semantic as MAP_FIXED wrt. the given address request with a single exception that it fails with EEXIST if the requested address is already covered by an existing mapping. We still do rely on get_unmapped_area to handle all the arch specific MAP_FIXED treatment and check for a conflicting vma after it returns. The flag is introduced as a completely new one rather than a MAP_FIXED extension because of the backward compatibility. We really want a never-clobber semantic even on older kernels which do not recognize the flag. Unfortunately mmap sucks wrt. flags evaluation because we do not EINVAL on unknown flags. On those kernels we would simply use the traditional hint based semantic so the caller can still get a different address (which sucks) but at least not silently corrupt an existing mapping. I do not see a good way around that.
[mpe@ellerman.id.au: fix whitespace] [fail on clashing range with EEXIST as per Florian Weimer] [set MAP_FIXED before round_hint_to_min as per Khalid Aziz] Link: http://lkml.kernel.org/r/20171213092550.2774-2-mhocko@kernel.org Reviewed-by: Khalid Aziz Signed-off-by: Michal Hocko Acked-by: Michael Ellerman Cc: Khalid Aziz Cc: Russell King - ARM Linux Cc: Andrea Arcangeli Cc: Florian Weimer Cc: John Hubbard Cc: Matthew Wilcox Cc: Abdul Haleem Cc: Joel Stanley Cc: Kees Cook Cc: Michal Hocko Cc: Jason Evans Cc: David Goldblatt Cc: Edward Tomasz Napierała Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/uapi/asm/mman.h | 1 + arch/mips/include/uapi/asm/mman.h | 1 + arch/parisc/include/uapi/asm/mman.h | 1 + arch/xtensa/include/uapi/asm/mman.h | 1 + include/uapi/asm-generic/mman-common.h | 1 + mm/mmap.c | 11 +++++++++++ 6 files changed, 16 insertions(+) diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 2dbdf59258d9..f9d4e6b6d4bd 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -32,6 +32,7 @@ #define MAP_NONBLOCK 0x40000 /* do not block on IO */ #define MAP_STACK 0x80000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x100000 /* create a huge page mapping */ +#define MAP_FIXED_NOREPLACE 0x200000/* MAP_FIXED which doesn't unmap underlying mapping */ #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_SYNC 2 /* synchronous memory sync */ diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 606e02ca4b6c..3035ca499cd8 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -50,6 +50,7 @@ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ #define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x80000 /* create a huge page mapping */ +#define 
MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ /* * Flags for msync diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index a056a642bb31..870fbf8c7088 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -26,6 +26,7 @@ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ #define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x80000 /* create a huge page mapping */ +#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #define MS_SYNC 1 /* synchronous memory sync */ #define MS_ASYNC 2 /* sync memory asynchronously */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 3e9d01ada81f..58f29a9d895d 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -57,6 +57,7 @@ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ #define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x80000 /* create a huge page mapping */ +#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED # define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index f8b134f5608f..4aa65a8d7e92 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -26,6 +26,7 @@ #else # define MAP_UNINITIALIZED 0x0 /* Don't support this flag */ #endif +#define MAP_FIXED_NOREPLACE 0x80000 /* MAP_FIXED which doesn't unmap underlying mapping */ /* * Flags for mlock diff --git a/mm/mmap.c b/mm/mmap.c index f2154fc2548b..188f195883b9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1342,6 +1342,10 @@ unsigned long do_mmap(struct 
file *file, unsigned long addr, if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; + /* force arch specific MAP_FIXED handling in get_unmapped_area */ + if (flags & MAP_FIXED_NOREPLACE) + flags |= MAP_FIXED; + if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); @@ -1365,6 +1369,13 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (offset_in_page(addr)) return addr; + if (flags & MAP_FIXED_NOREPLACE) { + struct vm_area_struct *vma = find_vma(mm, addr); + + if (vma && vma->vm_start <= addr) + return -EEXIST; + } + if (prot == PROT_EXEC) { pkey = execute_only_pkey(mm); if (pkey < 0) From 4ed28639519c7bad5f518e70b3284c6e0763e650 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Apr 2018 16:36:01 -0700 Subject: [PATCH 127/140] fs, elf: drop MAP_FIXED usage from elf_map Both load_elf_interp and load_elf_binary rely on elf_map to map segments on a controlled address and they use MAP_FIXED to enforce that. This is however dangerous thing prone to silent data corruption which can be even exploitable. Let's take CVE-2017-1000253 as an example. At the time (before commit eab09532d400: "binfmt_elf: use ELF_ET_DYN_BASE only for PIE") ELF_ET_DYN_BASE was at TASK_SIZE / 3 * 2 which is not that far away from the stack top on 32b (legacy) memory layout (only 1GB away). Therefore we could end up mapping over the existing stack with some luck. The issue has been fixed since then (a87938b2e246: "fs/binfmt_elf.c: fix bug in loading of PIE binaries"), ELF_ET_DYN_BASE moved moved much further from the stack (eab09532d400 and later by c715b72c1ba4: "mm: revert x86_64 and arm64 ELF_ET_DYN_BASE base changes") and excessive stack consumption early during execve fully stopped by da029c11e6b1 ("exec: Limit arg stack to at most 75% of _STK_LIM"). So we should be safe and any attack should be impractical. On the other hand this is just too subtle assumption so it can break quite easily and hard to spot. 
I believe that the MAP_FIXED usage in load_elf_binary (et. al) is still fundamentally dangerous. Moreover it shouldn't be even needed. We are at the early process stage and so there shouldn't be unrelated mappings (except for stack and loader) existing so mmap for a given address should succeed even without MAP_FIXED. Something is terribly wrong if this is not the case and we should rather fail than silently corrupt the underlying mapping. Address this issue by changing MAP_FIXED to the newly added MAP_FIXED_NOREPLACE. This will mean that mmap will fail if there is an existing mapping clashing with the requested one without clobbering it. [mhocko@suse.com: fix build] [akpm@linux-foundation.org: coding-style fixes] [avagin@openvz.org: don't use the same value for MAP_FIXED_NOREPLACE and MAP_SYNC] Link: http://lkml.kernel.org/r/20171218184916.24445-1-avagin@openvz.org Link: http://lkml.kernel.org/r/20171213092550.2774-3-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Andrei Vagin Signed-off-by: Michal Hocko Reviewed-by: Khalid Aziz Acked-by: Michael Ellerman Acked-by: Kees Cook Cc: Abdul Haleem Cc: Joel Stanley Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 13 +++++++++---- include/uapi/asm-generic/mman-common.h | 4 +++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3edca6cb9a33..46f0438088d3 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -377,6 +377,11 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, } else map_addr = vm_mmap(filep, addr, size, prot, type, off); + if ((type & MAP_FIXED_NOREPLACE) && BAD_ADDR(map_addr)) + pr_info("%d (%s): Uhuuh, elf segment at %p requested but the memory is mapped already\n", + task_pid_nr(current), current->comm, + (void *)addr); + return(map_addr); } @@ -575,7 +580,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, elf_prot |= PROT_EXEC; vaddr = 
eppnt->p_vaddr; if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) - elf_type |= MAP_FIXED; + elf_type |= MAP_FIXED_NOREPLACE; else if (no_base && interp_elf_ex->e_type == ET_DYN) load_addr = -vaddr; @@ -939,7 +944,7 @@ static int load_elf_binary(struct linux_binprm *bprm) * the ET_DYN load_addr calculations, proceed normally. */ if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { - elf_flags |= MAP_FIXED; + elf_flags |= MAP_FIXED_NOREPLACE; } else if (loc->elf_ex.e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program @@ -975,7 +980,7 @@ static int load_elf_binary(struct linux_binprm *bprm) load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - elf_flags |= MAP_FIXED; + elf_flags |= MAP_FIXED_NOREPLACE; } else load_bias = 0; @@ -1235,7 +1240,7 @@ static int load_elf_library(struct file *file) (eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr)), PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, + MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_DENYWRITE, (eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr))); if (error != ELF_PAGESTART(eppnt->p_vaddr)) diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 4aa65a8d7e92..e7ee32861d51 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -26,7 +26,9 @@ #else # define MAP_UNINITIALIZED 0x0 /* Don't support this flag */ #endif -#define MAP_FIXED_NOREPLACE 0x80000 /* MAP_FIXED which doesn't unmap underlying mapping */ + +/* 0x0100 - 0x80000 flags are defined in asm-generic/mman.h */ +#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ /* * Flags for mlock From ad55eac74f2016c6dc132b9502f794156858a3d1 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Apr 2018 16:36:05 -0700 Subject: [PATCH 128/140] elf: enforce MAP_FIXED on overlaying elf segments Anshuman has reported that with "fs, elf: drop MAP_FIXED 
usage from elf_map" applied, some ELF binaries in his environment fail to start with [ 23.423642] 9148 (sed): Uhuuh, elf segment at 0000000010030000 requested but the memory is mapped already [ 23.423706] requested [10030000, 10040000] mapped [10030000, 10040000] 100073 anon The reason is that the above binary has overlapping elf segments: LOAD 0x0000000000000000 0x0000000010000000 0x0000000010000000 0x0000000000013a8c 0x0000000000013a8c R E 10000 LOAD 0x000000000001fd40 0x000000001002fd40 0x000000001002fd40 0x00000000000002c0 0x00000000000005e8 RW 10000 LOAD 0x0000000000020328 0x0000000010030328 0x0000000010030328 0x0000000000000384 0x00000000000094a0 RW 10000 That binary has two RW LOAD segments, the first crosses a page border into the second 0x1002fd40 (LOAD2-vaddr) + 0x5e8 (LOAD2-memlen) == 0x10030328 (LOAD3-vaddr) Handle this situation by enforcing MAP_FIXED when we establish a temporary brk VMA to handle overlapping segments. All other mappings will still use MAP_FIXED_NOREPLACE. Link: http://lkml.kernel.org/r/20180213100440.GM3443@dhcp22.suse.cz Signed-off-by: Michal Hocko Reported-by: Anshuman Khandual Reviewed-by: Khalid Aziz Cc: Andrei Vagin Cc: Michael Ellerman Cc: Kees Cook Cc: Abdul Haleem Cc: Joel Stanley Cc: Stephen Rothwell Cc: Mark Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 46f0438088d3..41e04183e4ce 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -895,7 +895,7 @@ static int load_elf_binary(struct linux_binprm *bprm) the correct location in memory. 
*/ for(i = 0, elf_ppnt = elf_phdata; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { - int elf_prot = 0, elf_flags; + int elf_prot = 0, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE; unsigned long k, vaddr; unsigned long total_size = 0; @@ -927,6 +927,13 @@ static int load_elf_binary(struct linux_binprm *bprm) */ } } + + /* + * Some binaries have overlapping elf segments and then + * we have to forcefully map over an existing mapping + * e.g. over this newly established brk mapping. + */ + elf_fixed = MAP_FIXED; } if (elf_ppnt->p_flags & PF_R) @@ -944,7 +951,7 @@ static int load_elf_binary(struct linux_binprm *bprm) * the ET_DYN load_addr calculations, proceed normally. */ if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { - elf_flags |= MAP_FIXED_NOREPLACE; + elf_flags |= elf_fixed; } else if (loc->elf_ex.e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program @@ -980,7 +987,7 @@ static int load_elf_binary(struct linux_binprm *bprm) load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - elf_flags |= MAP_FIXED_NOREPLACE; + elf_flags |= elf_fixed; } else load_bias = 0; From 6f84f8d1587f20f60592cf1b1792ca639f37d429 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 10 Apr 2018 16:36:10 -0700 Subject: [PATCH 129/140] xen, mm: allow deferred page initialization for xen pv domains Juergen Gross noticed that commit f7f99100d8d ("mm: stop zeroing memory during allocation in vmemmap") broke XEN PV domains when deferred struct page initialization is enabled. This is because the xen's PagePinned() flag is getting erased from struct pages when they are initialized later in boot. Juergen fixed this problem by disabling deferred pages on xen pv domains. It is desirable, however, to have this feature available as it reduces boot time. 
This fix re-enables the feature for pv-domains, and fixes the problem the following way: The fix is to delay setting PagePinned flag until struct pages for all allocated memory are initialized, i.e. until after free_all_bootmem(). A new x86_init.hyper op init_after_bootmem() is called to let xen know that boot allocator is done, and hence struct pages for all the allocated memory are now initialized. If deferred page initialization is enabled, the rest of struct pages are going to be initialized later in boot once page_alloc_init_late() is called. xen_after_bootmem() walks page table's pages and marks them pinned. Link: http://lkml.kernel.org/r/20180226160112.24724-2-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Acked-by: Ingo Molnar Reviewed-by: Juergen Gross Tested-by: Juergen Gross Cc: Daniel Jordan Cc: Pavel Tatashin Cc: Alok Kataria Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Boris Ostrovsky Cc: Michal Hocko Cc: Vlastimil Babka Cc: Andy Lutomirski Cc: Laura Abbott Cc: Kirill A.
Shutemov Cc: Borislav Petkov Cc: Mathias Krause Cc: Jinbum Park Cc: Dan Williams Cc: Baoquan He Cc: Jia Zhang Cc: Mel Gorman Cc: Johannes Weiner Cc: Stefano Stabellini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/x86_init.c | 1 + arch/x86/mm/init_32.c | 1 + arch/x86/mm/init_64.c | 1 + arch/x86/xen/mmu_pv.c | 38 ++++++++++++++++++++++----------- mm/page_alloc.c | 4 ---- 6 files changed, 31 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 199e15bd3ec5..ce8b4da07e35 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -122,12 +122,14 @@ struct x86_init_pci { * @guest_late_init: guest late init * @x2apic_available: X2APIC detection * @init_mem_mapping: setup early mappings during init_mem_mapping() + * @init_after_bootmem: guest init after boot allocator is finished */ struct x86_hyper_init { void (*init_platform)(void); void (*guest_late_init)(void); bool (*x2apic_available)(void); void (*init_mem_mapping)(void); + void (*init_after_bootmem)(void); }; /** diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ebda84a91510..3ab867603e81 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -92,6 +92,7 @@ struct x86_init_ops x86_init __initdata = { .guest_late_init = x86_init_noop, .x2apic_available = bool_x86_init_noop, .init_mem_mapping = x86_init_noop, + .init_after_bootmem = x86_init_noop, }, .acpi = { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 396e1f0151ac..8008db2bddb3 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -778,6 +778,7 @@ void __init mem_init(void) free_all_bootmem(); after_bootmem = 1; + x86_init.hyper.init_after_bootmem(); mem_init_print_info(NULL); printk(KERN_INFO "virtual kernel memory layout:\n" diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index dca9abf2b85c..66de40e45f58 100644 --- 
a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1185,6 +1185,7 @@ void __init mem_init(void) /* this will put all memory onto the freelists */ free_all_bootmem(); after_bootmem = 1; + x86_init.hyper.init_after_bootmem(); /* * Must be done after boot memory is put on freelist, because here we diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index d20763472920..486c0a34d00b 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -116,6 +116,8 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ static phys_addr_t xen_pt_base, xen_pt_size __initdata; +static DEFINE_STATIC_KEY_FALSE(xen_struct_pages_ready); + /* * Just beyond the highest usermode address. STACK_TOP_MAX has a * redzone above it, so round it up to a PGD boundary. @@ -155,11 +157,18 @@ void make_lowmem_page_readwrite(void *vaddr) } +/* + * During early boot all page table pages are pinned, but we do not have struct + * pages, so return true until struct pages are ready. + */ static bool xen_page_pinned(void *ptr) { - struct page *page = virt_to_page(ptr); + if (static_branch_likely(&xen_struct_pages_ready)) { + struct page *page = virt_to_page(ptr); - return PagePinned(page); + return PagePinned(page); + } + return true; } static void xen_extend_mmu_update(const struct mmu_update *update) @@ -836,11 +845,6 @@ void xen_mm_pin_all(void) spin_unlock(&pgd_lock); } -/* - * The init_mm pagetable is really pinned as soon as its created, but - * that's before we have page structures to store the bits. So do all - * the book-keeping now. - */ static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, enum pt_level level) { @@ -848,8 +852,18 @@ static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, return 0; } -static void __init xen_mark_init_mm_pinned(void) +/* + * The init_mm pagetable is really pinned as soon as its created, but + * that's before we have page structures to store the bits. 
So do all + * the book-keeping now once struct pages for allocated pages are + * initialized. This happens only after free_all_bootmem() is called. + */ +static void __init xen_after_bootmem(void) { + static_branch_enable(&xen_struct_pages_ready); +#ifdef CONFIG_X86_64 + SetPagePinned(virt_to_page(level3_user_vsyscall)); +#endif xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); } @@ -1623,14 +1637,15 @@ static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) { - bool pinned = PagePinned(virt_to_page(mm->pgd)); + bool pinned = xen_page_pinned(mm->pgd); trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); if (pinned) { struct page *page = pfn_to_page(pfn); - SetPagePinned(page); + if (static_branch_likely(&xen_struct_pages_ready)) + SetPagePinned(page); if (!PageHighMem(page)) { xen_mc_batch(); @@ -2364,9 +2379,7 @@ static void __init xen_post_allocator_init(void) #ifdef CONFIG_X86_64 pv_mmu_ops.write_cr3 = &xen_write_cr3; - SetPagePinned(virt_to_page(level3_user_vsyscall)); #endif - xen_mark_init_mm_pinned(); } static void xen_leave_lazy_mmu(void) @@ -2450,6 +2463,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { x86_init.paging.pagetable_init = xen_pagetable_init; + x86_init.hyper.init_after_bootmem = xen_after_bootmem; pv_mmu_ops = xen_mmu_ops; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b4390db64da3..905db9d7962f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include @@ -317,9 +316,6 @@ static inline bool update_defer_init(pg_data_t *pgdat, /* Always populate low zones for address-constrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; - /* Xen PV domains need page structures early */ - if (xen_pv_domain()) - return true; (*nr_initialised)++; if ((*nr_initialised > pgdat->static_init_pgcnt) && (pfn & 
(PAGES_PER_SECTION - 1)) == 0) { From 2a6cc8a6c0cb44baf7df2f64e5090aaf726002c3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 10 Apr 2018 16:36:15 -0700 Subject: [PATCH 130/140] linux/const.h: prefix include guard of uapi/linux/const.h with _UAPI Patch series "linux/const.h: cleanups of macros such as UL(), _BITUL(), BIT() etc", v3. ARM, ARM64, UniCore32 define UL() as a shorthand of _AC(..., UL). More architectures may introduce it in the future. UL() is arch-agnostic, and useful. So let's move it to include/linux/const.h Currently, must be included to use UL(). It pulls in more bloats just for defining some bit macros. I posted V2 one year ago. The previous posts are: https://patchwork.kernel.org/patch/9498273/ https://patchwork.kernel.org/patch/9498275/ https://patchwork.kernel.org/patch/9498269/ https://patchwork.kernel.org/patch/9498271/ At that time, what blocked this series was a comment from David Howells: You need to be very careful doing this. Some userspace stuff depends on the guard macro names on the kernel header files. (https://patchwork.kernel.org/patch/9498275/) Looking at the code closer, I noticed this is not a problem. See the following line. https://github.com/torvalds/linux/blob/v4.16-rc2/scripts/headers_install.sh#L40 scripts/headers_install.sh rips off _UAPI prefix from guard macro names. I ran "make headers_install" and confirmed the result is what I expect. So, we can prefix the include guard of include/uapi/linux/const.h, and add a new include/linux/const.h. This patch (of 4): I am going to add include/linux/const.h for the kernel space. Add _UAPI to the include guard of include/uapi/linux/const.h to prepare for that. Please notice the guard name of the exported one will be kept as-is. So, this commit has no impact to the userspace even if some userspace stuff depends on the guard macro names. scripts/headers_install.sh processes exported headers by SED, and rips off "_UAPI" from guard macro names. 
#ifndef _UAPI_LINUX_CONST_H #define _UAPI_LINUX_CONST_H will be turned into #ifndef _LINUX_CONST_H #define _LINUX_CONST_H Link: http://lkml.kernel.org/r/1519301715-31798-2-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: David Howells Cc: Will Deacon Cc: Guan Xuetao Cc: Geert Uytterhoeven Cc: Catalin Marinas Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/const.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/const.h b/include/uapi/linux/const.h index 92537757590a..c5a60ebabf5e 100644 --- a/include/uapi/linux/const.h +++ b/include/uapi/linux/const.h @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* const.h: Macros for dealing with constants. */ -#ifndef _LINUX_CONST_H -#define _LINUX_CONST_H +#ifndef _UAPI_LINUX_CONST_H +#define _UAPI_LINUX_CONST_H /* Some constant macros are used in both assembler and * C code. Therefore we cannot annotate them always with @@ -25,4 +25,4 @@ #define _BITUL(x) (_AC(1,UL) << (x)) #define _BITULL(x) (_AC(1,ULL) << (x)) -#endif /* !(_LINUX_CONST_H) */ +#endif /* _UAPI_LINUX_CONST_H */ From 2dd8a62c647691161a2346546834262597739872 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 10 Apr 2018 16:36:19 -0700 Subject: [PATCH 131/140] linux/const.h: move UL() macro to include/linux/const.h ARM, ARM64 and UniCore32 duplicate the definition of UL(): #define UL(x) _AC(x, UL) This is not actually arch-specific, so it will be useful to move it to a common header. Currently, we only have the uapi variant for linux/const.h, so I am creating include/linux/const.h. I also added _UL(), _ULL() and ULL() because _AC() is mostly used in the form either _AC(..., UL) or _AC(..., ULL). I expect they will be replaced in follow-up cleanups. The underscore-prefixed ones should be used for exported headers. 
Link: http://lkml.kernel.org/r/1519301715-31798-4-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Guan Xuetao Acked-by: Catalin Marinas Acked-by: Russell King Cc: David Howells Cc: Geert Uytterhoeven Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/memory.h | 6 ------ arch/arm64/include/asm/memory.h | 6 ------ arch/unicore32/include/asm/memory.h | 6 ------ include/linux/const.h | 9 +++++++++ include/uapi/linux/const.h | 3 +++ 5 files changed, 12 insertions(+), 18 deletions(-) create mode 100644 include/linux/const.h diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index 496667703693..ed8fd0d19a3e 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -22,12 +22,6 @@ #include #endif -/* - * Allow for constants defined here to be used from assembly code - * by prepending the UL suffix only with actual C code compilation. - */ -#define UL(x) _AC(x, UL) - /* PAGE_OFFSET - the virtual address of the start of the kernel image */ #define PAGE_OFFSET UL(CONFIG_PAGE_OFFSET) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 50fa96a49792..49d99214f43c 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -28,12 +28,6 @@ #include #include -/* - * Allow for constants defined here to be used from assembly code - * by prepending the UL suffix only with actual C code compilation. - */ -#define UL(x) _AC(x, UL) - /* * Size of the PCI I/O space. This must remain a power of two so that * IO_SPACE_LIMIT acts as a mask for the low bits of I/O addresses. 
diff --git a/arch/unicore32/include/asm/memory.h b/arch/unicore32/include/asm/memory.h index 3bb0a29fd2d7..66bb9f6525c0 100644 --- a/arch/unicore32/include/asm/memory.h +++ b/arch/unicore32/include/asm/memory.h @@ -19,12 +19,6 @@ #include #include -/* - * Allow for constants defined here to be used from assembly code - * by prepending the UL suffix only with actual C code compilation. - */ -#define UL(x) _AC(x, UL) - /* * PAGE_OFFSET - the virtual address of the start of the kernel image * TASK_SIZE - the maximum size of a user space task. diff --git a/include/linux/const.h b/include/linux/const.h new file mode 100644 index 000000000000..7b55a55f5911 --- /dev/null +++ b/include/linux/const.h @@ -0,0 +1,9 @@ +#ifndef _LINUX_CONST_H +#define _LINUX_CONST_H + +#include + +#define UL(x) (_UL(x)) +#define ULL(x) (_ULL(x)) + +#endif /* _LINUX_CONST_H */ diff --git a/include/uapi/linux/const.h b/include/uapi/linux/const.h index c5a60ebabf5e..09bc0e0e97e3 100644 --- a/include/uapi/linux/const.h +++ b/include/uapi/linux/const.h @@ -22,6 +22,9 @@ #define _AT(T,X) ((T)(X)) #endif +#define _UL(x) (_AC(x, UL)) +#define _ULL(x) (_AC(x, ULL)) + #define _BITUL(x) (_AC(1,UL) << (x)) #define _BITULL(x) (_AC(1,ULL) << (x)) From 21e7bc600e3b662020c05fd0749bcf85f16336f7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 10 Apr 2018 16:36:24 -0700 Subject: [PATCH 132/140] linux/const.h: refactor _BITUL and _BITULL a bit Minor cleanups available by _UL and _ULL. 
Link: http://lkml.kernel.org/r/1519301715-31798-5-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Catalin Marinas Cc: David Howells Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/const.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/const.h b/include/uapi/linux/const.h index 09bc0e0e97e3..5ed721ad5b19 100644 --- a/include/uapi/linux/const.h +++ b/include/uapi/linux/const.h @@ -25,7 +25,7 @@ #define _UL(x) (_AC(x, UL)) #define _ULL(x) (_AC(x, ULL)) -#define _BITUL(x) (_AC(1,UL) << (x)) -#define _BITULL(x) (_AC(1,ULL) << (x)) +#define _BITUL(x) (_UL(1) << (x)) +#define _BITULL(x) (_ULL(1) << (x)) #endif /* _UAPI_LINUX_CONST_H */ From fa290cda102c096f5ca394277d65d3dbd689930b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:36:28 -0700 Subject: [PATCH 133/140] radix tree: use GFP_ZONEMASK bits of gfp_t for flags Patch series "XArray", v9. (First part thereof). This patchset is, I believe, appropriate for merging for 4.17. It contains the XArray implementation, to eventually replace the radix tree, and converts the page cache to use it. This conversion keeps the radix tree and XArray data structures in sync at all times. That allows us to convert the page cache one function at a time and should allow for easier bisection. Other than renaming some elements of the structures, the data structures are fundamentally unchanged; a radix tree walk and an XArray walk will touch the same number of cachelines. I have changes planned to the XArray data structure, but those will happen in future patches. Improvements the XArray has over the radix tree: - The radix tree provides operations like other trees do; 'insert' and 'delete'. 
But what most users really want is an automatically resizing array, and so it makes more sense to give users an API that is like an array -- 'load' and 'store'. We still have an 'insert' operation for users that really want that semantic. - The XArray considers locking as part of its API. This simplifies a lot of users who formerly had to manage their own locking just for the radix tree. It also improves code generation as we can now tell RCU that we're holding a lock and it doesn't need to generate as much fencing code. The other advantage is that tree nodes can be moved (not yet implemented). - GFP flags are now parameters to calls which may need to allocate memory. The radix tree forced users to decide what the allocation flags would be at creation time. It's much clearer to specify them at allocation time. - Memory is not preloaded; we don't tie up dozens of pages on the off chance that the slab allocator fails. Instead, we drop the lock, allocate a new node and retry the operation. We have to convert all the radix tree, IDA and IDR preload users before we can realise this benefit, but I have not yet found a user which cannot be converted. - The XArray provides a cmpxchg operation. The radix tree forces users to roll their own (and at least four have). - Iterators take a 'max' parameter. That simplifies many users and will reduce the amount of iteration done. - Iteration can proceed backwards. We only have one user for this, but since it's called as part of the pagefault readahead algorithm, that seemed worth mentioning. - RCU-protected pointers are not exposed as part of the API. There are some fun bugs where the page cache forgets to use rcu_dereference() in the current codebase. - Value entries gain an extra bit compared to radix tree exceptional entries. That gives us the extra bit we need to put huge page swap entries in the page cache. - Some iterators now take a 'filter' argument instead of having separate iterators for tagged/untagged iterations. 
The page cache is improved by this: - Shorter, easier to read code - More efficient iterations - Reduction in size of struct address_space - Fewer walks from the top of the data structure; the XArray API encourages staying at the leaf node and conducting operations there. This patch (of 8): None of these bits may be used for slab allocations, so we can use them as radix tree flags as long as we mask them off before passing them to the slab allocator. Move the IDR flag from the high bits to the GFP_ZONEMASK bits. Link: http://lkml.kernel.org/r/20180313132639.17387-3-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Jeff Layton Cc: Darrick J. Wong Cc: Dave Chinner Cc: Ryusuke Konishi Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/idr.h | 3 ++- include/linux/radix-tree.h | 7 ++++--- lib/radix-tree.c | 3 ++- tools/testing/radix-tree/linux/gfp.h | 1 + 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/linux/idr.h b/include/linux/idr.h index 7d6a6313f0ab..913c335054f0 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -29,7 +29,8 @@ struct idr { #define IDR_FREE 0 /* Set the IDR flag and the IDR_FREE tag */ -#define IDR_RT_MARKER ((__force gfp_t)(3 << __GFP_BITS_SHIFT)) +#define IDR_RT_MARKER (ROOT_IS_IDR | (__force gfp_t) \ + (1 << (ROOT_TAG_SHIFT + IDR_FREE))) #define IDR_INIT_BASE(base) { \ .idr_rt = RADIX_TREE_INIT(IDR_RT_MARKER), \ diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index fc55ff31eca7..6c4e2e716dac 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -104,9 +104,10 @@ struct radix_tree_node { unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; -/* The top bits of gfp_mask are used to store the root tags and the IDR flag */ -#define ROOT_IS_IDR ((__force gfp_t)(1 << __GFP_BITS_SHIFT)) -#define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT + 1) +/* The IDR tag is stored in the low bits of the GFP flags */ +#define ROOT_IS_IDR 
((__force gfp_t)4) +/* The top bits of gfp_mask are used to store the root tags */ +#define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT) struct radix_tree_root { gfp_t gfp_mask; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8e00138d593f..da9e10c827df 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -146,7 +146,7 @@ static unsigned int radix_tree_descend(const struct radix_tree_node *parent, static inline gfp_t root_gfp_mask(const struct radix_tree_root *root) { - return root->gfp_mask & __GFP_BITS_MASK; + return root->gfp_mask & (__GFP_BITS_MASK & ~GFP_ZONEMASK); } static inline void tag_set(struct radix_tree_node *node, unsigned int tag, @@ -2285,6 +2285,7 @@ void __init radix_tree_init(void) int ret; BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); + BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK); radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, diff --git a/tools/testing/radix-tree/linux/gfp.h b/tools/testing/radix-tree/linux/gfp.h index e3201ccf54c3..32159c08a52e 100644 --- a/tools/testing/radix-tree/linux/gfp.h +++ b/tools/testing/radix-tree/linux/gfp.h @@ -19,6 +19,7 @@ #define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM|__GFP_KSWAPD_RECLAIM) +#define GFP_ZONEMASK 0x0fu #define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) From 60a052719af6cf34cee53c6b93b2d31cfc795de7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:36:33 -0700 Subject: [PATCH 134/140] mac80211_hwsim: use DEFINE_IDA This is preferred to opencoding an IDA_INIT. 
Link: http://lkml.kernel.org/r/20180313132639.17387-2-willy@infradead.org Signed-off-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/wireless/mac80211_hwsim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 6afe896e5cb8..96d26cfae90b 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -253,7 +253,7 @@ static inline void hwsim_clear_chanctx_magic(struct ieee80211_chanctx_conf *c) static unsigned int hwsim_net_id; -static struct ida hwsim_netgroup_ida = IDA_INIT; +static DEFINE_IDA(hwsim_netgroup_ida); struct hwsim_net { int netgroup; From 427c896f262ad70d1e6b04890945675df13bb031 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:36:36 -0700 Subject: [PATCH 135/140] arm64: turn flush_dcache_mmap_lock into a no-op ARM64 doesn't walk the VMA tree in its flush_dcache_page() implementation, so has no need to take the tree_lock. Link: http://lkml.kernel.org/r/20180313132639.17387-4-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Will Deacon Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: Jeff Layton Cc: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/cacheflush.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 7dfcec4700fe..0094c6653b06 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -140,10 +140,8 @@ static inline void __flush_icache_all(void) dsb(ish); } -#define flush_dcache_mmap_lock(mapping) \ - spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) \ - spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) do { } while (0) +#define flush_dcache_mmap_unlock(mapping) do { } while (0) /* * We don't appear to need to do anything here. In fact, if we did, we'd From d339d705f7024d602b1449e36e53451a0fcde9fa Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:36:40 -0700 Subject: [PATCH 136/140] unicore32: turn flush_dcache_mmap_lock into a no-op Unicore doesn't walk the VMA tree in its flush_dcache_page() implementation, so has no need to take the tree_lock. Link: http://lkml.kernel.org/r/20180313132639.17387-5-willy@infradead.org Signed-off-by: Matthew Wilcox Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: Jeff Layton Cc: Ryusuke Konishi Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/unicore32/include/asm/cacheflush.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/unicore32/include/asm/cacheflush.h b/arch/unicore32/include/asm/cacheflush.h index a5e08e2d5d6d..1d9132b66039 100644 --- a/arch/unicore32/include/asm/cacheflush.h +++ b/arch/unicore32/include/asm/cacheflush.h @@ -170,10 +170,8 @@ extern void flush_cache_page(struct vm_area_struct *vma, #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); -#define flush_dcache_mmap_lock(mapping) \ - spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) \ - spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) do { } while (0) +#define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_icache_user_range(vma, page, addr, len) \ flush_dcache_page(page) From f82b376413298ddd39a2391e38260c15cdebf380 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:36:44 -0700 Subject: [PATCH 137/140] export __set_page_dirty XFS currently contains a copy-and-paste of __set_page_dirty(). Export it from buffer.c instead. Link: http://lkml.kernel.org/r/20180313132639.17387-6-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Jeff Layton Reviewed-by: Darrick J. Wong Cc: Ryusuke Konishi Cc: Dave Chinner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 3 ++- fs/xfs/xfs_aops.c | 15 ++------------- include/linux/mm.h | 1 + 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index ec5dd39071e6..64b1e2065b6b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -594,7 +594,7 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * * The caller must hold lock_page_memcg(). 
*/ -static void __set_page_dirty(struct page *page, struct address_space *mapping, +void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { unsigned long flags; @@ -608,6 +608,7 @@ static void __set_page_dirty(struct page *page, struct address_space *mapping, } spin_unlock_irqrestore(&mapping->tree_lock, flags); } +EXPORT_SYMBOL_GPL(__set_page_dirty); /* * Add a page to the dirty page list. diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 436a1de3fcdf..0ab824f574ed 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1467,19 +1467,8 @@ xfs_vm_set_page_dirty( newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); - if (newly_dirty) { - /* sigh - __set_page_dirty() is static, so copy it here, too */ - unsigned long flags; - - spin_lock_irqsave(&mapping->tree_lock, flags); - if (page->mapping) { /* Race with truncate? */ - WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - } - spin_unlock_irqrestore(&mapping->tree_lock, flags); - } + if (newly_dirty) + __set_page_dirty(page, mapping, 1); unlock_page_memcg(page); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff --git a/include/linux/mm.h b/include/linux/mm.h index 342c441c25d0..f13bc25f7a9f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1466,6 +1466,7 @@ extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned int offset, unsigned int length); +void __set_page_dirty(struct page *, struct address_space *, int warn); int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, From e5a955419642e0842fd26e1ada6ab3328018ca16 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:36:48 -0700 Subject: [PATCH 138/140] fscache: use 
appropriate radix tree accessors Don't open-code accesses to data structure internals. Link: http://lkml.kernel.org/r/20180313132639.17387-7-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Jeff Layton Cc: Darrick J. Wong Cc: Dave Chinner Cc: Ryusuke Konishi Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fscache/cookie.c | 2 +- fs/fscache/object.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 7dc55b93a830..97137d7ec5ee 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -832,7 +832,7 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, /* Clear pointers back to the netfs */ cookie->netfs_data = NULL; cookie->def = NULL; - BUG_ON(cookie->stores.rnode); + BUG_ON(!radix_tree_empty(&cookie->stores)); if (cookie->parent) { ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 1085ca12e25c..20e0d0a4dc8c 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -973,7 +973,7 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj * retire the object instead. */ if (!fscache_use_cookie(object)) { - ASSERT(object->cookie->stores.rnode == NULL); + ASSERT(radix_tree_empty(&object->cookie->stores)); set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); _leave(" [no cookie]"); return transit_to(KILL_OBJECT); From f6bb2a2c0b81c47282ddb7883f92e65a063c27dd Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:36:52 -0700 Subject: [PATCH 139/140] xarray: add the xa_lock to the radix_tree_root This results in no change in structure size on 64-bit machines as it fits in the padding between the gfp_t and the void *. 32-bit machines will grow the structure from 8 to 12 bytes. Almost all radix trees are protected with (at least) a spinlock, so as they are converted from radix trees to xarrays, the data structures will shrink again. 
Initialising the spinlock requires a name for the benefit of lockdep, so RADIX_TREE_INIT() now needs to know the name of the radix tree it's initialising, and so do IDR_INIT() and IDA_INIT(). Also add the xa_lock() and xa_unlock() family of wrappers to make it easier to use the lock. If we could rely on -fplan9-extensions in the compiler, we could avoid all of this syntactic sugar, but that wasn't added until gcc 4.6. Link: http://lkml.kernel.org/r/20180313132639.17387-8-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Jeff Layton Cc: Darrick J. Wong Cc: Dave Chinner Cc: Ryusuke Konishi Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/gc.c | 2 +- include/linux/idr.h | 19 ++++++++++--------- include/linux/radix-tree.h | 7 +++++-- include/linux/xarray.h | 24 ++++++++++++++++++++++++ kernel/pid.c | 2 +- tools/include/linux/spinlock.h | 1 + 6 files changed, 42 insertions(+), 13 deletions(-) create mode 100644 include/linux/xarray.h diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index bfb7a4a3a929..9327411fd93b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1015,7 +1015,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), - .iroot = RADIX_TREE_INIT(GFP_NOFS), + .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; trace_f2fs_gc_begin(sbi->sb, sync, background, diff --git a/include/linux/idr.h b/include/linux/idr.h index 913c335054f0..e856f4e0ab35 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -32,27 +32,28 @@ struct idr { #define IDR_RT_MARKER (ROOT_IS_IDR | (__force gfp_t) \ (1 << (ROOT_TAG_SHIFT + IDR_FREE))) -#define IDR_INIT_BASE(base) { \ - .idr_rt = RADIX_TREE_INIT(IDR_RT_MARKER), \ +#define IDR_INIT_BASE(name, base) { \ + .idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER), \ .idr_base = (base), \ .idr_next = 0, \ } /** * IDR_INIT() - Initialise an IDR. + * @name: Name of IDR. 
* * A freshly-initialised IDR contains no IDs. */ -#define IDR_INIT IDR_INIT_BASE(0) +#define IDR_INIT(name) IDR_INIT_BASE(name, 0) /** - * DEFINE_IDR() - Define a statically-allocated IDR - * @name: Name of IDR + * DEFINE_IDR() - Define a statically-allocated IDR. + * @name: Name of IDR. * * An IDR defined using this macro is ready for use with no additional * initialisation required. It contains no IDs. */ -#define DEFINE_IDR(name) struct idr name = IDR_INIT +#define DEFINE_IDR(name) struct idr name = IDR_INIT(name) /** * idr_get_cursor - Return the current position of the cyclic allocator @@ -219,10 +220,10 @@ struct ida { struct radix_tree_root ida_rt; }; -#define IDA_INIT { \ - .ida_rt = RADIX_TREE_INIT(IDR_RT_MARKER | GFP_NOWAIT), \ +#define IDA_INIT(name) { \ + .ida_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER | GFP_NOWAIT), \ } -#define DEFINE_IDA(name) struct ida name = IDA_INIT +#define DEFINE_IDA(name) struct ida name = IDA_INIT(name) int ida_pre_get(struct ida *ida, gfp_t gfp_mask); int ida_get_new_above(struct ida *ida, int starting_id, int *p_id); diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 6c4e2e716dac..34149e8b5f73 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -110,20 +110,23 @@ struct radix_tree_node { #define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT) struct radix_tree_root { + spinlock_t xa_lock; gfp_t gfp_mask; struct radix_tree_node __rcu *rnode; }; -#define RADIX_TREE_INIT(mask) { \ +#define RADIX_TREE_INIT(name, mask) { \ + .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ .gfp_mask = (mask), \ .rnode = NULL, \ } #define RADIX_TREE(name, mask) \ - struct radix_tree_root name = RADIX_TREE_INIT(mask) + struct radix_tree_root name = RADIX_TREE_INIT(name, mask) #define INIT_RADIX_TREE(root, mask) \ do { \ + spin_lock_init(&(root)->xa_lock); \ (root)->gfp_mask = (mask); \ (root)->rnode = NULL; \ } while (0) diff --git a/include/linux/xarray.h b/include/linux/xarray.h new file mode 100644 index 
000000000000..2dfc8006fe64 --- /dev/null +++ b/include/linux/xarray.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _LINUX_XARRAY_H +#define _LINUX_XARRAY_H +/* + * eXtensible Arrays + * Copyright (c) 2017 Microsoft Corporation + * Author: Matthew Wilcox + */ + +#include <linux/spinlock.h> + +#define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) +#define xa_lock(xa) spin_lock(&(xa)->xa_lock) +#define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) +#define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) +#define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) +#define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) +#define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) +#define xa_lock_irqsave(xa, flags) \ + spin_lock_irqsave(&(xa)->xa_lock, flags) +#define xa_unlock_irqrestore(xa, flags) \ + spin_unlock_irqrestore(&(xa)->xa_lock, flags) + +#endif /* _LINUX_XARRAY_H */ diff --git a/kernel/pid.c b/kernel/pid.c index ed6c343fe50d..157fe4b19971 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -70,7 +70,7 @@ int pid_max_max = PID_MAX_LIMIT; */ struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), - .idr = IDR_INIT, + .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index 4ed569fcb139..b21b586b9854 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -7,6 +7,7 @@ #define spinlock_t pthread_mutex_t #define DEFINE_SPINLOCK(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER; +#define __SPIN_LOCK_UNLOCKED(x) (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER #define spin_lock_irqsave(x, f) (void)f, pthread_mutex_lock(x) #define spin_unlock_irqrestore(x, f) (void)f, pthread_mutex_unlock(x) From b93b016313b3ba8003c3b8bb71f569af91f19fc7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:36:56 -0700 Subject: [PATCH 140/140] page cache: use xa_lock Remove the address_space ->tree_lock and use the xa_lock newly
added to the radix_tree_root. Rename the address_space ->page_tree to ->i_pages, since we don't really care that it's a tree. [willy@infradead.org: fix nds32, fs/dax.c] Link: http://lkml.kernel.org/r/20180406145415.GB20605@bombadil.infradead.org Link: http://lkml.kernel.org/r/20180313132639.17387-9-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Jeff Layton Cc: Darrick J. Wong Cc: Dave Chinner Cc: Ryusuke Konishi Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v1/memory.txt | 2 +- Documentation/vm/page_migration | 14 +- arch/arm/include/asm/cacheflush.h | 6 +- arch/nds32/include/asm/cacheflush.h | 4 +- arch/nios2/include/asm/cacheflush.h | 6 +- arch/parisc/include/asm/cacheflush.h | 6 +- drivers/staging/lustre/lustre/llite/glimpse.c | 2 +- .../staging/lustre/lustre/mdc/mdc_request.c | 8 +- fs/afs/write.c | 9 +- fs/btrfs/compression.c | 2 +- fs/btrfs/extent_io.c | 16 +-- fs/buffer.c | 13 +- fs/cifs/file.c | 9 +- fs/dax.c | 124 +++++++++--------- fs/f2fs/data.c | 6 +- fs/f2fs/dir.c | 6 +- fs/f2fs/inline.c | 6 +- fs/f2fs/node.c | 8 +- fs/fs-writeback.c | 22 ++-- fs/inode.c | 11 +- fs/nilfs2/btnode.c | 20 +-- fs/nilfs2/page.c | 22 ++-- include/linux/backing-dev.h | 14 +- include/linux/fs.h | 8 +- include/linux/mm.h | 2 +- include/linux/pagemap.h | 4 +- mm/filemap.c | 84 ++++++------ mm/huge_memory.c | 10 +- mm/khugepaged.c | 49 ++++--- mm/memcontrol.c | 4 +- mm/migrate.c | 32 ++--- mm/page-writeback.c | 43 +++--- mm/readahead.c | 2 +- mm/rmap.c | 4 +- mm/shmem.c | 60 ++++----- mm/swap_state.c | 17 ++- mm/truncate.c | 22 ++-- mm/vmscan.c | 12 +- mm/workingset.c | 22 ++-- 39 files changed, 345 insertions(+), 366 deletions(-) diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt index a4af2e124e24..3682e99234c2 100644 --- a/Documentation/cgroup-v1/memory.txt +++ b/Documentation/cgroup-v1/memory.txt @@ -262,7 +262,7 @@ When oom event notifier is registered, event will be
delivered. 2.6 Locking lock_page_cgroup()/unlock_page_cgroup() should not be called under - mapping->tree_lock. + the i_pages lock. Other lock order is following: PG_locked. diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration index 0478ae2ad44a..496868072e24 100644 --- a/Documentation/vm/page_migration +++ b/Documentation/vm/page_migration @@ -90,7 +90,7 @@ Steps: 1. Lock the page to be migrated -2. Insure that writeback is complete. +2. Ensure that writeback is complete. 3. Lock the new page that we want to move to. It is locked so that accesses to this (not yet uptodate) page immediately lock while the move is in progress. @@ -100,8 +100,8 @@ Steps: mapcount is not zero then we do not migrate the page. All user space processes that attempt to access the page will now wait on the page lock. -5. The radix tree lock is taken. This will cause all processes trying - to access the page via the mapping to block on the radix tree spinlock. +5. The i_pages lock is taken. This will cause all processes trying + to access the page via the mapping to block on the spinlock. 6. The refcount of the page is examined and we back out if references remain otherwise we know that we are the only one referencing this page. @@ -114,12 +114,12 @@ Steps: 9. The radix tree is changed to point to the new page. -10. The reference count of the old page is dropped because the radix tree +10. The reference count of the old page is dropped because the address space reference is gone. A reference to the new page is established because - the new page is referenced to by the radix tree. + the new page is referenced by the address space. -11. The radix tree lock is dropped. With that lookups in the mapping - become possible again. Processes will move from spinning on the tree_lock +11. The i_pages lock is dropped. With that lookups in the mapping + become possible again. Processes will move from spinning on the lock to sleeping on the locked new page. 12. 
The page contents are copied to the new page. diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 74504b154256..869080bedb89 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -318,10 +318,8 @@ static inline void flush_anon_page(struct vm_area_struct *vma, #define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE extern void flush_kernel_dcache_page(struct page *); -#define flush_dcache_mmap_lock(mapping) \ - spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) \ - spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) +#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) #define flush_icache_user_range(vma,page,addr,len) \ flush_dcache_page(page) diff --git a/arch/nds32/include/asm/cacheflush.h b/arch/nds32/include/asm/cacheflush.h index 7b9b20a381cb..1240f148ec0f 100644 --- a/arch/nds32/include/asm/cacheflush.h +++ b/arch/nds32/include/asm/cacheflush.h @@ -34,8 +34,8 @@ void flush_anon_page(struct vm_area_struct *vma, void flush_kernel_dcache_page(struct page *page); void flush_icache_range(unsigned long start, unsigned long end); void flush_icache_page(struct vm_area_struct *vma, struct page *page); -#define flush_dcache_mmap_lock(mapping) spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&(mapping)->i_pages) +#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&(mapping)->i_pages) #else #include diff --git a/arch/nios2/include/asm/cacheflush.h b/arch/nios2/include/asm/cacheflush.h index 55e383c173f7..18eb9f69f806 100644 --- a/arch/nios2/include/asm/cacheflush.h +++ b/arch/nios2/include/asm/cacheflush.h @@ -46,9 +46,7 @@ extern void copy_from_user_page(struct vm_area_struct *vma, struct page *page, extern void flush_dcache_range(unsigned long start, unsigned long end); extern 
void invalidate_dcache_range(unsigned long start, unsigned long end); -#define flush_dcache_mmap_lock(mapping) \ - spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) \ - spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) +#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) #endif /* _ASM_NIOS2_CACHEFLUSH_H */ diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index bd5ce31936f5..0c83644bfa5c 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -55,10 +55,8 @@ void invalidate_kernel_vmap_range(void *vaddr, int size); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *page); -#define flush_dcache_mmap_lock(mapping) \ - spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) \ - spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) +#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) #define flush_icache_page(vma,page) do { \ flush_kernel_dcache_page(page); \ diff --git a/drivers/staging/lustre/lustre/llite/glimpse.c b/drivers/staging/lustre/lustre/llite/glimpse.c index c43ac574274c..3075358f3f08 100644 --- a/drivers/staging/lustre/lustre/llite/glimpse.c +++ b/drivers/staging/lustre/lustre/llite/glimpse.c @@ -69,7 +69,7 @@ blkcnt_t dirty_cnt(struct inode *inode) void *results[1]; if (inode->i_mapping) - cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree, + cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->i_pages, results, 0, 1, PAGECACHE_TAG_DIRTY); if (cnt == 0 && atomic_read(&vob->vob_mmap_cnt) > 0) diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c index 3b1c8e5a3053..8ee7b4d273b2 100644 --- a/drivers/staging/lustre/lustre/mdc/mdc_request.c +++ 
b/drivers/staging/lustre/lustre/mdc/mdc_request.c @@ -934,14 +934,14 @@ static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, struct page *page; int found; - spin_lock_irq(&mapping->tree_lock); - found = radix_tree_gang_lookup(&mapping->page_tree, + xa_lock_irq(&mapping->i_pages); + found = radix_tree_gang_lookup(&mapping->i_pages, (void **)&page, offset, 1); if (found > 0 && !radix_tree_exceptional_entry(page)) { struct lu_dirpage *dp; get_page(page); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* * In contrast to find_lock_page() we are sure that directory * page cannot be truncated (while DLM lock is held) and, @@ -989,7 +989,7 @@ static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, page = ERR_PTR(-EIO); } } else { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); page = NULL; } return page; diff --git a/fs/afs/write.c b/fs/afs/write.c index 9370e2feb999..dbc3c0b0142d 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -570,10 +570,11 @@ static int afs_writepages_region(struct address_space *mapping, _debug("wback %lx", page->index); - /* at this point we hold neither mapping->tree_lock nor lock on - * the page itself: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled back from - * swapper_space to tmpfs file mapping + /* + * at this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping */ ret = lock_page_killable(page); if (ret < 0) { diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 562c3e633403..578181cd96b5 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -458,7 +458,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, break; rcu_read_lock(); - page = radix_tree_lookup(&mapping->page_tree, pg_index); + page 
= radix_tree_lookup(&mapping->i_pages, pg_index); rcu_read_unlock(); if (page && !radix_tree_exceptional_entry(page)) { misses++; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 47a8fe9d22e8..cf87976e389d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3963,11 +3963,11 @@ retry: done_index = page->index; /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping + * At this point we hold neither the i_pages lock nor + * the page lock: the page may be truncated or + * invalidated (changing page->mapping to NULL), + * or even swizzled back from swapper_space to + * tmpfs file mapping */ if (!trylock_page(page)) { flush_write_bio(epd); @@ -5174,13 +5174,13 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(!PagePrivate(page)); clear_page_dirty_for_io(page); - spin_lock_irq(&page->mapping->tree_lock); + xa_lock_irq(&page->mapping->i_pages); if (!PageDirty(page)) { - radix_tree_tag_clear(&page->mapping->page_tree, + radix_tree_tag_clear(&page->mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&page->mapping->tree_lock); + xa_unlock_irq(&page->mapping->i_pages); ClearPageError(page); unlock_page(page); } diff --git a/fs/buffer.c b/fs/buffer.c index 64b1e2065b6b..f3491074b035 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -185,10 +185,9 @@ EXPORT_SYMBOL(end_buffer_write_sync); * we get exclusion from try_to_free_buffers with the blockdev mapping's * private_lock. * - * Hack idea: for the blockdev mapping, i_bufferlist_lock contention + * Hack idea: for the blockdev mapping, private_lock contention * may be quite high. This code could TryLock the page, and if that - * succeeds, there is no need to take private_lock. (But if - * private_lock is contended then so is mapping->tree_lock). 
+ * succeeds, there is no need to take private_lock. */ static struct buffer_head * __find_get_block_slow(struct block_device *bdev, sector_t block) @@ -599,14 +598,14 @@ void __set_page_dirty(struct page *page, struct address_space *mapping, { unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); } EXPORT_SYMBOL_GPL(__set_page_dirty); @@ -1096,7 +1095,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, - * mapping->tree_lock and mapping->host->i_lock. + * i_pages lock and mapping->host->i_lock. 
*/ void mark_buffer_dirty(struct buffer_head *bh) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7cee97b93a61..4bcd4e838b47 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1987,11 +1987,10 @@ wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages, for (i = 0; i < found_pages; i++) { page = wdata->pages[i]; /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping + * At this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping */ if (nr_pages == 0) diff --git a/fs/dax.c b/fs/dax.c index a77394fe586e..aaec72ded1b6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -158,11 +158,9 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo } /* - * We do not necessarily hold the mapping->tree_lock when we call this - * function so it is possible that 'entry' is no longer a valid item in the - * radix tree. This is okay because all we really need to do is to find the - * correct waitqueue where tasks might be waiting for that old 'entry' and - * wake them. + * @entry may no longer be the entry at the index in the mapping. + * The important information it's conveying is whether the entry at + * this index used to be a PMD entry. */ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, pgoff_t index, void *entry, bool wake_all) @@ -174,7 +172,7 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, /* * Checking for locked entry and prepare_to_wait_exclusive() happens - * under mapping->tree_lock, ditto for entry handling in our callers. + * under the i_pages lock, ditto for entry handling in our callers. 
* So at this point all tasks that could have seen our entry locked * must be in the waitqueue and the following check will see them. */ @@ -183,41 +181,39 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, } /* - * Check whether the given slot is locked. The function must be called with - * mapping->tree_lock held + * Check whether the given slot is locked. Must be called with the i_pages + * lock held. */ static inline int slot_locked(struct address_space *mapping, void **slot) { unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); return entry & RADIX_DAX_ENTRY_LOCK; } /* - * Mark the given slot is locked. The function must be called with - * mapping->tree_lock held + * Mark the given slot as locked. Must be called with the i_pages lock held. */ static inline void *lock_slot(struct address_space *mapping, void **slot) { unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); + radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); return (void *)entry; } /* - * Mark the given slot is unlocked. The function must be called with - * mapping->tree_lock held + * Mark the given slot as unlocked. Must be called with the i_pages lock held. 
*/ static inline void *unlock_slot(struct address_space *mapping, void **slot) { unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); + radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); return (void *)entry; } @@ -228,7 +224,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) * put_locked_mapping_entry() when he locked the entry and now wants to * unlock it. * - * The function must be called with mapping->tree_lock held. + * Must be called with the i_pages lock held. */ static void *get_unlocked_mapping_entry(struct address_space *mapping, pgoff_t index, void ***slotp) @@ -241,7 +237,7 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, ewait.wait.func = wake_exceptional_entry_func; for (;;) { - entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, + entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || @@ -254,10 +250,10 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); schedule(); finish_wait(wq, &ewait.wait); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } } @@ -266,15 +262,15 @@ static void dax_unlock_mapping_entry(struct address_space *mapping, { void *entry, **slot; - spin_lock_irq(&mapping->tree_lock); - entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + xa_lock_irq(&mapping->i_pages); + entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (WARN_ON_ONCE(!entry || 
!radix_tree_exceptional_entry(entry) || !slot_locked(mapping, slot))) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return; } unlock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); dax_wake_mapping_entry_waiter(mapping, index, entry, false); } @@ -388,7 +384,7 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, void *entry, **slot; restart: - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, &slot); if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { @@ -420,12 +416,12 @@ restart: if (pmd_downgrade) { /* * Make sure 'entry' remains valid while we drop - * mapping->tree_lock. + * the i_pages lock. */ entry = lock_slot(mapping, slot); } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* * Besides huge zero pages the only other thing that gets * downgraded are empty entries which don't need to be @@ -442,27 +438,27 @@ restart: put_locked_mapping_entry(mapping, index); return ERR_PTR(err); } - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); if (!entry) { /* - * We needed to drop the page_tree lock while calling + * We needed to drop the i_pages lock while calling * radix_tree_preload() and we didn't have an entry to * lock. See if another thread inserted an entry at * our index during this time. 
*/ - entry = __radix_tree_lookup(&mapping->page_tree, index, + entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (entry) { radix_tree_preload_end(); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); goto restart; } } if (pmd_downgrade) { dax_disassociate_entry(entry, mapping, false); - radix_tree_delete(&mapping->page_tree, index); + radix_tree_delete(&mapping->i_pages, index); mapping->nrexceptional--; dax_wake_mapping_entry_waiter(mapping, index, entry, true); @@ -470,11 +466,11 @@ restart: entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); - err = __radix_tree_insert(&mapping->page_tree, index, + err = __radix_tree_insert(&mapping->i_pages, index, dax_radix_order(entry), entry); radix_tree_preload_end(); if (err) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* * Our insertion of a DAX entry failed, most likely * because we were inserting a PMD entry and it @@ -487,12 +483,12 @@ restart: } /* Good, we have inserted empty locked entry into the tree. 
*/ mapping->nrexceptional++; - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return entry; } entry = lock_slot(mapping, slot); out_unlock: - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return entry; } @@ -501,23 +497,23 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, { int ret = 0; void *entry; - struct radix_tree_root *page_tree = &mapping->page_tree; + struct radix_tree_root *pages = &mapping->i_pages; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(pages); entry = get_unlocked_mapping_entry(mapping, index, NULL); if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) goto out; if (!trunc && - (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || - radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))) + (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) || + radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))) goto out; dax_disassociate_entry(entry, mapping, trunc); - radix_tree_delete(page_tree, index); + radix_tree_delete(pages, index); mapping->nrexceptional--; ret = 1; out: put_unlocked_mapping_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(pages); return ret; } /* @@ -587,7 +583,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, void *entry, pfn_t pfn_t, unsigned long flags, bool dirty) { - struct radix_tree_root *page_tree = &mapping->page_tree; + struct radix_tree_root *pages = &mapping->i_pages; unsigned long pfn = pfn_t_to_pfn(pfn_t); pgoff_t index = vmf->pgoff; void *new_entry; @@ -604,7 +600,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, unmap_mapping_pages(mapping, vmf->pgoff, 1, false); } - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(pages); new_entry = dax_radix_locked_entry(pfn, flags); if (dax_entry_size(entry) != dax_entry_size(new_entry)) { dax_disassociate_entry(entry, mapping, false); @@ -624,17 +620,17 @@ static void 
*dax_insert_mapping_entry(struct address_space *mapping, void **slot; void *ret; - ret = __radix_tree_lookup(page_tree, index, &node, &slot); + ret = __radix_tree_lookup(pages, index, &node, &slot); WARN_ON_ONCE(ret != entry); - __radix_tree_replace(page_tree, node, slot, + __radix_tree_replace(pages, node, slot, new_entry, NULL); entry = new_entry; } if (dirty) - radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); + radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(pages); return entry; } @@ -723,7 +719,7 @@ unlock_pte: static int dax_writeback_one(struct dax_device *dax_dev, struct address_space *mapping, pgoff_t index, void *entry) { - struct radix_tree_root *page_tree = &mapping->page_tree; + struct radix_tree_root *pages = &mapping->i_pages; void *entry2, **slot; unsigned long pfn; long ret = 0; @@ -736,7 +732,7 @@ static int dax_writeback_one(struct dax_device *dax_dev, if (WARN_ON(!radix_tree_exceptional_entry(entry))) return -EIO; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(pages); entry2 = get_unlocked_mapping_entry(mapping, index, &slot); /* Entry got punched out / reallocated? */ if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) @@ -755,7 +751,7 @@ static int dax_writeback_one(struct dax_device *dax_dev, } /* Another fsync thread may have already written back this entry */ - if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)) goto put_unlocked; /* Lock the entry to serialize with page faults */ entry = lock_slot(mapping, slot); @@ -763,11 +759,11 @@ static int dax_writeback_one(struct dax_device *dax_dev, * We can clear the tag now but we have to be careful so that concurrent * dax_writeback_one() calls for the same index cannot finish before we * actually flush the caches. 
This is achieved as the calls will look - * at the entry only under tree_lock and once they do that they will - * see the entry locked and wait for it to unlock. + * at the entry only under the i_pages lock and once they do that + * they will see the entry locked and wait for it to unlock. */ - radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); - spin_unlock_irq(&mapping->tree_lock); + radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE); + xa_unlock_irq(pages); /* * Even if dax_writeback_mapping_range() was given a wbc->range_start @@ -787,16 +783,16 @@ static int dax_writeback_one(struct dax_device *dax_dev, * the pfn mappings are writeprotected and fault waits for mapping * entry lock. */ - spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&mapping->tree_lock); + xa_lock_irq(pages); + radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY); + xa_unlock_irq(pages); trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); put_locked_mapping_entry(mapping, index); return ret; put_unlocked: put_unlocked_mapping_entry(mapping, index, entry2); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(pages); return ret; } @@ -1566,21 +1562,21 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, pgoff_t index = vmf->pgoff; int vmf_ret, error; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, &slot); /* Did we race with someone splitting entry or so? 
*/ if (!entry || (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { put_unlocked_mapping_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } - radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY); entry = lock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); switch (pe_size) { case PE_SIZE_PTE: error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index db50686f5096..02237d4d91f5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2424,12 +2424,12 @@ void f2fs_set_page_dirty_nobuffers(struct page *page) SetPageDirty(page); spin_unlock(&mapping->private_lock); - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); WARN_ON_ONCE(!PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index fe661274ff10..8c9c2f31b253 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -732,10 +732,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, page_index(page), + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - 
spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); clear_page_dirty_for_io(page); ClearPagePrivate(page); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3b77d6421218..265da200daa8 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -226,10 +226,10 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, page_index(page), + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9a99243054ba..f202398e20ea 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -91,11 +91,11 @@ static void clear_node_page_dirty(struct page *page) unsigned int long flags; if (PageDirty(page)) { - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); clear_page_dirty_for_io(page); dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); @@ -1161,7 +1161,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_bug_on(sbi, check_nid_range(sbi, nid)); rcu_read_lock(); - apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid); + apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid); rcu_read_unlock(); if (apage) return; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1280f915079b..4b12ba70a895 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -347,9 +347,9 @@ static void 
inode_switch_wbs_work_fn(struct work_struct *work) * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions * between unlocked_inode_to_wb_begin/end() are guaranteed to be - * synchronizing against mapping->tree_lock. + * synchronizing against the i_pages lock. * - * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock + * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock * gives us exclusion against all wb related operations on @inode * including IO list manipulations and stat updates. */ @@ -361,7 +361,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); } spin_lock(&inode->i_lock); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); /* * Once I_FREEING is visible under i_lock, the eviction path owns @@ -373,22 +373,22 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) /* * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to - * pages actually under underwriteback. + * pages actually under writeback. 
*/ - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, PAGECACHE_TAG_DIRTY) { struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (likely(page) && PageDirty(page)) { dec_wb_stat(old_wb, WB_RECLAIMABLE); inc_wb_stat(new_wb, WB_RECLAIMABLE); } } - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, PAGECACHE_TAG_WRITEBACK) { struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (likely(page)) { WARN_ON_ONCE(!PageWriteback(page)); dec_wb_stat(old_wb, WB_WRITEBACK); @@ -430,7 +430,7 @@ skip_switch: */ smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); @@ -506,8 +506,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the mapping's - * tree_lock so that stat transfer can synchronize against them. + * the RCU protected stat update paths to grab the i_pages + * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be visible. 
*/ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); diff --git a/fs/inode.c b/fs/inode.c index b153aeaa61ea..13ceb98c3bd3 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -348,8 +348,7 @@ EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT); - spin_lock_init(&mapping->tree_lock); + INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); @@ -504,14 +503,14 @@ EXPORT_SYMBOL(__remove_inode_hash); void clear_inode(struct inode *inode) { /* - * We have to cycle tree_lock here because reclaim can be still in the + * We have to cycle the i_pages lock here because reclaim can be in the * process of removing the last page (in __delete_from_page_cache()) - * and we must not free mapping under it. + * and we must not free the mapping under it. */ - spin_lock_irq(&inode->i_data.tree_lock); + xa_lock_irq(&inode->i_data.i_pages); BUG_ON(inode->i_data.nrpages); BUG_ON(inode->i_data.nrexceptional); - spin_unlock_irq(&inode->i_data.tree_lock); + xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index c21e0b4454a6..dec98cab729d 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -193,9 +193,9 @@ retry: (unsigned long long)oldkey, (unsigned long long)newkey); - spin_lock_irq(&btnc->tree_lock); - err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); - spin_unlock_irq(&btnc->tree_lock); + xa_lock_irq(&btnc->i_pages); + err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page); + xa_unlock_irq(&btnc->i_pages); /* * Note: page->index will not change to newkey until * nilfs_btnode_commit_change_key() will be called. 
@@ -251,11 +251,11 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, (unsigned long long)newkey); mark_buffer_dirty(obh); - spin_lock_irq(&btnc->tree_lock); - radix_tree_delete(&btnc->page_tree, oldkey); - radix_tree_tag_set(&btnc->page_tree, newkey, + xa_lock_irq(&btnc->i_pages); + radix_tree_delete(&btnc->i_pages, oldkey); + radix_tree_tag_set(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&btnc->tree_lock); + xa_unlock_irq(&btnc->i_pages); opage->index = obh->b_blocknr = newkey; unlock_page(opage); @@ -283,9 +283,9 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, return; if (nbh == NULL) { /* blocksize == pagesize */ - spin_lock_irq(&btnc->tree_lock); - radix_tree_delete(&btnc->page_tree, newkey); - spin_unlock_irq(&btnc->tree_lock); + xa_lock_irq(&btnc->i_pages); + radix_tree_delete(&btnc->i_pages, newkey); + xa_unlock_irq(&btnc->i_pages); unlock_page(ctxt->bh->b_page); } else brelse(nbh); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 68241512d7c1..4cb850a6f1c2 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -331,15 +331,15 @@ repeat: struct page *page2; /* move the page to the destination cache */ - spin_lock_irq(&smap->tree_lock); - page2 = radix_tree_delete(&smap->page_tree, offset); + xa_lock_irq(&smap->i_pages); + page2 = radix_tree_delete(&smap->i_pages, offset); WARN_ON(page2 != page); smap->nrpages--; - spin_unlock_irq(&smap->tree_lock); + xa_unlock_irq(&smap->i_pages); - spin_lock_irq(&dmap->tree_lock); - err = radix_tree_insert(&dmap->page_tree, offset, page); + xa_lock_irq(&dmap->i_pages); + err = radix_tree_insert(&dmap->i_pages, offset, page); if (unlikely(err < 0)) { WARN_ON(err == -EEXIST); page->mapping = NULL; @@ -348,11 +348,11 @@ repeat: page->mapping = dmap; dmap->nrpages++; if (PageDirty(page)) - radix_tree_tag_set(&dmap->page_tree, + radix_tree_tag_set(&dmap->i_pages, offset, PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&dmap->tree_lock); + xa_unlock_irq(&dmap->i_pages); } 
unlock_page(page); } @@ -474,15 +474,15 @@ int __nilfs_clear_page_dirty(struct page *page) struct address_space *mapping = page->mapping; if (mapping) { - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); if (test_bit(PG_dirty, &page->flags)) { - radix_tree_tag_clear(&mapping->page_tree, + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return clear_page_dirty_for_io(page); } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return 0; } return TestClearPageDirty(page); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e6cbb915ee56..09da0f124699 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -329,7 +329,7 @@ static inline bool inode_to_wb_is_valid(struct inode *inode) * @inode: inode of interest * * Returns the wb @inode is currently associated with. The caller must be - * holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the + * holding either @inode->i_lock, the i_pages lock, or the * associated wb's list_lock. */ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) @@ -337,7 +337,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&inode->i_lock) && - !lockdep_is_held(&inode->i_mapping->tree_lock) && + !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) && !lockdep_is_held(&inode->i_wb->list_lock))); #endif return inode->i_wb; @@ -349,7 +349,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) * @lockedp: temp bool output param, to be passed to the end function * * The caller wants to access the wb associated with @inode but isn't - * holding inode->i_lock, mapping->tree_lock or wb->list_lock. This + * holding inode->i_lock, the i_pages lock or wb->list_lock. 
This * function determines the wb associated with @inode and ensures that the * association doesn't change until the transaction is finished with * unlocked_inode_to_wb_end(). @@ -370,11 +370,11 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; if (unlikely(*lockedp)) - spin_lock_irq(&inode->i_mapping->tree_lock); + xa_lock_irq(&inode->i_mapping->i_pages); /* - * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock. - * inode_to_wb() will bark. Deref directly. + * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages + * lock. inode_to_wb() will bark. Deref directly. */ return inode->i_wb; } @@ -387,7 +387,7 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) { if (unlikely(locked)) - spin_unlock_irq(&inode->i_mapping->tree_lock); + xa_unlock_irq(&inode->i_mapping->i_pages); rcu_read_unlock(); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 2aa02cad94d4..92efaf1f8977 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -390,12 +391,11 @@ int pagecache_write_end(struct file *, struct address_space *mapping, struct address_space { struct inode *host; /* owner: inode, block_device */ - struct radix_tree_root page_tree; /* radix tree of all pages */ - spinlock_t tree_lock; /* and lock protecting it */ + struct radix_tree_root i_pages; /* cached pages */ atomic_t i_mmap_writable;/* count VM_SHARED mappings */ struct rb_root_cached i_mmap; /* tree of private and shared mappings */ struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ - /* Protected by tree_lock together with the radix tree */ + /* Protected by the i_pages lock */ unsigned long nrpages; /* number of total pages */ /* number of shadow or DAX exceptional entries */ unsigned long nrexceptional; @@ -1989,7 
+1989,7 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) * * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to * synchronize competing switching instances and to tell - * wb stat updates to grab mapping->tree_lock. See + * wb stat updates to grab the i_pages lock. See * inode_switch_wb_work_fn() for details. * * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper diff --git a/include/linux/mm.h b/include/linux/mm.h index f13bc25f7a9f..1ac1f06a4be6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -747,7 +747,7 @@ int finish_mkwrite_fault(struct vm_fault *vmf); * refcount. The each user mapping also has a reference to the page. * * The pagecache pages are stored in a per-mapping radix tree, which is - * rooted at mapping->page_tree, and indexed by offset. + * rooted at mapping->i_pages, and indexed by offset. * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space * lists, we instead now tag pages as dirty/writeback in the radix tree. * diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 34ce3ebf97d5..b1bd2186e6d2 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -144,7 +144,7 @@ void release_pages(struct page **pages, int nr); * 3. check the page is still in pagecache (if no, goto 1) * * Remove-side that cares about stability of _refcount (eg. reclaim) has the - * following (with tree_lock held for write): + * following (with the i_pages lock held): * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg) * B. remove page from pagecache * C. free the page @@ -157,7 +157,7 @@ void release_pages(struct page **pages, int nr); * * It is possible that between 1 and 2, the page is removed then the exact same * page is inserted into the same position in pagecache. 
That's OK: the - * old find_get_page using tree_lock could equally have run before or after + * old find_get_page using a lock could equally have run before or after * such a re-insertion, depending on order that locks are granted. * * Lookups racing against pagecache insertion isn't a big problem: either 1 diff --git a/mm/filemap.c b/mm/filemap.c index 693f62212a59..ab77e19ab09c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -66,7 +66,7 @@ * ->i_mmap_rwsem (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) - * ->mapping->tree_lock + * ->i_pages lock * * ->i_mutex * ->i_mmap_rwsem (truncate->unmap_mapping_range) @@ -74,7 +74,7 @@ * ->mmap_sem * ->i_mmap_rwsem * ->page_table_lock or pte_lock (various, mainly in memory.c) - * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) + * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_sem * ->lock_page (access_process_vm) @@ -84,7 +84,7 @@ * * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) - * ->mapping->tree_lock (__sync_single_inode) + * ->i_pages lock (__sync_single_inode) * * ->i_mmap_rwsem * ->anon_vma.lock (vma_adjust) @@ -95,11 +95,11 @@ * ->page_table_lock or pte_lock * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) - * ->tree_lock (try_to_unmap_one) + * ->i_pages lock (try_to_unmap_one) * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) - * ->tree_lock (page_remove_rmap->set_page_dirty) + * ->i_pages lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) @@ -118,14 +118,15 @@ static int page_cache_tree_insert(struct address_space *mapping, void **slot; int error; - error = __radix_tree_create(&mapping->page_tree, page->index, 0, + 
error = __radix_tree_create(&mapping->i_pages, page->index, 0, &node, &slot); if (error) return error; if (*slot) { void *p; - p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + p = radix_tree_deref_slot_protected(slot, + &mapping->i_pages.xa_lock); if (!radix_tree_exceptional_entry(p)) return -EEXIST; @@ -133,7 +134,7 @@ static int page_cache_tree_insert(struct address_space *mapping, if (shadowp) *shadowp = p; } - __radix_tree_replace(&mapping->page_tree, node, slot, page, + __radix_tree_replace(&mapping->i_pages, node, slot, page, workingset_lookup_update(mapping)); mapping->nrpages++; return 0; @@ -155,13 +156,13 @@ static void page_cache_tree_delete(struct address_space *mapping, struct radix_tree_node *node; void **slot; - __radix_tree_lookup(&mapping->page_tree, page->index + i, + __radix_tree_lookup(&mapping->i_pages, page->index + i, &node, &slot); VM_BUG_ON_PAGE(!node && nr != 1, page); - radix_tree_clear_tags(&mapping->page_tree, node, slot); - __radix_tree_replace(&mapping->page_tree, node, slot, shadow, + radix_tree_clear_tags(&mapping->i_pages, node, slot); + __radix_tree_replace(&mapping->i_pages, node, slot, shadow, workingset_lookup_update(mapping)); } @@ -253,7 +254,7 @@ static void unaccount_page_cache_page(struct address_space *mapping, /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock. + * is safe. The caller must hold the i_pages lock. 
*/ void __delete_from_page_cache(struct page *page, void *shadow) { @@ -296,9 +297,9 @@ void delete_from_page_cache(struct page *page) unsigned long flags; BUG_ON(!PageLocked(page)); - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); __delete_from_page_cache(page, NULL); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); page_cache_free_page(mapping, page); } @@ -309,14 +310,14 @@ EXPORT_SYMBOL(delete_from_page_cache); * @mapping: the mapping to which pages belong * @pvec: pagevec with pages to delete * - * The function walks over mapping->page_tree and removes pages passed in @pvec - * from the radix tree. The function expects @pvec to be sorted by page index. - * It tolerates holes in @pvec (radix tree entries at those indices are not + * The function walks over mapping->i_pages and removes pages passed in @pvec + * from the mapping. The function expects @pvec to be sorted by page index. + * It tolerates holes in @pvec (mapping entries at those indices are not * modified). The function expects only THP head pages to be present in the - * @pvec and takes care to delete all corresponding tail pages from the radix - * tree as well. + * @pvec and takes care to delete all corresponding tail pages from the + * mapping as well. * - * The function expects mapping->tree_lock to be held. + * The function expects the i_pages lock to be held. 
*/ static void page_cache_tree_delete_batch(struct address_space *mapping, @@ -330,11 +331,11 @@ page_cache_tree_delete_batch(struct address_space *mapping, pgoff_t start; start = pvec->pages[0]->index; - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (i >= pagevec_count(pvec) && !tail_pages) break; page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (radix_tree_exceptional_entry(page)) continue; if (!tail_pages) { @@ -357,8 +358,8 @@ page_cache_tree_delete_batch(struct address_space *mapping, } else { tail_pages--; } - radix_tree_clear_tags(&mapping->page_tree, iter.node, slot); - __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL, + radix_tree_clear_tags(&mapping->i_pages, iter.node, slot); + __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL, workingset_lookup_update(mapping)); total_pages++; } @@ -374,14 +375,14 @@ void delete_from_page_cache_batch(struct address_space *mapping, if (!pagevec_count(pvec)) return; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); for (i = 0; i < pagevec_count(pvec); i++) { trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); unaccount_page_cache_page(mapping, pvec->pages[i]); } page_cache_tree_delete_batch(mapping, pvec); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); for (i = 0; i < pagevec_count(pvec); i++) page_cache_free_page(mapping, pvec->pages[i]); @@ -798,7 +799,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); __delete_from_page_cache(old, NULL); error = page_cache_tree_insert(mapping, new, NULL); BUG_ON(error); @@ -810,7 +811,7 @@ int replace_page_cache_page(struct page *old, struct 
page *new, gfp_t gfp_mask) __inc_node_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(new)) __inc_node_page_state(new, NR_SHMEM); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); mem_cgroup_migrate(old, new); radix_tree_preload_end(); if (freepage) @@ -852,7 +853,7 @@ static int __add_to_page_cache_locked(struct page *page, page->mapping = mapping; page->index = offset; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); error = page_cache_tree_insert(mapping, page, shadowp); radix_tree_preload_end(); if (unlikely(error)) @@ -861,7 +862,7 @@ static int __add_to_page_cache_locked(struct page *page, /* hugetlb pages do not participate in page cache accounting. */ if (!huge) __inc_node_page_state(page, NR_FILE_PAGES); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); @@ -869,7 +870,7 @@ static int __add_to_page_cache_locked(struct page *page, err_insert: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_cancel_charge(page, memcg, false); put_page(page); @@ -1353,7 +1354,7 @@ pgoff_t page_cache_next_hole(struct address_space *mapping, for (i = 0; i < max_scan; i++) { struct page *page; - page = radix_tree_lookup(&mapping->page_tree, index); + page = radix_tree_lookup(&mapping->i_pages, index); if (!page || radix_tree_exceptional_entry(page)) break; index++; @@ -1394,7 +1395,7 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping, for (i = 0; i < max_scan; i++) { struct page *page; - page = radix_tree_lookup(&mapping->page_tree, index); + page = radix_tree_lookup(&mapping->i_pages, index); if (!page || radix_tree_exceptional_entry(page)) break; index--; @@ -1427,7 +1428,7 @@ struct page *find_get_entry(struct address_space *mapping, 
pgoff_t offset) rcu_read_lock(); repeat: page = NULL; - pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); + pagep = radix_tree_lookup_slot(&mapping->i_pages, offset); if (pagep) { page = radix_tree_deref_slot(pagep); if (unlikely(!page)) @@ -1633,7 +1634,7 @@ unsigned find_get_entries(struct address_space *mapping, return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -1710,7 +1711,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) { struct page *head, *page; if (iter.index > end) @@ -1795,7 +1796,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, return 0; rcu_read_lock(); - radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { + radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -1875,8 +1876,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, - &iter, *index, tag) { + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) { struct page *head, *page; if (iter.index > end) @@ -1969,8 +1969,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, - &iter, start, tag) { + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -2624,8 +2623,7 @@ void filemap_map_pages(struct vm_fault *vmf, struct page *head, *page; 
rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, - start_pgoff) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) { if (iter.index > end_pgoff) break; repeat: diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3f3267af4e3b..14ed6ee5e02f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2450,7 +2450,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } else { /* Additional pin to radix tree */ page_ref_add(head, 2); - spin_unlock(&head->mapping->tree_lock); + xa_unlock(&head->mapping->i_pages); } spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); @@ -2658,15 +2658,15 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mapping) { void **pslot; - spin_lock(&mapping->tree_lock); - pslot = radix_tree_lookup_slot(&mapping->page_tree, + xa_lock(&mapping->i_pages); + pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(head)); /* * Check if the head page is present in radix tree. * We assume all tail are present too, if head is there. 
*/ if (radix_tree_deref_slot_protected(pslot, - &mapping->tree_lock) != head) + &mapping->i_pages.xa_lock) != head) goto fail; } @@ -2700,7 +2700,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } spin_unlock(&pgdata->split_queue_lock); fail: if (mapping) - spin_unlock(&mapping->tree_lock); + xa_unlock(&mapping->i_pages); spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); unfreeze_page(head); ret = -EBUSY; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index eb32d0707c80..d7b2a4bf8671 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1344,8 +1344,8 @@ static void collapse_shmem(struct mm_struct *mm, */ index = start; - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + xa_lock_irq(&mapping->i_pages); + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { int n = min(iter.index, end) - index; /* @@ -1358,7 +1358,7 @@ static void collapse_shmem(struct mm_struct *mm, } nr_none += n; for (; index < min(iter.index, end); index++) { - radix_tree_insert(&mapping->page_tree, index, + radix_tree_insert(&mapping->i_pages, index, new_page + (index % HPAGE_PMD_NR)); } @@ -1367,16 +1367,16 @@ static void collapse_shmem(struct mm_struct *mm, break; page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* swap in or instantiate fallocated page */ if (shmem_getpage(mapping->host, index, &page, SGP_NOHUGE)) { result = SCAN_FAIL; goto tree_unlocked; } - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } else if (trylock_page(page)) { get_page(page); } else { @@ -1385,7 +1385,7 @@ static void collapse_shmem(struct mm_struct *mm, } /* - * The page must be locked, so we can drop the tree_lock + * The page must be locked, so we can drop the i_pages lock * without racing 
with truncate. */ VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -1396,7 +1396,7 @@ static void collapse_shmem(struct mm_struct *mm, result = SCAN_TRUNCATED; goto out_unlock; } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); if (isolate_lru_page(page)) { result = SCAN_DEL_PAGE_LRU; @@ -1406,11 +1406,11 @@ static void collapse_shmem(struct mm_struct *mm, if (page_mapped(page)) unmap_mapping_pages(mapping, index, 1, false); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); - slot = radix_tree_lookup_slot(&mapping->page_tree, index); + slot = radix_tree_lookup_slot(&mapping->i_pages, index); VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot, - &mapping->tree_lock), page); + &mapping->i_pages.xa_lock), page); VM_BUG_ON_PAGE(page_mapped(page), page); /* @@ -1431,14 +1431,14 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - radix_tree_replace_slot(&mapping->page_tree, slot, + radix_tree_replace_slot(&mapping->i_pages, slot, new_page + (index % HPAGE_PMD_NR)); slot = radix_tree_iter_resume(slot, &iter); index++; continue; out_lru: - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); putback_lru_page(page); out_isolate_failed: unlock_page(page); @@ -1464,14 +1464,14 @@ out_unlock: } for (; index < end; index++) { - radix_tree_insert(&mapping->page_tree, index, + radix_tree_insert(&mapping->i_pages, index, new_page + (index % HPAGE_PMD_NR)); } nr_none += n; } tree_locked: - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); tree_unlocked: if (result == SCAN_SUCCEED) { @@ -1520,9 +1520,8 @@ tree_unlocked: } else { /* Something went wrong: rollback changes to the radix-tree */ shmem_uncharge(mapping->host, nr_none); - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, - start) { + xa_lock_irq(&mapping->i_pages); + radix_tree_for_each_slot(slot, 
&mapping->i_pages, &iter, start) { if (iter.index >= end) break; page = list_first_entry_or_null(&pagelist, @@ -1532,8 +1531,7 @@ tree_unlocked: break; nr_none--; /* Put holes back where they were */ - radix_tree_delete(&mapping->page_tree, - iter.index); + radix_tree_delete(&mapping->i_pages, iter.index); continue; } @@ -1542,16 +1540,15 @@ tree_unlocked: /* Unfreeze the page. */ list_del(&page->lru); page_ref_unfreeze(page, 2); - radix_tree_replace_slot(&mapping->page_tree, - slot, page); + radix_tree_replace_slot(&mapping->i_pages, slot, page); slot = radix_tree_iter_resume(slot, &iter); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); putback_lru_page(page); unlock_page(page); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } VM_BUG_ON(nr_none); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* Unfreeze new_page, caller would take care about freeing it */ page_ref_unfreeze(new_page, 1); @@ -1579,7 +1576,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, swap = 0; memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (iter.index >= start + HPAGE_PMD_NR) break; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7978c6faae06..e074f7c637aa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5974,9 +5974,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) /* * Interrupts should be disabled here because the caller holds the - * mapping->tree_lock lock which is taken with interrupts-off. It is + * i_pages lock which is taken with interrupts-off. It is * important here to have the interrupts disabled because it is the - * only synchronisation we have for udpating the per-CPU variables. + * only synchronisation we have for updating the per-CPU variables. 
*/ VM_BUG_ON(!irqs_disabled()); mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), diff --git a/mm/migrate.c b/mm/migrate.c index 51b55f2d2db5..f65dd69e1fd1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -467,20 +467,21 @@ int migrate_page_move_mapping(struct address_space *mapping, oldzone = page_zone(page); newzone = page_zone(newpage); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); - pslot = radix_tree_lookup_slot(&mapping->page_tree, + pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); expected_count += 1 + page_has_private(page); if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { - spin_unlock_irq(&mapping->tree_lock); + radix_tree_deref_slot_protected(pslot, + &mapping->i_pages.xa_lock) != page) { + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } @@ -494,7 +495,7 @@ int migrate_page_move_mapping(struct address_space *mapping, if (mode == MIGRATE_ASYNC && head && !buffer_migrate_lock_buffers(head, mode)) { page_ref_unfreeze(page, expected_count); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } @@ -522,7 +523,7 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); + radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); /* * Drop cache reference from old page by unfreezing @@ -531,7 +532,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ page_ref_unfreeze(page, expected_count - 1); - spin_unlock(&mapping->tree_lock); + xa_unlock(&mapping->i_pages); /* Leave irq disabled to prevent preemption while updating stats */ /* @@ -574,20 +575,19 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, int expected_count; void **pslot; 
- spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); - pslot = radix_tree_lookup_slot(&mapping->page_tree, - page_index(page)); + pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { - spin_unlock_irq(&mapping->tree_lock); + radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } @@ -596,11 +596,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); + radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); page_ref_unfreeze(page, expected_count - 1); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return MIGRATEPAGE_SUCCESS; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 586f31261c83..5c1a3279e63f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2099,7 +2099,8 @@ void __init page_writeback_init(void) * so that it can tag pages faster than a dirtying process can create them). */ /* - * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. + * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock + * latency. 
*/ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -2109,22 +2110,22 @@ void tag_pages_for_writeback(struct address_space *mapping, struct radix_tree_iter iter; void **slot; - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, + xa_lock_irq(&mapping->i_pages); + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, PAGECACHE_TAG_DIRTY) { if (iter.index > end) break; - radix_tree_iter_tag_set(&mapping->page_tree, &iter, + radix_tree_iter_tag_set(&mapping->i_pages, &iter, PAGECACHE_TAG_TOWRITE); tagged++; if ((tagged % WRITEBACK_TAG_BATCH) != 0) continue; slot = radix_tree_iter_resume(slot, &iter); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); cond_resched(); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } EXPORT_SYMBOL(tag_pages_for_writeback); @@ -2467,13 +2468,13 @@ int __set_page_dirty_nobuffers(struct page *page) return 1; } - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, page_index(page), + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); if (mapping->host) { @@ -2718,11 +2719,10 @@ int test_clear_page_writeback(struct page *page) struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); ret = TestClearPageWriteback(page); if (ret) { - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + radix_tree_tag_clear(&mapping->i_pages, 
page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); @@ -2736,7 +2736,7 @@ int test_clear_page_writeback(struct page *page) PAGECACHE_TAG_WRITEBACK)) sb_clear_inode_writeback(mapping->host); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = TestClearPageWriteback(page); } @@ -2766,7 +2766,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); ret = TestSetPageWriteback(page); if (!ret) { bool on_wblist; @@ -2774,8 +2774,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); @@ -2789,14 +2788,12 @@ int __test_set_page_writeback(struct page *page, bool keep_write) sb_mark_inode_writeback(mapping->host); } if (!PageDirty(page)) - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); if (!keep_write) - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_TOWRITE); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = TestSetPageWriteback(page); } @@ -2816,7 +2813,7 @@ EXPORT_SYMBOL(__test_set_page_writeback); */ int mapping_tagged(struct address_space *mapping, int tag) { - return radix_tree_tagged(&mapping->page_tree, tag); + return radix_tree_tagged(&mapping->i_pages, tag); } EXPORT_SYMBOL(mapping_tagged); diff 
--git a/mm/readahead.c b/mm/readahead.c index 4d57b4644f98..539bbb6c1fad 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -175,7 +175,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, break; rcu_read_lock(); - page = radix_tree_lookup(&mapping->page_tree, page_offset); + page = radix_tree_lookup(&mapping->i_pages, page_offset); rcu_read_unlock(); if (page && !radix_tree_exceptional_entry(page)) continue; diff --git a/mm/rmap.c b/mm/rmap.c index 9122787c4947..f0dd4e4565bc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -32,11 +32,11 @@ * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) - * mapping->tree_lock (widely used) + * i_pages lock (widely used) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) - * mapping->tree_lock (widely used, in set_page_dirty, + * i_pages lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within bdi.wb->list_lock in __sync_single_inode) * diff --git a/mm/shmem.c b/mm/shmem.c index 4424fc0c33aa..9d6c7e595415 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -332,12 +332,12 @@ static int shmem_radix_tree_replace(struct address_space *mapping, VM_BUG_ON(!expected); VM_BUG_ON(!replacement); - item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot); + item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot); if (!item) return -ENOENT; if (item != expected) return -ENOENT; - __radix_tree_replace(&mapping->page_tree, node, pslot, + __radix_tree_replace(&mapping->i_pages, node, pslot, replacement, NULL); return 0; } @@ -355,7 +355,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, void *item; rcu_read_lock(); - item = radix_tree_lookup(&mapping->page_tree, index); + item = radix_tree_lookup(&mapping->i_pages, index); 
rcu_read_unlock(); return item == swp_to_radix_entry(swap); } @@ -590,14 +590,14 @@ static int shmem_add_to_page_cache(struct page *page, page->mapping = mapping; page->index = index; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); if (PageTransHuge(page)) { void __rcu **results; pgoff_t idx; int i; error = 0; - if (radix_tree_gang_lookup_slot(&mapping->page_tree, + if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, index, 1) && idx < index + HPAGE_PMD_NR) { error = -EEXIST; @@ -605,14 +605,14 @@ static int shmem_add_to_page_cache(struct page *page, if (!error) { for (i = 0; i < HPAGE_PMD_NR; i++) { - error = radix_tree_insert(&mapping->page_tree, + error = radix_tree_insert(&mapping->i_pages, index + i, page + i); VM_BUG_ON(error); } count_vm_event(THP_FILE_ALLOC); } } else if (!expected) { - error = radix_tree_insert(&mapping->page_tree, index, page); + error = radix_tree_insert(&mapping->i_pages, index, page); } else { error = shmem_radix_tree_replace(mapping, index, expected, page); @@ -624,10 +624,10 @@ static int shmem_add_to_page_cache(struct page *page, __inc_node_page_state(page, NR_SHMEM_THPS); __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } else { page->mapping = NULL; - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); page_ref_sub(page, nr); } return error; @@ -643,13 +643,13 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) VM_BUG_ON_PAGE(PageCompound(page), page); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); error = shmem_radix_tree_replace(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; __dec_node_page_state(page, NR_FILE_PAGES); __dec_node_page_state(page, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); put_page(page); 
BUG_ON(error); } @@ -662,9 +662,9 @@ static int shmem_free_swap(struct address_space *mapping, { void *old; - spin_lock_irq(&mapping->tree_lock); - old = radix_tree_delete_item(&mapping->page_tree, index, radswap); - spin_unlock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); + old = radix_tree_delete_item(&mapping->i_pages, index, radswap); + xa_unlock_irq(&mapping->i_pages); if (old != radswap) return -ENOENT; free_swap_and_cache(radix_to_swp_entry(radswap)); @@ -675,7 +675,7 @@ static int shmem_free_swap(struct address_space *mapping, * Determine (in bytes) how many of the shmem object's pages mapped by the * given offsets are swapped out. * - * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * This is safe to call without i_mutex or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_partial_swap_usage(struct address_space *mapping, @@ -688,7 +688,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (iter.index >= end) break; @@ -717,7 +717,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, * Determine (in bytes) how many of the shmem object's pages mapped by the * given vma is swapped out. * - * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * This is safe to call without i_mutex or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. 
*/ unsigned long shmem_swap_usage(struct vm_area_struct *vma) @@ -1132,7 +1132,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, int error = 0; radswap = swp_to_radix_entry(swap); - index = find_swap_entry(&mapping->page_tree, radswap); + index = find_swap_entry(&mapping->i_pages, radswap); if (index == -1) return -EAGAIN; /* tell shmem_unuse we found nothing */ @@ -1448,7 +1448,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, hindex = round_down(index, HPAGE_PMD_NR); rcu_read_lock(); - if (radix_tree_gang_lookup_slot(&mapping->page_tree, &results, &idx, + if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, hindex, 1) && idx < hindex + HPAGE_PMD_NR) { rcu_read_unlock(); return NULL; @@ -1561,14 +1561,14 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * Our caller will very soon move newpage out of swapcache, but it's * a nice clean interface for us to replace oldpage by newpage there. */ - spin_lock_irq(&swap_mapping->tree_lock); + xa_lock_irq(&swap_mapping->i_pages); error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, newpage); if (!error) { __inc_node_page_state(newpage, NR_FILE_PAGES); __dec_node_page_state(oldpage, NR_FILE_PAGES); } - spin_unlock_irq(&swap_mapping->tree_lock); + xa_unlock_irq(&swap_mapping->i_pages); if (unlikely(error)) { /* @@ -2634,7 +2634,7 @@ static void shmem_tag_pins(struct address_space *mapping) start = 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { page = radix_tree_deref_slot(slot); if (!page || radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { @@ -2642,10 +2642,10 @@ static void shmem_tag_pins(struct address_space *mapping) continue; } } else if (page_count(page) - page_mapcount(page) > 1) { - spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_set(&mapping->page_tree, iter.index, + xa_lock_irq(&mapping->i_pages); + 
radix_tree_tag_set(&mapping->i_pages, iter.index, SHMEM_TAG_PINNED); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } if (need_resched()) { @@ -2677,7 +2677,7 @@ static int shmem_wait_for_pins(struct address_space *mapping) error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED)) + if (!radix_tree_tagged(&mapping->i_pages, SHMEM_TAG_PINNED)) break; if (!scan) @@ -2687,7 +2687,7 @@ static int shmem_wait_for_pins(struct address_space *mapping) start = 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, SHMEM_TAG_PINNED) { page = radix_tree_deref_slot(slot); @@ -2713,10 +2713,10 @@ static int shmem_wait_for_pins(struct address_space *mapping) error = -EBUSY; } - spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_clear(&mapping->page_tree, + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_clear(&mapping->i_pages, iter.index, SHMEM_TAG_PINNED); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); continue_resched: if (need_resched()) { slot = radix_tree_iter_resume(slot, &iter); diff --git a/mm/swap_state.c b/mm/swap_state.c index f233dccd3b1b..07f9aa2340c3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -124,10 +124,10 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) SetPageSwapCache(page); address_space = swap_address_space(entry); - spin_lock_irq(&address_space->tree_lock); + xa_lock_irq(&address_space->i_pages); for (i = 0; i < nr; i++) { set_page_private(page + i, entry.val + i); - error = radix_tree_insert(&address_space->page_tree, + error = radix_tree_insert(&address_space->i_pages, idx + i, page + i); if (unlikely(error)) break; @@ -145,13 +145,13 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) VM_BUG_ON(error == -EEXIST); set_page_private(page + i, 0UL); while (i--) { - 
radix_tree_delete(&address_space->page_tree, idx + i); + radix_tree_delete(&address_space->i_pages, idx + i); set_page_private(page + i, 0UL); } ClearPageSwapCache(page); page_ref_sub(page, nr); } - spin_unlock_irq(&address_space->tree_lock); + xa_unlock_irq(&address_space->i_pages); return error; } @@ -188,7 +188,7 @@ void __delete_from_swap_cache(struct page *page) address_space = swap_address_space(entry); idx = swp_offset(entry); for (i = 0; i < nr; i++) { - radix_tree_delete(&address_space->page_tree, idx + i); + radix_tree_delete(&address_space->i_pages, idx + i); set_page_private(page + i, 0); } ClearPageSwapCache(page); @@ -272,9 +272,9 @@ void delete_from_swap_cache(struct page *page) entry.val = page_private(page); address_space = swap_address_space(entry); - spin_lock_irq(&address_space->tree_lock); + xa_lock_irq(&address_space->i_pages); __delete_from_swap_cache(page); - spin_unlock_irq(&address_space->tree_lock); + xa_unlock_irq(&address_space->i_pages); put_swap_page(page, entry); page_ref_sub(page, hpage_nr_pages(page)); @@ -628,12 +628,11 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) return -ENOMEM; for (i = 0; i < nr; i++) { space = spaces + i; - INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN); + INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN); atomic_set(&space->i_mmap_writable, 0); space->a_ops = &swap_aops; /* swap cache doesn't use writeback related tags */ mapping_set_no_writeback_tags(space); - spin_lock_init(&space->tree_lock); } nr_swapper_spaces[type] = nr; rcu_assign_pointer(swapper_spaces[type], spaces); diff --git a/mm/truncate.c b/mm/truncate.c index c34e2fd4f583..1d2fb2dca96f 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -36,11 +36,11 @@ static inline void __clear_shadow_entry(struct address_space *mapping, struct radix_tree_node *node; void **slot; - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) + if (!__radix_tree_lookup(&mapping->i_pages, index, &node, 
&slot)) return; if (*slot != entry) return; - __radix_tree_replace(&mapping->page_tree, node, slot, NULL, + __radix_tree_replace(&mapping->i_pages, node, slot, NULL, workingset_update_node); mapping->nrexceptional--; } @@ -48,9 +48,9 @@ static inline void __clear_shadow_entry(struct address_space *mapping, static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, void *entry) { - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); __clear_shadow_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } /* @@ -79,7 +79,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, dax = dax_mapping(mapping); lock = !dax && indices[j] < end; if (lock) - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); for (i = j; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -102,7 +102,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, } if (lock) - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); pvec->nr = j; } @@ -518,8 +518,8 @@ void truncate_inode_pages_final(struct address_space *mapping) * modification that does not see AS_EXITING is * completed before starting the final truncate. 
*/ - spin_lock_irq(&mapping->tree_lock); - spin_unlock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); + xa_unlock_irq(&mapping->i_pages); truncate_inode_pages(mapping, 0); } @@ -627,13 +627,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); if (PageDirty(page)) goto failed; BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -641,7 +641,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) put_page(page); /* pagecache ref */ return 1; failed: - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 671597ce1ea0..8b920ce3ae02 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -693,7 +693,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); /* * The non racy check for a busy page. * @@ -717,7 +717,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * load is not satisfied before that of page->_refcount. * * Note that if SetPageDirty is always performed via set_page_dirty, - * and thus under tree_lock, then this ordering is not required. + * and thus under the i_pages lock, then this ordering is not required. 
*/ if (unlikely(PageTransHuge(page)) && PageSwapCache(page)) refcount = 1 + HPAGE_PMD_NR; @@ -735,7 +735,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); } else { void (*freepage)(struct page *); @@ -756,13 +756,13 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * only page cache pages found in these are zero pages * covering holes, and because we don't want to mix DAX * exceptional entries and shadow exceptional entries in the - * same page_tree. + * same address_space. */ if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); if (freepage != NULL) freepage(page); @@ -771,7 +771,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, return 1; cannot_free: - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); return 0; } diff --git a/mm/workingset.c b/mm/workingset.c index b7d616a3bbbe..40ee02c83978 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -202,7 +202,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, * @mapping: address space the page was backing * @page: the page being evicted * - * Returns a shadow entry to be stored in @mapping->page_tree in place + * Returns a shadow entry to be stored in @mapping->i_pages in place * of the evicted @page so that a later refault can be detected. 
*/ void *workingset_eviction(struct address_space *mapping, struct page *page) @@ -348,7 +348,7 @@ void workingset_update_node(struct radix_tree_node *node) * * Avoid acquiring the list_lru lock when the nodes are * already where they should be. The list_empty() test is safe - * as node->private_list is protected by &mapping->tree_lock. + * as node->private_list is protected by the i_pages lock. */ if (node->count && node->count == node->exceptional) { if (list_empty(&node->private_list)) @@ -366,7 +366,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, unsigned long nodes; unsigned long cache; - /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + /* list_lru lock nests inside the IRQ-safe i_pages lock */ local_irq_disable(); nodes = list_lru_shrink_count(&shadow_nodes, sc); local_irq_enable(); @@ -419,21 +419,21 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, /* * Page cache insertions and deletions synchroneously maintain - * the shadow node LRU under the mapping->tree_lock and the + * the shadow node LRU under the i_pages lock and the * lru_lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the lru_lock pins any * address_space that has radix tree nodes on the LRU. * - * We can then safely transition to the mapping->tree_lock to + * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock. 
*/ node = container_of(item, struct radix_tree_node, private_list); - mapping = container_of(node->root, struct address_space, page_tree); + mapping = container_of(node->root, struct address_space, i_pages); /* Coming from the list, invert the lock order */ - if (!spin_trylock(&mapping->tree_lock)) { + if (!xa_trylock(&mapping->i_pages)) { spin_unlock(lru_lock); ret = LRU_RETRY; goto out; @@ -468,11 +468,11 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); - __radix_tree_delete_node(&mapping->page_tree, node, + __radix_tree_delete_node(&mapping->i_pages, node, workingset_lookup_update(mapping)); out_invalid: - spin_unlock(&mapping->tree_lock); + xa_unlock(&mapping->i_pages); ret = LRU_REMOVED_RETRY; out: local_irq_enable(); @@ -487,7 +487,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, { unsigned long ret; - /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + /* list_lru lock nests inside the IRQ-safe i_pages lock */ local_irq_disable(); ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL); local_irq_enable(); @@ -503,7 +503,7 @@ static struct shrinker workingset_shadow_shrinker = { /* * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe - * mapping->tree_lock. + * i_pages lock. */ static struct lock_class_key shadow_nodes_key;