From fae1383b38a105a0454acab19b094c510728fde5 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Jan 2019 18:57:20 +0900 Subject: [PATCH 001/200] powerpc: use a CONSOLE_LOGLEVEL_DEBUG macro Use a CONSOLE_LOGLEVEL_DEBUG macro for console_loglevel rather than a naked number. Signed-off-by: Sergey Senozhatsky Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/udbg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index 7cc38b5b58bc..8db4891acdaf 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -74,7 +74,7 @@ void __init udbg_early_init(void) #endif #ifdef CONFIG_PPC_EARLY_DEBUG - console_loglevel = 10; + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; register_early_udbg_console(); #endif From 3b702ddd066813952154c22dd76d3b0c10644940 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 4 Jan 2019 22:31:52 +0100 Subject: [PATCH 002/200] powerpc/hvsi: Fix spelling mistake: "lenght" should be "length" Signed-off-by: Matteo Croce Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/hvsi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/hvsi.h b/arch/powerpc/include/asm/hvsi.h index 3fdc54df63c9..464a7519ed64 100644 --- a/arch/powerpc/include/asm/hvsi.h +++ b/arch/powerpc/include/asm/hvsi.h @@ -64,7 +64,7 @@ struct hvsi_priv { unsigned int inbuf_len; /* data in input buffer */ unsigned char inbuf[HVSI_INBUF_SIZE]; unsigned int inbuf_cur; /* Cursor in input buffer */ - unsigned int inbuf_pktlen; /* packet lenght from cursor */ + unsigned int inbuf_pktlen; /* packet length from cursor */ atomic_t seqno; /* packet sequence number */ unsigned int opened:1; /* driver opened */ unsigned int established:1; /* protocol established */ From 31367b9a01d6a3f4f77694bd44f547d6f738ff28 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 8 Jan 2019 15:00:10 -0600 Subject: [PATCH 003/200] powerpc/ps3: Use struct_size() in kzalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; void *entry[]; }; instance = kzalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/ps3/device-init.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c index e7075aaff1bb..59587b75493d 100644 --- a/arch/powerpc/platforms/ps3/device-init.c +++ b/arch/powerpc/platforms/ps3/device-init.c @@ -354,9 +354,7 @@ static int ps3_setup_storage_dev(const struct ps3_repository_device *repo, repo->dev_index, repo->dev_type, port, blk_size, num_blocks, num_regions); - p = kzalloc(sizeof(struct ps3_storage_device) + - num_regions * sizeof(struct ps3_storage_region), - GFP_KERNEL); + p = kzalloc(struct_size(p, regions, num_regions), GFP_KERNEL); if (!p) { result = -ENOMEM; goto fail_malloc; From 607ea5090b3fb61fea1d0bc5278e6c1d40ab5bd6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 8 Jan 2019 11:37:19 +0000 Subject: [PATCH 004/200] powerpc/irq: drop arch_early_irq_init() arch_early_irq_init() does nothing different than the weak arch_early_irq_init() in kernel/softirq.c Fixes: 089fb442f301 ("powerpc: Use ARCH_IRQ_INIT_FLAGS") Signed-off-by: Christophe Leroy Acked-by: Thomas Gleixner Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/irq.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 916ddc4aac44..bb299613a462 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -827,11 +827,6 @@ int irq_choose_cpu(const struct cpumask *mask) } #endif -int arch_early_irq_init(void) -{ - return 0; -} - #ifdef CONFIG_PPC64 static int __init setup_noirqdistrib(char *str) { From 7cd4774ff7a495839f49a582be7299598eeffc37 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 9 Jan 2019 12:10:58 +0000 Subject: [PATCH 005/200] powerpc/mm: Fix debugfs_simple_attr.cocci warnings Use DEFINE_DEBUGFS_ATTRIBUTE rather than DEFINE_SIMPLE_ATTRIBUTE for debugfs files. Generated by: scripts/coccinelle/api/debugfs/debugfs_simple_attr.cocci Signed-off-by: YueHaibing Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hash_utils_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 0cc7fbc3bd1c..4aa0797000f7 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1889,12 +1889,12 @@ static int hpt_order_set(void *data, u64 val) return mmu_hash_ops.resize_hpt(val); } -DEFINE_SIMPLE_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); static int __init hash64_debugfs(void) { - if (!debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root, - NULL, &fops_hpt_order)) { + if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root, + NULL, &fops_hpt_order)) { pr_err("lpar: unable to create hpt_order debugsfs file\n"); } From c142e9741e61577c45f2441214c999f25857bdd1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 11 Jan 2019 12:22:31 +0900 Subject: [PATCH 006/200] KVM: powerpc: remove -I. header search paths The header search path -I. in kernel Makefiles is very suspicious; it allows the compiler to search for headers in the top of $(srctree), where obviously no header file exists. Commit 46f43c6ee022 ("KVM: powerpc: convert marker probes to event trace") first added these options, but they are completely useless. 
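To illustrate (a sketch, assuming the usual kbuild behaviour of invoking the compiler from the top of the build tree): with a per-object entry such as

    CFLAGS_e500_mmu.o := -I.

the "." resolves to the top of the tree, not to arch/powerpc/kvm, and a quoted include like #include "trace.h" is in any case resolved relative to the directory of the including file first, so the option buys nothing.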
Signed-off-by: Masahiro Yamada Signed-off-by: Michael Ellerman --- arch/powerpc/kvm/Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 64f1135e7732..3223aec88b2c 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -10,11 +10,6 @@ common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o -CFLAGS_e500_mmu.o := -I. -CFLAGS_e500_mmu_host.o := -I. -CFLAGS_emulate.o := -I. -CFLAGS_emulate_loadstore.o := -I. - common-objs-y += powerpc.o emulate_loadstore.o obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o From b00899b8957800e2ac1af2cf6254abf3ec9fad85 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 11 Jan 2019 12:22:32 +0900 Subject: [PATCH 007/200] powerpc: remove redundant header search path additions The same path -Iarch/$(ARCH) is passed to KBUILD_CPPFLAGS, KBUILD_AFLAGS, and KBUILD_CFLAGS. As you see in scripts/Makefile.lib, KBUILD_CPPFLAGS is passed to c_flags and a_flags as well. Passing it to KBUILD_CPPFLAGS is enough. Signed-off-by: Masahiro Yamada Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 488c9edffa58..ac033341ed55 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -213,9 +213,9 @@ endif asinstr := $(call as-instr,lis 9$(comma)foo@high,-DHAVE_AS_ATHIGH=1) KBUILD_CPPFLAGS += -Iarch/$(ARCH) $(asinstr) -KBUILD_AFLAGS += -Iarch/$(ARCH) $(AFLAGS-y) +KBUILD_AFLAGS += $(AFLAGS-y) KBUILD_CFLAGS += $(call cc-option,-msoft-float) -KBUILD_CFLAGS += -pipe -Iarch/$(ARCH) $(CFLAGS-y) +KBUILD_CFLAGS += -pipe $(CFLAGS-y) CPP = $(CC) -E $(KBUILD_CFLAGS) CHECKFLAGS += -m$(BITS) -D__powerpc__ -D__powerpc$(BITS)__ From fbe3ab014f37f67766e6cf5b0ce79d5e4197c536 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 11 Jan 2019 12:22:33 +0900 Subject: [PATCH 008/200] powerpc: math-emu: remove unneeded header search paths The header search path -I. in kernel Makefiles is very suspicious; it allows the compiler to search for headers in the top of $(srctree), where obviously no header file exists. -Iinclude/math-emu seems unnecessary because all files include headers in the form of #include <math-emu/*.h>. I was able to build without these header search paths. Signed-off-by: Masahiro Yamada Signed-off-by: Michael Ellerman --- arch/powerpc/math-emu/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/math-emu/Makefile b/arch/powerpc/math-emu/Makefile index 494df26c5988..a8794032f15f 100644 --- a/arch/powerpc/math-emu/Makefile +++ b/arch/powerpc/math-emu/Makefile @@ -17,4 +17,4 @@ obj-$(CONFIG_SPE) += math_efp.o CFLAGS_fabs.o = -fno-builtin-fabs CFLAGS_math.o = -fno-builtin-fabs -ccflags-y = -I. -Iinclude/math-emu -w +ccflags-y = -w From 00def7130af8b3fad1bdef98429c94a67dbbd896 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 8 Jan 2019 12:37:20 -0600 Subject: [PATCH 009/200] powerpc/spufs: use struct_size() in kmalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array.
For example: struct foo { int stuff; void *entry[]; }; instance = kmalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kmalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/cell/spufs/file.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index ae8123edddc6..48c2477e7e2a 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -2338,9 +2338,8 @@ static int spufs_switch_log_open(struct inode *inode, struct file *file) goto out; } - ctx->switch_log = kmalloc(sizeof(struct switch_log) + - SWITCH_LOG_BUFSIZE * sizeof(struct switch_log_entry), - GFP_KERNEL); + ctx->switch_log = kmalloc(struct_size(ctx->switch_log, log, + SWITCH_LOG_BUFSIZE), GFP_KERNEL); if (!ctx->switch_log) { rc = -ENOMEM; From 8acb88682cc00a41a677c2455a7c992d78e43035 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 8 Jan 2019 15:08:00 +0000 Subject: [PATCH 010/200] powerpc/ipic: drop unused functions ipic_set_highest_priority(), ipic_enable_mcp() and ipic_disable_mcp() are unused. This patch drops them. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/ipic.h | 3 --- arch/powerpc/sysdev/ipic.c | 35 --------------------------------- 2 files changed, 38 deletions(-) diff --git a/arch/powerpc/include/asm/ipic.h b/arch/powerpc/include/asm/ipic.h index 3dbd47f2bffe..abad50a745db 100644 --- a/arch/powerpc/include/asm/ipic.h +++ b/arch/powerpc/include/asm/ipic.h @@ -69,10 +69,7 @@ enum ipic_mcp_irq { IPIC_MCP_MU = 7, }; -extern void ipic_set_highest_priority(unsigned int irq); extern void ipic_set_default_priority(void); -extern void ipic_enable_mcp(enum ipic_mcp_irq mcp_irq); -extern void ipic_disable_mcp(enum ipic_mcp_irq mcp_irq); extern u32 ipic_get_mcp_status(void); extern void ipic_clear_mcp_status(u32 mask); diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 8030a0f55e96..fd129c8ecceb 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -771,21 +771,6 @@ struct ipic * __init ipic_init(struct device_node *node, unsigned int flags) return ipic; } -void ipic_set_highest_priority(unsigned int virq) -{ - struct ipic *ipic = ipic_from_irq(virq); - unsigned int src = virq_to_hw(virq); - u32 temp; - - temp = ipic_read(ipic->regs, IPIC_SICFR); - - /* clear and set HPI */ - temp &= 0x7f000000; - temp |= (src & 0x7f) << 24; - - ipic_write(ipic->regs, IPIC_SICFR, temp); -} - void ipic_set_default_priority(void) { ipic_write(primary_ipic->regs, IPIC_SIPRR_A, IPIC_PRIORITY_DEFAULT); @@ -796,26 +781,6 @@ void ipic_set_default_priority(void) ipic_write(primary_ipic->regs, IPIC_SMPRR_B, IPIC_PRIORITY_DEFAULT); } -void ipic_enable_mcp(enum ipic_mcp_irq mcp_irq) -{ - struct ipic *ipic = primary_ipic; - u32 temp; - - temp = ipic_read(ipic->regs, IPIC_SERMR); - temp |= (1 << (31 - mcp_irq)); - ipic_write(ipic->regs, IPIC_SERMR, temp); -} - -void ipic_disable_mcp(enum ipic_mcp_irq mcp_irq) -{ - struct ipic *ipic = primary_ipic; - u32 temp; - - temp = ipic_read(ipic->regs, IPIC_SERMR); - temp &= (1 << (31 - mcp_irq)); - ipic_write(ipic->regs, IPIC_SERMR, temp); -} - u32 ipic_get_mcp_status(void) { return primary_ipic ? 
ipic_read(primary_ipic->regs, IPIC_SERSR) : 0; From acef5e0165912c459a9ae98a25f0f87908ced0f9 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 8 Jan 2019 12:52:50 +0000 Subject: [PATCH 011/200] powerpc/dts: Build virtex dtbs I wanted to test the virtex440-ml507 qemu machine and found that the dtb for it was not built. All powerpc dtbs are only built when CONFIG_OF_ALL_DTBS is set, which depends on COMPILE_TEST. This patch enables building of the virtex dtbs when CONFIG_XILINX_VIRTEX440_GENERIC_BOARD is enabled. Signed-off-by: Corentin Labbe [mpe: Put both targets on a single line] Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/boot/dts/Makefile b/arch/powerpc/boot/dts/Makefile index fb335d05aae8..1cbc0e4ce857 100644 --- a/arch/powerpc/boot/dts/Makefile +++ b/arch/powerpc/boot/dts/Makefile @@ -4,3 +4,4 @@ subdir-y += fsl dtstree := $(srctree)/$(src) dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard $(dtstree)/*.dts)) +dtb-$(CONFIG_XILINX_VIRTEX440_GENERIC_BOARD) += virtex440-ml507.dtb virtex440-ml510.dtb From a652758ac1475f69d28d11b3528c4f489416c877 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Sat, 12 Jan 2019 10:50:56 +1100 Subject: [PATCH 012/200] powerpc: Use ALIGN instead of BLOCK In the ld documentation under Builtin Functions: BLOCK(exp) This is a synonym for ALIGN, for compatibility with older linker scripts. Clang's linker (lld) doesn't know about BLOCK so remove this use of it. Suggested-by: George Rimar Signed-off-by: Joel Stanley Reviewed-by: Nick Desaulniers Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/vmlinux.lds.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index ad1c77f71f54..3ae4c959f95b 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -86,11 +86,11 @@ SECTIONS #ifdef CONFIG_PPC64 /* - * BLOCK(0) overrides the default output section alignment because + * ALIGN(0) overrides the default output section alignment because * this needs to start right after .head.text in order for fixed * section placement to work. */ - .text BLOCK(0) : AT(ADDR(.text) - LOAD_OFFSET) { + .text ALIGN(0) : AT(ADDR(.text) - LOAD_OFFSET) { #ifdef CONFIG_LD_HEAD_STUB_CATCH KEEP(*(.linker_stub_catch)); . = . ; From cd6b8a631c5de3a6b7c8ef30337fd02bd8210a44 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 14 Jan 2019 11:38:49 +1100 Subject: [PATCH 013/200] powerpc/mm: Fix compile when CONFIG_PPC_RADIX_MMU is not defined This adds some stubs for hash-only configs.
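The stubs follow the usual pattern for compile-time-disabled features; a minimal sketch of the shape used in the diff below:

    #ifdef CONFIG_PPC_RADIX_MMU
    extern void radix__flush_pwc_lpid(unsigned int lpid);
    #else
    static inline void radix__flush_pwc_lpid(unsigned int lpid)
    {
    	WARN_ON(1);	/* calling a radix helper on a hash-only config is a bug */
    }
    #endif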
Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- .../include/asm/book3s/64/tlbflush-radix.h | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h index 671316f9e95d..05147cecb8df 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -13,8 +13,32 @@ static inline int mmu_get_ap(int psize) #ifdef CONFIG_PPC_RADIX_MMU extern void radix__tlbiel_all(unsigned int action); +extern void radix__flush_tlb_lpid_page(unsigned int lpid, + unsigned long addr, + unsigned long page_size); +extern void radix__flush_pwc_lpid(unsigned int lpid); +extern void radix__flush_tlb_lpid(unsigned int lpid); +extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid); #else static inline void radix__tlbiel_all(unsigned int action) { WARN_ON(1); }; +static inline void radix__flush_tlb_lpid_page(unsigned int lpid, + unsigned long addr, + unsigned long page_size) +{ + WARN_ON(1); +} +static inline void radix__flush_pwc_lpid(unsigned int lpid) +{ + WARN_ON(1); +} +static inline void radix__flush_tlb_lpid(unsigned int lpid) +{ + WARN_ON(1); +} +static inline void radix__local_flush_tlb_lpid_guest(unsigned int lpid) +{ + WARN_ON(1); +} #endif extern void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, @@ -49,12 +73,6 @@ extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr); extern void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr); extern void radix__flush_tlb_all(void); -extern void radix__flush_tlb_lpid_page(unsigned int lpid, - unsigned long addr, - unsigned long page_size); -extern void radix__flush_pwc_lpid(unsigned int lpid); -extern void radix__flush_tlb_lpid(unsigned int lpid); extern void radix__local_flush_tlb_lpid(unsigned int lpid); -extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid); #endif From c35f78d7a422750917029d20d9e57000b1181d75 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 14 Jan 2019 11:40:27 +1100 Subject: [PATCH 014/200] powerpc/powernv: Remove never used pnv_power9_force_smt4 This removes the never used symbol pnv_power9_force_smt4. Note that we might still want to add stubs for: void pnv_power9_force_smt4_catch(void); void pnv_power9_force_smt4_release(void); Fixes: 7672691a08c88 "powerpc/powernv: Provide a way to force a core into SMT4 mode" Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/powernv.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h index 2f3ff7a27881..362ea12a4501 100644 --- a/arch/powerpc/include/asm/powernv.h +++ b/arch/powerpc/include/asm/powernv.h @@ -40,7 +40,6 @@ static inline int pnv_npu2_handle_fault(struct npu_context *context, } static inline void pnv_tm_init(void) { } -static inline void pnv_power9_force_smt4(void) { } #endif #endif /* _ASM_POWERNV_H */ From 797eadd9c80ca3b3f913ccde29f8a6015f9974f9 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 14 Jan 2019 11:41:38 +1100 Subject: [PATCH 015/200] powerpc/powernv/npu: Remove obsolete comment about TCE_KILL_INVAL_ALL TCE_KILL_INVAL_ALL was moved long ago, but the comment was forgotten, so finish the move and remove the comment.
Fixes: 0bbcdb437da0c4a "powerpc/powernv/npu: TCE Kill helpers cleanup" Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/npu-dma.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index d7f742ed48ba..2ca79823e3ba 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -28,10 +28,6 @@ */ static DEFINE_SPINLOCK(npu_context_lock); -/* - * Other types of TCE cache invalidation are not functional in the - * hardware. - */ static struct pci_dev *get_pci_dev(struct device_node *dn) { struct pci_dn *pdn = PCI_DN(dn); From f4ddc19a711736eb54fb8259499faf5af0f72545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Sat, 12 Jan 2019 17:21:23 +0100 Subject: [PATCH 016/200] powerpc: wii.dts: Add interrupt-related properties to GPIO node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Hollywood GPIO controller is connected to the Hollywood PIC (&PIC1) at IRQs 10 and 11; IRQ 10 for GPIO lines that are configured for access by the PPC, 11 for GPIO lines that are configured for access by the ARM926. Signed-off-by: Jonathan Neuschäfer Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/wii.dts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/boot/dts/wii.dts b/arch/powerpc/boot/dts/wii.dts index 104b1d6d5695..caf735fa11b7 100644 --- a/arch/powerpc/boot/dts/wii.dts +++ b/arch/powerpc/boot/dts/wii.dts @@ -187,6 +187,11 @@ "DEBUG0", "DEBUG1", "DEBUG2", "DEBUG3", "DEBUG4", "DEBUG5", "DEBUG6", "DEBUG7"; + interrupt-controller; + #interrupt-cells = <2>; + interrupts = <10>; + interrupt-parent = <&PIC1>; + /* * This is commented out while a standard binding * for i2c over gpio is defined. From 8de7547e03059281fda075355c1146941fbbe76f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Sat, 12 Jan 2019 17:21:24 +0100 Subject: [PATCH 017/200] powerpc: wii.dts: Add GPIO keys MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Wii has POWER and EJECT buttons, which are connected through normalization logic to the GPIO controller (the length of an assertion of these signals is always the same, regardless of how long the user pressed the buttons). Signed-off-by: Jonathan Neuschäfer Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/wii.dts | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/powerpc/boot/dts/wii.dts b/arch/powerpc/boot/dts/wii.dts index caf735fa11b7..c406bdb4f36f 100644 --- a/arch/powerpc/boot/dts/wii.dts +++ b/arch/powerpc/boot/dts/wii.dts @@ -14,6 +14,7 @@ /dts-v1/; #include <dt-bindings/gpio/gpio.h> +#include <dt-bindings/input/input.h> /* * This is commented-out for now. @@ -240,5 +241,21 @@ panic-indicator; }; }; + + gpio-keys { + compatible = "gpio-keys"; + + power { + label = "Power Button"; + gpios = <&GPIO 0 GPIO_ACTIVE_HIGH>; + linux,code = <KEY_POWER>; + }; + + eject { + label = "Eject Button"; + gpios = <&GPIO 6 GPIO_ACTIVE_HIGH>; + linux,code = <KEY_EJECTCD>; + }; + }; }; From a65329aa7d613288626275546074f1aae5a04965 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 8 Jan 2019 09:31:21 -0200 Subject: [PATCH 018/200] selftests/powerpc: New TM signal self test A new self test that forces MSR[TS] to be set without calling any TM instruction.
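The core of the trick, as implemented in the new test added below, is to flip the transaction-suspended bit directly in the MSR image that the kernel hands to an ordinary signal handler:

    /* in usr_signal_handler(), see the full test below */
    UCONTEXT_MSR(ucp) |= MSR_TS_S;	/* kernel will recheckpoint on sigreturn */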
This test also tries to cause a page fault at a signal handler, exactly between MSR[TS] set and tm_recheckpoint(), forcing thread->texasr to be rewritten with TEXASR[FS] = 0, which will cause a BUG when tm_recheckpoint() is called. This test is not deterministic, since it is hard to guarantee that the page access will cause a page fault. In order to force more page faults at signal context, the signal handler and the ucontext are being mapped into a MADV_DONTNEED memory chunks. Tests have shown that the bug could be exposed with few interactions in a buggy kernel. This test is configured to loop 5000x, having a good chance to hit the kernel issue in just one run. This self test takes less than two seconds to run. This test uses set/getcontext because the kernel will recheckpoint zeroed structures, causing the test to segfault, which is undesired because the test needs to rerun, so, there is a signal handler for SIGSEGV which will restart the test. v2: Uses the MADV_DONTNEED memory advice v3: Fix memcpy and 32-bits compilation v4: Does not define unused macros Signed-off-by: Breno Leitao Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/include/reg.h | 8 + .../testing/selftests/powerpc/include/utils.h | 2 + tools/testing/selftests/powerpc/tm/.gitignore | 1 + tools/testing/selftests/powerpc/tm/Makefile | 4 +- .../powerpc/tm/tm-signal-context-force-tm.c | 184 ++++++++++++++++++ 5 files changed, 198 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c diff --git a/tools/testing/selftests/powerpc/include/reg.h b/tools/testing/selftests/powerpc/include/reg.h index 52b4710469d2..96043b9b9829 100644 --- a/tools/testing/selftests/powerpc/include/reg.h +++ b/tools/testing/selftests/powerpc/include/reg.h @@ -77,6 +77,14 @@ #define TEXASR_TE 0x0000000004000000 #define TEXASR_ROT 0x0000000002000000 +/* MSR register bits */ +#define MSR_TS_S_LG 33 /* Trans Mem state: Suspended */ + +#define __MASK(X) (1UL<<(X)) + +/* macro to check TM MSR bits */ +#define MSR_TS_S __MASK(MSR_TS_S_LG) /* Transaction Suspended */ + /* Vector Instructions */ #define VSX_XX1(xs, ra, rb) (((xs) & 0x1f) << 21 | ((ra) << 16) | \ ((rb) << 11) | (((xs) >> 5))) diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h index ae43a614835d..7636bf45d5d5 100644 --- a/tools/testing/selftests/powerpc/include/utils.h +++ b/tools/testing/selftests/powerpc/include/utils.h @@ -102,8 +102,10 @@ do { \ #if defined(__powerpc64__) #define UCONTEXT_NIA(UC) (UC)->uc_mcontext.gp_regs[PT_NIP] +#define UCONTEXT_MSR(UC) (UC)->uc_mcontext.gp_regs[PT_MSR] #elif defined(__powerpc__) #define UCONTEXT_NIA(UC) (UC)->uc_mcontext.uc_regs->gregs[PT_NIP] +#define UCONTEXT_MSR(UC) (UC)->uc_mcontext.uc_regs->gregs[PT_MSR] #else #error implement UCONTEXT_NIA #endif diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore index 208452a93e2c..951fe855f7cd 100644 --- a/tools/testing/selftests/powerpc/tm/.gitignore +++ b/tools/testing/selftests/powerpc/tm/.gitignore @@ -11,6 +11,7 @@ tm-signal-context-chk-fpu tm-signal-context-chk-gpr tm-signal-context-chk-vmx tm-signal-context-chk-vsx +tm-signal-context-force-tm tm-signal-sigreturn-nt tm-vmx-unavail tm-unavailable diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index 75a685359129..c0734ed0ef56 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ 
b/tools/testing/selftests/powerpc/tm/Makefile @@ -4,7 +4,8 @@ SIGNAL_CONTEXT_CHK_TESTS := tm-signal-context-chk-gpr tm-signal-context-chk-fpu TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack \ tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable tm-trap \ - $(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt + $(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt \ + tm-signal-context-force-tm top_srcdir = ../../../../.. include ../../lib.mk @@ -20,6 +21,7 @@ $(OUTPUT)/tm-vmx-unavail: CFLAGS += -pthread -m64 $(OUTPUT)/tm-resched-dscr: ../pmu/lib.c $(OUTPUT)/tm-unavailable: CFLAGS += -O0 -pthread -m64 -Wno-error=uninitialized -mvsx $(OUTPUT)/tm-trap: CFLAGS += -O0 -pthread -m64 +$(OUTPUT)/tm-signal-context-force-tm: CFLAGS += -pthread -m64 SIGNAL_CONTEXT_CHK_TESTS := $(patsubst %,$(OUTPUT)/%,$(SIGNAL_CONTEXT_CHK_TESTS)) $(SIGNAL_CONTEXT_CHK_TESTS): tm-signal.S diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c new file mode 100644 index 000000000000..31717625f318 --- /dev/null +++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2018, Breno Leitao, Gustavo Romero, IBM Corp. + * + * This test raises a SIGUSR1 signal, and toggle the MSR[TS] + * fields at the signal handler. With MSR[TS] being set, the kernel will + * force a recheckpoint, which may cause a segfault when returning to + * user space. Since the test needs to re-run, the segfault needs to be + * caught and handled. + * + * In order to continue the test even after a segfault, the context is + * saved prior to the signal being raised, and it is restored when there is + * a segmentation fault. This happens for COUNT_MAX times. + * + * This test never fails (as returning EXIT_FAILURE). It either succeeds, + * or crash the kernel (on a buggy kernel). + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +#include "tm.h" +#include "utils.h" +#include "reg.h" + +#define COUNT_MAX 5000 /* Number of interactions */ + +/* + * This test only runs on 64 bits system. Unsetting MSR_TS_S to avoid + * compilation issue on 32 bits system. There is no side effect, since the + * whole test will be skipped if it is not running on 64 bits system. + */ +#ifndef __powerpc64__ +#undef MSR_TS_S +#define MSR_TS_S 0 +#endif + +/* Setting contexts because the test will crash and we want to recover */ +ucontext_t init_context, main_context; + +static int count, first_time; + +void usr_signal_handler(int signo, siginfo_t *si, void *uc) +{ + ucontext_t *ucp = uc; + int ret; + + /* + * Allocating memory in a signal handler, and never freeing it on + * purpose, forcing the heap increase, so, the memory leak is what + * we want here. + */ + ucp->uc_link = mmap(NULL, sizeof(ucontext_t), + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (ucp->uc_link == (void *)-1) { + perror("Mmap failed"); + exit(-1); + } + + /* Forcing the page to be allocated in a page fault */ + ret = madvise(ucp->uc_link, sizeof(ucontext_t), MADV_DONTNEED); + if (ret) { + perror("madvise failed"); + exit(-1); + } + + memcpy(&ucp->uc_link->uc_mcontext, &ucp->uc_mcontext, + sizeof(ucp->uc_mcontext)); + + /* Forcing to enable MSR[TM] */ + UCONTEXT_MSR(ucp) |= MSR_TS_S; + + /* + * A fork inside a signal handler seems to be more efficient than a + * fork() prior to the signal being raised. 
+ */ + if (fork() == 0) { + /* + * Both child and parent will return, but, child returns + * with count set so it will exit in the next segfault. + * Parent will continue to loop. + */ + count = COUNT_MAX; + } + + /* + * If the change above does not hit the bug, it will cause a + * segmentation fault, since the ck structures are NULL. + */ +} + +void seg_signal_handler(int signo, siginfo_t *si, void *uc) +{ + if (count == COUNT_MAX) { + /* Return to tm_signal_force_msr() and exit */ + setcontext(&main_context); + } + + count++; + + /* Reexecute the test */ + setcontext(&init_context); +} + +void tm_trap_test(void) +{ + struct sigaction usr_sa, seg_sa; + stack_t ss; + + usr_sa.sa_flags = SA_SIGINFO | SA_ONSTACK; + usr_sa.sa_sigaction = usr_signal_handler; + + seg_sa.sa_flags = SA_SIGINFO; + seg_sa.sa_sigaction = seg_signal_handler; + + /* + * Set initial context. Will get back here from + * seg_signal_handler() + */ + getcontext(&init_context); + + /* Allocated an alternative signal stack area */ + ss.ss_sp = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + ss.ss_size = SIGSTKSZ; + ss.ss_flags = 0; + + if (ss.ss_sp == (void *)-1) { + perror("mmap error\n"); + exit(-1); + } + + /* Force the allocation through a page fault */ + if (madvise(ss.ss_sp, SIGSTKSZ, MADV_DONTNEED)) { + perror("madvise\n"); + exit(-1); + } + + /* Setting an alternative stack to generate a page fault when + * the signal is raised. + */ + if (sigaltstack(&ss, NULL)) { + perror("sigaltstack\n"); + exit(-1); + } + + /* The signal handler will enable MSR_TS */ + sigaction(SIGUSR1, &usr_sa, NULL); + /* If it does not crash, it will segfault, avoid it to retest */ + sigaction(SIGSEGV, &seg_sa, NULL); + + raise(SIGUSR1); +} + +int tm_signal_context_force_tm(void) +{ + SKIP_IF(!have_htm()); + /* + * Skipping if not running on 64 bits system, since I think it is + * not possible to set mcontext's [MSR] with TS, due to it being 32 + * bits. + */ + SKIP_IF(!is_ppc64le()); + + /* Will get back here after COUNT_MAX interactions */ + getcontext(&main_context); + + if (!first_time++) + tm_trap_test(); + + return EXIT_SUCCESS; +} + +int main(int argc, char **argv) +{ + test_harness(tm_signal_context_force_tm, "tm_signal_context_force_tm"); +} From 782274434d6f2e8aa8c573cb24fef94a164f2ce0 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 10 Jan 2019 22:57:35 +1100 Subject: [PATCH 019/200] powerpc: Stop using pr_cont() in __die() Using pr_cont() risks having our output interleaved with other output from other CPUs. Instead print everything in a single printk() call. Signed-off-by: Michael Ellerman Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/traps.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 64936b60d521..164fc92895be 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -257,24 +257,14 @@ static int __die(const char *str, struct pt_regs *regs, long err) { printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); - if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) - printk("LE "); - else - printk("BE "); - - if (IS_ENABLED(CONFIG_PREEMPT)) - pr_cont("PREEMPT "); - - if (IS_ENABLED(CONFIG_SMP)) - pr_cont("SMP NR_CPUS=%d ", NR_CPUS); - - if (debug_pagealloc_enabled()) - pr_cont("DEBUG_PAGEALLOC "); - - if (IS_ENABLED(CONFIG_NUMA)) - pr_cont("NUMA "); - - pr_cont("%s\n", ppc_md.name ? 
ppc_md.name : ""); + printk("%s %s%s%s%s%s %s\n", + IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", + IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", + IS_ENABLED(CONFIG_SMP) ? " SMP" : "", + IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", + debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", + IS_ENABLED(CONFIG_NUMA) ? " NUMA" : "", + ppc_md.name ? ppc_md.name : ""); if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) == NOTIFY_STOP) return 1; From 184051396b9d75a2a9ae64f134f3ee9d7250801c Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 10 Jan 2019 22:57:36 +1100 Subject: [PATCH 020/200] powerpc: Show PAGE_SIZE in __die() output The page size the kernel is built with is useful info when debugging a crash, so add it to the output in __die(). Result looks like eg: kernel BUG at drivers/misc/lkdtm/bugs.c:63! Oops: Exception in kernel mode, sig: 5 [#1] LE PAGE_SIZE=64K SMP NR_CPUS=2048 NUMA pSeries Modules linked in: vmx_crypto kvm binfmt_misc ip_tables Signed-off-by: Michael Ellerman Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/traps.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 164fc92895be..a872c64618ad 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -257,8 +257,9 @@ static int __die(const char *str, struct pt_regs *regs, long err) { printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); - printk("%s %s%s%s%s%s %s\n", + printk("%s PAGE_SIZE=%luK%s%s%s%s%s %s\n", IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", + PAGE_SIZE / 1024, IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", IS_ENABLED(CONFIG_SMP) ? " SMP" : "", IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", From 16842516ea9cb237079cb9ffcb6466ce760fc37d Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 10 Jan 2019 22:57:37 +1100 Subject: [PATCH 021/200] powerpc/64s: Add MMU type to __die() output On Power9 machines (64-bit Book3S), we can be running with either the Hash table or Radix tree MMU enabled. So add some text to the __die() output to tell us which is enabled, for the case where all you have is the oops output and no other information. Example output: kernel BUG at drivers/misc/lkdtm/bugs.c:63! Oops: Exception in kernel mode, sig: 5 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries Modules linked in: kvm vmx_crypto binfmt_misc ip_tables x_tables Signed-off-by: Michael Ellerman Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/traps.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index a872c64618ad..5e917a84f949 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -257,9 +257,11 @@ static int __die(const char *str, struct pt_regs *regs, long err) { printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); - printk("%s PAGE_SIZE=%luK%s%s%s%s%s %s\n", + printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s%s %s\n", IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", PAGE_SIZE / 1024, + early_radix_enabled() ? " MMU=Radix" : "", + early_mmu_has_feature(MMU_FTR_HPTE_TABLE) ? " MMU=Hash" : "", IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", IS_ENABLED(CONFIG_SMP) ? " SMP" : "", IS_ENABLED(CONFIG_SMP) ? 
(" NR_CPUS=" __stringify(NR_CPUS)) : "", From 9bd10b649826774bb5e1e7fb67544e6500e3f005 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Mon, 14 Jan 2019 21:13:04 +0100 Subject: [PATCH 022/200] powerpc: Allow CPU selection of G4/74xx variant GCC supports -mcpu=G4 This patch gives the opportunity to select ALTIVEC for this variant. Signed-off-by: Mathieu Malaterre Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/Kconfig.cputype | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 8c7464c3f27f..7c544607ba32 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -153,6 +153,11 @@ config E300C3_CPU bool "e300c3 (831x)" depends on PPC_BOOK3S_32 +config G4_CPU + bool "G4 (74xx)" + depends on PPC_BOOK3S_32 + select ALTIVEC + endchoice config TARGET_CPU_BOOL @@ -171,6 +176,7 @@ config TARGET_CPU default "860" if 860_CPU default "e300c2" if E300C2_CPU default "e300c3" if E300C3_CPU + default "G4" if G4_CPU config PPC_BOOK3S def_bool y From 63da6caeb84cfad3d1e5774b7049dd1d2c9dec62 Mon Sep 17 00:00:00 2001 From: Igor Stoppa Date: Fri, 7 Sep 2018 18:35:26 +0300 Subject: [PATCH 023/200] powerpc: remove unnecessary unlikely() WARN_ON() already contains an unlikely(), so it's not necessary to wrap it into another. Signed-off-by: Igor Stoppa Cc: Arseny Solokha Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Michael Ellerman --- arch/powerpc/mm/tlb_nohash.c | 2 +- arch/powerpc/sysdev/xive/common.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index ae5d568e267f..ac23dc1c6535 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -302,7 +302,7 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, * This function as well as __local_flush_tlb_page() must only be called * for user contexts. */ - if (unlikely(WARN_ON(!mm))) + if (WARN_ON(!mm)) return; preempt_disable(); diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 94a69a62f5db..70a8f9e31a2d 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -442,7 +442,7 @@ static void xive_dec_target_count(int cpu) struct xive_cpu *xc = per_cpu(xive_cpu, cpu); struct xive_q *q = &xc->queue[xive_irq_priority]; - if (unlikely(WARN_ON(cpu < 0 || !xc))) { + if (WARN_ON(cpu < 0 || !xc)) { pr_err("%s: cpu=%d xc=%p\n", __func__, cpu, xc); return; } From 81b61324922c67f73813d8a9c175f3c153f6a1c6 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 29 Oct 2018 13:43:36 -0500 Subject: [PATCH 024/200] powerpc/pseries: Perform full re-add of CPU for topology update post-migration On pseries systems, performing a partition migration can result in altering the nodes a CPU is assigned to on the destination system. For exampl, pre-migration on the source system CPUs are in node 1 and 3, post-migration on the destination system CPUs are in nodes 2 and 3. Handling the node change for a CPU can cause corruption in the slab cache if we hit a timing where a CPUs node is changed while cache_reap() is invoked. The corruption occurs because the slab cache code appears to rely on the CPU and slab cache pages being on the same node. 
The current dynamic updating of a CPU's node done in arch/powerpc/mm/numa.c does not prevent us from hitting this scenario. Changing the device tree property update notification handler so that, when it recognizes an affinity change for a CPU, it does a full DLPAR remove and add of the CPU instead of dynamically changing its node resolves this issue. Signed-off-by: Nathan Fontenot Signed-off-by: Michael W. Bringmann Tested-by: Michael W. Bringmann Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/topology.h | 2 ++ arch/powerpc/mm/numa.c | 9 +-------- arch/powerpc/platforms/pseries/hotplug-cpu.c | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index a4a718dbfec6..f85e2b01c3df 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -132,6 +132,8 @@ static inline void shared_proc_topology_init(void) {} #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_core_id(cpu) (cpu_to_core_id(cpu)) + +int dlpar_cpu_readd(int cpu); #endif #endif diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 87f0dd004295..b5d1c45c1475 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1460,13 +1460,6 @@ static void reset_topology_timer(void) #ifdef CONFIG_SMP -static void stage_topology_update(int core_id) -{ - cpumask_or(&cpu_associativity_changes_mask, - &cpu_associativity_changes_mask, cpu_sibling_mask(core_id)); - reset_topology_timer(); -} - static int dt_update_callback(struct notifier_block *nb, unsigned long action, void *data) { @@ -1479,7 +1472,7 @@ static int dt_update_callback(struct notifier_block *nb, !of_prop_cmp(update->prop->name, "ibm,associativity")) { u32 core_id; of_property_read_u32(update->dn, "reg", &core_id); - stage_topology_update(core_id); + rc = dlpar_cpu_readd(core_id); rc = NOTIFY_OK; } break; diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 2f8e62163602..97feb6e79f1a 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -802,6 +802,25 @@ static int dlpar_cpu_add_by_count(u32 cpus_to_add) return rc; } +int dlpar_cpu_readd(int cpu) +{ + struct device_node *dn; + struct device *dev; + u32 drc_index; + int rc; + + dev = get_cpu_device(cpu); + dn = dev->of_node; + + rc = of_property_read_u32(dn, "ibm,my-drc-index", &drc_index); + + rc = dlpar_cpu_remove_by_index(drc_index); + if (!rc) + rc = dlpar_cpu_add(drc_index); + + return rc; +} + int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) { u32 count, drc_index; From 9bf3d3c4e4fd82c7174f4856df372ab2a71005b9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 29 Jan 2019 16:37:55 +0000 Subject: [PATCH 025/200] powerpc/traps: Fix the message printed when stack overflows Today's message is useless: [ 42.253267] Kernel stack overflow in process (ptrval), r1=c65500b0 This patch fixes it: [ 66.905235] Kernel stack overflow in process sh[356], r1=c65560b0 Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Cc: stable@vger.kernel.org # v4.15+ Signed-off-by: Christophe Leroy [mpe: Use task_pid_nr()] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/traps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 5e917a84f949..040b60293613 100644 ---
a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1535,8 +1535,8 @@ bail: void StackOverflow(struct pt_regs *regs) { - printk(KERN_CRIT "Kernel stack overflow in process %p, r1=%lx\n", - current, regs->gpr[1]); + pr_crit("Kernel stack overflow in process %s[%d], r1=%lx\n", + current->comm, task_pid_nr(current), regs->gpr[1]); debugger(regs); show_regs(regs); panic("kernel stack overflow"); From edeb304f659792fb5bab90d7d6f3408b4c7301fb Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Tue, 29 Jan 2019 16:36:18 +0530 Subject: [PATCH 026/200] cxl: Wrap iterations over afu slices inside 'afu_list_lock' Within the cxl module, iteration over the array 'adapter->afu' may be racy at a few points, as it might be simultaneously read during an EEH and its contents set to NULL while the driver is being unloaded or unbound from the adapter. This might result in a NULL pointer to 'struct afu' being de-referenced during an EEH, thereby causing a kernel oops. This patch fixes this by making sure that all access to the array 'adapter->afu' is wrapped within the context of spin-lock 'adapter->afu_list_lock'. Fixes: 9e8df8a21963 ("cxl: EEH support") Cc: stable@vger.kernel.org # v4.3+ Acked-by: Andrew Donnellan Acked-by: Frederic Barrat Acked-by: Christophe Lombard Signed-off-by: Vaibhav Jain Signed-off-by: Michael Ellerman --- drivers/misc/cxl/guest.c | 2 ++ drivers/misc/cxl/pci.c | 39 ++++++++++++++++++++++++++++++--------- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c index 5d28d9e454f5..08f4a512afad 100644 --- a/drivers/misc/cxl/guest.c +++ b/drivers/misc/cxl/guest.c @@ -267,6 +267,7 @@ static int guest_reset(struct cxl *adapter) int i, rc; pr_devel("Adapter reset request\n"); + spin_lock(&adapter->afu_list_lock); for (i = 0; i < adapter->slices; i++) { if ((afu = adapter->afu[i])) { pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT, @@ -283,6 +284,7 @@ static int guest_reset(struct cxl *adapter) pci_error_handlers(afu, CXL_RESUME_EVENT, 0); } } + spin_unlock(&adapter->afu_list_lock); return rc; } diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index c79ba1c699ad..300531d6136f 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -1805,7 +1805,7 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu, /* There should only be one entry, but go through the list * anyway */ - if (afu->phb == NULL) + if (afu == NULL || afu->phb == NULL) return result; list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { @@ -1832,7 +1832,8 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev, { struct cxl *adapter = pci_get_drvdata(pdev); struct cxl_afu *afu; - pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET, afu_result; + pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET; + pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET; int i; /* At this point, we could still have an interrupt pending. @@ -1843,6 +1844,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev, /* If we're permanently dead, give up.
*/ if (state == pci_channel_io_perm_failure) { + spin_lock(&adapter->afu_list_lock); for (i = 0; i < adapter->slices; i++) { afu = adapter->afu[i]; /* @@ -1851,6 +1853,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev, */ cxl_vphb_error_detected(afu, state); } + spin_unlock(&adapter->afu_list_lock); return PCI_ERS_RESULT_DISCONNECT; } @@ -1932,11 +1935,17 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev, * * In slot_reset, free the old resources and allocate new ones. * * In resume, clear the flag to allow things to start. */ + + /* Make sure no one else changes the afu list */ + spin_lock(&adapter->afu_list_lock); + for (i = 0; i < adapter->slices; i++) { afu = adapter->afu[i]; - afu_result = cxl_vphb_error_detected(afu, state); + if (afu == NULL) + continue; + afu_result = cxl_vphb_error_detected(afu, state); cxl_context_detach_all(afu); cxl_ops->afu_deactivate_mode(afu, afu->current_mode); pci_deconfigure_afu(afu); @@ -1948,6 +1957,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev, (result == PCI_ERS_RESULT_NEED_RESET)) result = PCI_ERS_RESULT_NONE; } + spin_unlock(&adapter->afu_list_lock); /* should take the context lock here */ if (cxl_adapter_context_lock(adapter) != 0) @@ -1980,14 +1990,18 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev) */ cxl_adapter_context_unlock(adapter); + spin_lock(&adapter->afu_list_lock); for (i = 0; i < adapter->slices; i++) { afu = adapter->afu[i]; + if (afu == NULL) + continue; + if (pci_configure_afu(afu, adapter, pdev)) - goto err; + goto err_unlock; if (cxl_afu_select_best_mode(afu)) - goto err; + goto err_unlock; if (afu->phb == NULL) continue; @@ -1999,16 +2013,16 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev) ctx = cxl_get_context(afu_dev); if (ctx && cxl_release_context(ctx)) - goto err; + goto err_unlock; ctx = cxl_dev_context_init(afu_dev); if (IS_ERR(ctx)) - goto err; + goto err_unlock; afu_dev->dev.archdata.cxl_ctx = ctx; if (cxl_ops->afu_check_and_enable(afu)) - goto err; + goto err_unlock; afu_dev->error_state = pci_channel_io_normal; @@ -2029,8 +2043,13 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev) result = PCI_ERS_RESULT_DISCONNECT; } } + + spin_unlock(&adapter->afu_list_lock); return result; +err_unlock: + spin_unlock(&adapter->afu_list_lock); + err: /* All the bits that happen in both error_detected and cxl_remove * should be idempotent, so we don't need to worry about leaving a mix @@ -2051,10 +2070,11 @@ static void cxl_pci_resume(struct pci_dev *pdev) * This is not the place to be checking if everything came back up * properly, because there's no return value: do that in slot_reset. 
*/ + spin_lock(&adapter->afu_list_lock); for (i = 0; i < adapter->slices; i++) { afu = adapter->afu[i]; - if (afu->phb == NULL) + if (afu == NULL || afu->phb == NULL) continue; list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { @@ -2063,6 +2083,7 @@ static void cxl_pci_resume(struct pci_dev *pdev) afu_dev->driver->err_handler->resume(afu_dev); } } + spin_unlock(&adapter->afu_list_lock); } static const struct pci_error_handlers cxl_err_handler = { From 75f8a37580b64f87c223fbd08db6b2f79129864a Mon Sep 17 00:00:00 2001 From: Brajeswar Ghosh Date: Mon, 28 Jan 2019 21:41:36 +0530 Subject: [PATCH 027/200] powerpc/kernel/time: Remove duplicate header Remove linux/rtc.h which is included more than once Signed-off-by: Brajeswar Ghosh Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/time.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 3646affae963..bc0503ef9c9c 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include #include From f069a062ecce7ccc17221c24097826e829f890e1 Mon Sep 17 00:00:00 2001 From: Sabyasachi Gupta Date: Thu, 17 Jan 2019 21:40:33 +0530 Subject: [PATCH 028/200] powerpc/powernv: Remove duplicate header Remove linux/printk.h which is included more than once. Signed-off-by: Sabyasachi Gupta Acked-by: Souptick Joarder Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 79586f127521..d1072d464e99 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include From 45a202a3fefc6ee7b19b1222bfb5b0679ce31996 Mon Sep 17 00:00:00 2001 From: Sabyasachi Gupta Date: Thu, 17 Jan 2019 21:49:05 +0530 Subject: [PATCH 029/200] powerpc/cell: Remove duplicate header Remove linux/syscalls.h which is included more than once Signed-off-by: Sabyasachi Gupta Acked-by: Souptick Joarder Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/cell/spu_syscalls.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/platforms/cell/spu_syscalls.c b/arch/powerpc/platforms/cell/spu_syscalls.c index 263413a34823..b95d6afc39b5 100644 --- a/arch/powerpc/platforms/cell/spu_syscalls.c +++ b/arch/powerpc/platforms/cell/spu_syscalls.c @@ -26,7 +26,6 @@ #include #include #include -#include #include From 865a9432d16fe2f40a1a52005fd30778056c7921 Mon Sep 17 00:00:00 2001 From: Reza Arbab Date: Mon, 28 Jan 2019 11:31:42 -0600 Subject: [PATCH 030/200] powerpc/mm: Add _PAGE_SAO to _PAGE_CACHE_CTL mask In htab_convert_pte_flags(), _PAGE_CACHE_CTL is used to check for the _PAGE_SAO flag: else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO) rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M); But, it isn't defined to include that flag: #define _PAGE_CACHE_CTL (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT) This happens to work, but only because of the flag values: #define _PAGE_SAO 0x00010 /* Strong access order */ #define _PAGE_NON_IDEMPOTENT 0x00020 /* non idempotent memory */ #define _PAGE_TOLERANT 0x00030 /* tolerant memory, cache inhibited */ To prevent any issues if these particulars ever change, add _PAGE_SAO to the mask. 
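Spelling out the accident with the values quoted above:

    _PAGE_CACHE_CTL = _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT = 0x20 | 0x30 = 0x30
    _PAGE_SAO & _PAGE_CACHE_CTL = 0x10 & 0x30 = 0x10 = _PAGE_SAO

so (pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO currently matches SAO PTEs even though _PAGE_SAO is not part of the mask; nothing guarantees that if the flag values change.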
Suggested-by: Charles Johns Signed-off-by: Reza Arbab Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 2e6ada28da64..1d97a2800cf8 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -811,7 +811,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, return hash__set_pte_at(mm, addr, ptep, pte, percpu); } -#define _PAGE_CACHE_CTL (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT) +#define _PAGE_CACHE_CTL (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT) #define pgprot_noncached pgprot_noncached static inline pgprot_t pgprot_noncached(pgprot_t prot) From ab4510e9ac6dcdd5e9ec0380bec279b5ae97ce10 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Mon, 10 Dec 2018 09:29:05 +0530 Subject: [PATCH 031/200] powerpc/perf: Add mem access events to sysfs Add mem-loads/mem-stores events to sysfs. The event is formed based on raw event encoding. Primary PMU event used here is PM_MRK_INST_CMPL along with MMCRA[SM] modes and Thresholding bit Signed-off-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/power9-events-list.h | 24 ++++++++++++++++++++++++ arch/powerpc/perf/power9-pmu.c | 4 ++++ 2 files changed, 28 insertions(+) diff --git a/arch/powerpc/perf/power9-events-list.h b/arch/powerpc/perf/power9-events-list.h index 7de344b7d9cc..063c9d9f2516 100644 --- a/arch/powerpc/perf/power9-events-list.h +++ b/arch/powerpc/perf/power9-events-list.h @@ -97,3 +97,27 @@ EVENT(PM_MRK_DTLB_MISS_64K, 0x3d156) EVENT(PM_DTLB_MISS_16M, 0x4c056) EVENT(PM_DTLB_MISS_1G, 0x4c05a) EVENT(PM_MRK_DTLB_MISS_16M, 0x4c15e) + +/* + * Memory Access Events + * + * Primary PMU event used here is PM_MRK_INST_CMPL (0x401e0) + * To enable capturing of memory profiling, these MMCRA bits + * needs to be programmed and corresponding raw event format + * encoding. 
+ * + * MMCRA bits encoding needed are + * SM (Sampling Mode) + * EM (Eligibility for Random Sampling) + * TECE (Threshold Event Counter Event) + * TS (Threshold Start Event) + * TE (Threshold End Event) + * + * Corresponding Raw Encoding bits: + * sample [EM,SM] + * thresh_sel (TECE) + * thresh start (TS) + * thresh end (TE) + */ +EVENT(MEM_LOADS, 0x34340401e0) +EVENT(MEM_STORES, 0x343c0401e0) diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index 0ff9c43733e9..030544e35959 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -160,6 +160,8 @@ GENERIC_EVENT_ATTR(branch-instructions, PM_BR_CMPL); GENERIC_EVENT_ATTR(branch-misses, PM_BR_MPRED_CMPL); GENERIC_EVENT_ATTR(cache-references, PM_LD_REF_L1); GENERIC_EVENT_ATTR(cache-misses, PM_LD_MISS_L1_FIN); +GENERIC_EVENT_ATTR(mem-loads, MEM_LOADS); +GENERIC_EVENT_ATTR(mem-stores, MEM_STORES); CACHE_EVENT_ATTR(L1-dcache-load-misses, PM_LD_MISS_L1_FIN); CACHE_EVENT_ATTR(L1-dcache-loads, PM_LD_REF_L1); @@ -185,6 +187,8 @@ static struct attribute *power9_events_attr[] = { GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL), GENERIC_EVENT_PTR(PM_LD_REF_L1), GENERIC_EVENT_PTR(PM_LD_MISS_L1_FIN), + GENERIC_EVENT_PTR(MEM_LOADS), + GENERIC_EVENT_PTR(MEM_STORES), CACHE_EVENT_PTR(PM_LD_MISS_L1_FIN), CACHE_EVENT_PTR(PM_LD_REF_L1), CACHE_EVENT_PTR(PM_L1_PREF), From eddd0b332304d554ad6243942f87c2fcea98c56b Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Tue, 22 Jan 2019 10:57:21 -0500 Subject: [PATCH 032/200] powerpc/64s: Clear on-stack exception marker upon exception return The ppc64 specific implementation of the reliable stacktracer, save_stack_trace_tsk_reliable(), bails out and reports an "unreliable trace" whenever it finds an exception frame on the stack. Stack frames are classified as exception frames if the STACK_FRAME_REGS_MARKER magic, as written by exception prologues, is found at a particular location. However, as observed by Joe Lawrence, it is possible in practice that non-exception stack frames can alias with prior exception frames and thus, that the reliable stacktracer can find a stale STACK_FRAME_REGS_MARKER on the stack. It in turn falsely reports an unreliable stacktrace and blocks any live patching transition from finishing. Said condition lasts until the stack frame is overwritten/initialized by a function call or other means. In principle, we could mitigate this by making the exception frame classification condition in save_stack_trace_tsk_reliable() stronger: in addition to testing for STACK_FRAME_REGS_MARKER, we could also take into account that for all exceptions executing on the kernel stack - their stack frames' backlink pointers always match what is saved in their pt_regs instance's ->gpr[1] slot and that - their exception frame size equals STACK_INT_FRAME_SIZE, a value uncommonly large for non-exception frames. However, while these are currently true, relying on them would make the reliable stacktrace implementation more sensitive towards future changes in the exception entry code. Note that false negatives, i.e. not detecting exception frames, would silently break the live patching consistency model. Furthermore, certain other places (diagnostic stacktraces, perf, xmon) rely on STACK_FRAME_REGS_MARKER as well.
Make the exception exit code clear the on-stack STACK_FRAME_REGS_MARKER for those exceptions running on the "normal" kernel stack and returning to kernelspace: because the topmost frame is ignored by the reliable stack tracer anyway, returns to userspace don't need to take care of clearing the marker. Furthermore, as I don't have the ability to test this on Book 3E or 32 bits, limit the change to Book 3S and 64 bits. Fixes: df78d3f61480 ("powerpc/livepatch: Implement reliable stack tracing for the consistency model") Reported-by: Joe Lawrence Signed-off-by: Nicolai Stange Signed-off-by: Joe Lawrence Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 435927f549c4..a2c168b395d2 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -1002,6 +1002,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r2,_NIP(r1) mtspr SPRN_SRR0,r2 + /* + * Leaving a stale exception_marker on the stack can confuse + * the reliable stack unwinder later on. Clear it. + */ + li r2,0 + std r2,STACK_FRAME_OVERHEAD-16(r1) + ld r0,GPR0(r1) ld r2,GPR2(r1) ld r3,GPR3(r1) From a50d3250d7ae34c561177a1f9cfb79816fcbcff1 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Thu, 31 Jan 2019 16:41:50 +1100 Subject: [PATCH 033/200] powerpc/64s: Make reliable stacktrace dependency clearer Make the HAVE_RELIABLE_STACKTRACE Kconfig option depend on PPC_BOOK3S_64 for documentation purposes. Before this patch, it depended on PPC64 && CPU_LITTLE_ENDIAN and because CPU_LITTLE_ENDIAN implies PPC_BOOK3S_64, there's no functional change here. Signed-off-by: Nicolai Stange Signed-off-by: Joe Lawrence [mpe: Split out of larger patch] Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2890d36eb531..73bf87b1d274 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -220,7 +220,7 @@ config PPC select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if PPC64 && CPU_LITTLE_ENDIAN + select HAVE_RELIABLE_STACKTRACE if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING select HAVE_IRQ_TIME_ACCOUNTING From 18be37603de81674e41a0b0282326a0debc1696e Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Tue, 22 Jan 2019 10:57:22 -0500 Subject: [PATCH 034/200] powerpc/livepatch: relax reliable stack tracer checks for first-frame The bottom-most stack frame (the first to be unwound) may be largely uninitialized, for the "Power Architecture 64-Bit ELF V2 ABI" only requires its backchain pointer to be set. The reliable stack tracer should be careful when verifying this frame: skip checks on STACK_FRAME_LR_SAVE and STACK_FRAME_MARKER offsets that may contain uninitialized residual data. 
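To illustrate (the variable names here are for exposition only; the word indices are the ones the unwinder actually uses), the words inspected in each frame are:

	unsigned long *stack = (unsigned long *)sp;
	unsigned long backchain = stack[0];			/* required by the ABI, always set */
	unsigned long saved_lr = stack[STACK_FRAME_LR_SAVE];	/* may be stale in the first frame */
	unsigned long marker = stack[STACK_FRAME_MARKER];	/* may be stale in the first frame */

Only the back chain may be trusted in the bottom-most frame, so the saved-LR and marker checks have to be skipped for it, as the hunks below do.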
Fixes: df78d3f61480 ("powerpc/livepatch: Implement reliable stack tracing for the consistency model") Signed-off-by: Joe Lawrence Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/stacktrace.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index e2c50b55138f..06688f4d557b 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -84,6 +84,12 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) EXPORT_SYMBOL_GPL(save_stack_trace_regs); #ifdef CONFIG_HAVE_RELIABLE_STACKTRACE +/* + * This function returns an error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is reliable. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ int save_stack_trace_tsk_reliable(struct task_struct *tsk, struct stack_trace *trace) @@ -142,12 +148,6 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, if (sp & 0xF) return 1; - /* Mark stacktraces with exception frames as unreliable. */ - if (sp <= stack_end - STACK_INT_FRAME_SIZE && - stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { - return 1; - } - newsp = stack[0]; /* Stack grows downwards; unwinder may only go up. */ if (newsp <= sp) @@ -158,11 +158,26 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, return 1; /* invalid backlink, too far up. */ } + /* + * We can only trust the bottom frame's backlink, the + * rest of the frame may be uninitialized, continue to + * the next. + */ + if (firstframe) { + firstframe = 0; + goto next; + } + + /* Mark stacktraces with exception frames as unreliable. */ + if (sp <= stack_end - STACK_INT_FRAME_SIZE && + stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { + return 1; + } + /* Examine the saved LR: it must point into kernel code. */ ip = stack[STACK_FRAME_LR_SAVE]; - if (!firstframe && !__kernel_text_address(ip)) + if (!__kernel_text_address(ip)) return 1; - firstframe = 0; /* * FIXME: IMHO these tests do not belong in @@ -183,6 +198,7 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, else trace->skip--; +next: if (newsp == stack_end) break; From 29a77bbb0cf2cea41fa46f8fa176f6cb1e3182c4 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Tue, 22 Jan 2019 10:57:23 -0500 Subject: [PATCH 035/200] powerpc/livepatch: small cleanups in save_stack_trace_tsk_reliable() Mostly cosmetic changes: - Group common stack pointer code at the top - Simplify the first frame logic - Code stackframe iteration into for...loop construct - Check for trace->nr_entries overflow before adding any into the array Suggested-by: Nicolai Stange Signed-off-by: Joe Lawrence Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/stacktrace.c | 40 +++++++++++--------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 06688f4d557b..28c3c25755d7 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -95,20 +95,11 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, struct stack_trace *trace) { unsigned long sp; + unsigned long newsp; unsigned long stack_page = (unsigned long)task_stack_page(tsk); unsigned long stack_end; int graph_idx = 0; - - /* - * The last frame (unwinding first) may not yet have saved - * its LR onto the stack. 
- */ - int firstframe = 1; - - if (tsk == current) - sp = current_stack_pointer(); - else - sp = tsk->thread.ksp; + bool firstframe; stack_end = stack_page + THREAD_SIZE; if (!is_idle_task(tsk)) { @@ -135,14 +126,20 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, stack_end -= STACK_FRAME_OVERHEAD; } + if (tsk == current) + sp = current_stack_pointer(); + else + sp = tsk->thread.ksp; + if (sp < stack_page + sizeof(struct thread_struct) || sp > stack_end - STACK_FRAME_MIN_SIZE) { return 1; } - for (;;) { + for (firstframe = true; sp != stack_end; + firstframe = false, sp = newsp) { unsigned long *stack = (unsigned long *) sp; - unsigned long newsp, ip; + unsigned long ip; /* sanity check: ABI requires SP to be aligned 16 bytes. */ if (sp & 0xF) @@ -163,10 +160,8 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, * rest of the frame may be uninitialized, continue to * the next. */ - if (firstframe) { - firstframe = 0; - goto next; - } + if (firstframe) + continue; /* Mark stacktraces with exception frames as unreliable. */ if (sp <= stack_end - STACK_INT_FRAME_SIZE && @@ -193,19 +188,12 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, return 1; #endif + if (trace->nr_entries >= trace->max_entries) + return -E2BIG; if (!trace->skip) trace->entries[trace->nr_entries++] = ip; else trace->skip--; - -next: - if (newsp == stack_end) - break; - - if (trace->nr_entries >= trace->max_entries) - return -E2BIG; - - sp = newsp; } return 0; } From 3de27dcf8121c2a710ab93dce23e0f5901c29783 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Tue, 22 Jan 2019 10:57:24 -0500 Subject: [PATCH 036/200] powerpc/livepatch: return -ERRNO values in save_stack_trace_tsk_reliable() To match its x86 counterpart, save_stack_trace_tsk_reliable() should return -EINVAL in cases that it is currently returning 1. No caller is currently differentiating non-zero error codes, but let's keep the arch-specific implementations consistent. Signed-off-by: Joe Lawrence Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/stacktrace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 28c3c25755d7..cf31ce6c1f53 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -133,7 +133,7 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, if (sp < stack_page + sizeof(struct thread_struct) || sp > stack_end - STACK_FRAME_MIN_SIZE) { - return 1; + return -EINVAL; } for (firstframe = true; sp != stack_end; @@ -143,16 +143,16 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, /* sanity check: ABI requires SP to be aligned 16 bytes. */ if (sp & 0xF) - return 1; + return -EINVAL; newsp = stack[0]; /* Stack grows downwards; unwinder may only go up. */ if (newsp <= sp) - return 1; + return -EINVAL; if (newsp != stack_end && newsp > stack_end - STACK_FRAME_MIN_SIZE) { - return 1; /* invalid backlink, too far up. */ + return -EINVAL; /* invalid backlink, too far up. */ } /* @@ -166,13 +166,13 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, /* Mark stacktraces with exception frames as unreliable. */ if (sp <= stack_end - STACK_INT_FRAME_SIZE && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { - return 1; + return -EINVAL; } /* Examine the saved LR: it must point into kernel code. 
*/ ip = stack[STACK_FRAME_LR_SAVE]; if (!__kernel_text_address(ip)) - return 1; + return -EINVAL; /* * FIXME: IMHO these tests do not belong in @@ -185,7 +185,7 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, * as unreliable. */ if (ip == (unsigned long)kretprobe_trampoline) - return 1; + return -EINVAL; #endif if (trace->nr_entries >= trace->max_entries From 423bfc69d7f491c47fc35921f7d460be4094d555 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Thu, 31 Jan 2019 21:59:04 +0100 Subject: [PATCH 037/200] powerpc: Enable kernel XZ compression option on 44x Enable kernel XZ compression option on 44x. Tested on a Western Digital - MyBook Live NAS. It takes 22 seconds for the 800 MHz CPU to decompress and boot a 2.63 MiB XZ-compressed kernel simpleImage. Signed-off-by: Christian Lamparter Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 73bf87b1d274..9c70c2864657 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -201,7 +201,7 @@ config PPC select HAVE_IOREMAP_PROT select HAVE_IRQ_EXIT_ON_IRQ_STACK select HAVE_KERNEL_GZIP - select HAVE_KERNEL_XZ if PPC_BOOK3S + select HAVE_KERNEL_XZ if PPC_BOOK3S || 44x select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE select HAVE_KRETPROBES From e5c27ef7a5f204ff2f894f0dd7ed37748a7fa12f Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Sat, 2 Feb 2019 13:54:27 +0100 Subject: [PATCH 038/200] powerpc: Remove trailing semicolon after curly brace There is no point in having a trailing semicolon after a closing curly brace. Remove it. Signed-off-by: Mathieu Malaterre Signed-off-by: Michael Ellerman --- arch/powerpc/sysdev/tsi108_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/tsi108_dev.c b/arch/powerpc/sysdev/tsi108_dev.c index 1fd0717ade02..1f1af12f23e2 100644 --- a/arch/powerpc/sysdev/tsi108_dev.c +++ b/arch/powerpc/sysdev/tsi108_dev.c @@ -51,7 +51,7 @@ phys_addr_t get_csrbase(void) const void *prop = of_get_property(tsi, "reg", &size); tsi108_csr_base = of_translate_address(tsi, prop); of_node_put(tsi); - }; + } return tsi108_csr_base; } From 8e0f97357533aa5b57b333de47eb008c6072fcac Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Sat, 2 Feb 2019 14:05:35 +0100 Subject: [PATCH 039/200] Move static keyword at beginning of declaration Move the static keyword around to remove the following warnings (W=1): arch/powerpc/platforms/ps3/os-area.c:212:1: error: 'static' is not at beginning of declaration [-Werror=old-style-declaration] arch/powerpc/platforms/ps3/system-bus.c:45:1: error: 'static' is not at beginning of declaration [-Werror=old-style-declaration] Signed-off-by: Mathieu Malaterre Acked-by: Geoff Levand Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/ps3/os-area.c | 4 ++-- arch/powerpc/platforms/ps3/system-bus.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/ps3/os-area.c b/arch/powerpc/platforms/ps3/os-area.c index f5387ad82279..4d65c5380020 100644 --- a/arch/powerpc/platforms/ps3/os-area.c +++ b/arch/powerpc/platforms/ps3/os-area.c @@ -205,11 +205,11 @@ static const struct os_area_db_id os_area_db_id_rtc_diff = { * 3) The number of seconds from 1970 to 2000.
*/ -struct saved_params { +static struct saved_params { unsigned int valid; s64 rtc_diff; unsigned int av_multi_out; -} static saved_params; +} saved_params; static struct property property_rtc_diff = { .name = "linux,rtc_diff", diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c index 5cc35d6b94b6..7c227e784247 100644 --- a/arch/powerpc/platforms/ps3/system-bus.c +++ b/arch/powerpc/platforms/ps3/system-bus.c @@ -37,12 +37,12 @@ static struct device ps3_system_bus = { }; /* FIXME: need device usage counters! */ -struct { +static struct { struct mutex mutex; int sb_11; /* usb 0 */ int sb_12; /* usb 0 */ int gpu; -} static usage_hack; +} usage_hack; static int ps3_is_device(struct ps3_system_bus_device *dev, u64 bus_id, u64 dev_id) From 26b523356f49a0117c8f9e32ca98aa6d6e496e1a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 1 Feb 2019 10:46:52 +0000 Subject: [PATCH 040/200] powerpc: Drop page_is_ram() and walk_system_ram_range() Since commit c40dd2f76644 ("powerpc: Add System RAM to /proc/iomem") it is possible to use the generic walk_system_ram_range() and the generic page_is_ram(). To enable the use of walk_system_ram_range() by the IBM EHEA ethernet driver, we still need an export of the generic function. As powerpc was the only user of CONFIG_ARCH_HAS_WALK_MEMORY, the ifdef around the generic walk_system_ram_range() has become useless and can be dropped. Fixes: c40dd2f76644 ("powerpc: Add System RAM to /proc/iomem") Signed-off-by: Christophe Leroy [mpe: Keep the EXPORT_SYMBOL_GPL in powerpc code] Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 3 --- arch/powerpc/include/asm/page.h | 1 - arch/powerpc/mm/mem.c | 39 +++++---------------------------- kernel/resource.c | 4 ---- 4 files changed, 6 insertions(+), 41 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 9c70c2864657..08908219fba9 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -478,9 +478,6 @@ config ARCH_CPU_PROBE_RELEASE config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y -config ARCH_HAS_WALK_MEMORY - def_bool y - config ARCH_ENABLE_MEMORY_HOTREMOVE def_bool y diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 5c5ea2413413..aa4497175bd3 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -326,7 +326,6 @@ struct page; extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg); extern void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *p); -extern int page_is_ram(unsigned long pfn); extern int devmem_is_allowed(unsigned long pfn); #ifdef CONFIG_PPC_SMLPAR diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 33cc6f676fa6..81f251fc4169 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -80,11 +80,6 @@ static inline pte_t *virt_to_kpte(unsigned long vaddr) #define TOP_ZONE ZONE_NORMAL #endif -int page_is_ram(unsigned long pfn) -{ - return memblock_is_memory(__pfn_to_phys(pfn)); -} - pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -176,34 +171,6 @@ int __meminit arch_remove_memory(int nid, u64 start, u64 size, #endif #endif /* CONFIG_MEMORY_HOTPLUG */ -/* - * walk_memory_resource() needs to make sure there is no holes in a given - * memory range. PPC64 does not maintain the memory layout in /proc/iomem. - * Instead it maintains it in memblock.memory structures. 
Walk through the - * memory regions, find holes and callback for contiguous regions. - */ -int -walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, - void *arg, int (*func)(unsigned long, unsigned long, void *)) -{ - struct memblock_region *reg; - unsigned long end_pfn = start_pfn + nr_pages; - unsigned long tstart, tend; - int ret = -1; - - for_each_memblock(memory, reg) { - tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); - tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); - if (tstart >= tend) - continue; - ret = (*func)(tstart, tend - tstart, arg); - if (ret) - break; - } - return ret; -} -EXPORT_SYMBOL_GPL(walk_system_ram_range); - #ifndef CONFIG_NEED_MULTIPLE_NODES void __init mem_topology_setup(void) { @@ -585,3 +552,9 @@ int devmem_is_allowed(unsigned long pfn) return 0; } #endif /* CONFIG_STRICT_DEVMEM */ + +/* + * This is defined in kernel/resource.c but only powerpc needs to export it, for + * the EHEA driver. Drop this when drivers/net/ethernet/ibm/ehea is removed. + */ +EXPORT_SYMBOL_GPL(walk_system_ram_range); diff --git a/kernel/resource.c b/kernel/resource.c index 915c02e8e5dd..e81b17b53fa5 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -448,8 +448,6 @@ int walk_mem_res(u64 start, u64 end, void *arg, arg, func); } -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) - /* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. @@ -481,8 +479,6 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, return ret; } -#endif - static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) { return 1; From 3376cb91ed908eb0728900894a77d8206574dbcd Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Thu, 29 Nov 2018 14:16:37 +1100 Subject: [PATCH 041/200] powerpc/eeh: Cleanup eeh_pe_clear_frozen_state() The 'clear_sw_state' parameter for eeh_pe_clear_frozen_state() is redundant because it has no effect (except in the rare case of a hardware error part way through unfreezing a tree of PEs, where it would dangerously allow partial de-isolation before returning failure). It is passed down to __eeh_pe_clear_frozen_state(), and from there to eeh_unfreeze_pe(), where it causes EEH_PE_ISOLATED to be removed from the state of each PE during the traversal. However, when the traversal finishes, EEH_PE_ISOLATED is unconditionally removed by a call to eeh_pe_state_clear() regardless of the parameter's value. So remove the flag and pass false to eeh_unfreeze_pe() (to avoid the rare case described above, as it was before the flag was introduced). Also, perform the recursion directly in the function and eliminate a bit of boilerplate. There should be no change in functionality, except as mentioned above. Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh_driver.c | 40 +++++++++++--------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 99eab7bc7edc..3456d9c2d4da 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -591,34 +591,20 @@ static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) * PE reset (for 3 times), we try to clear the frozen state * for 3 times as well. 
*/ -static void *__eeh_clear_pe_frozen_state(struct eeh_pe *pe, void *flag) +static int eeh_clear_pe_frozen_state(struct eeh_pe *root) { - bool clear_sw_state = *(bool *)flag; - int i, rc = 1; + struct eeh_pe *pe; + int i; - for (i = 0; rc && i < 3; i++) - rc = eeh_unfreeze_pe(pe, clear_sw_state); - - /* Stop immediately on any errors */ - if (rc) { - pr_warn("%s: Failure %d unfreezing PHB#%x-PE#%x\n", - __func__, rc, pe->phb->global_number, pe->addr); - return (void *)pe; + eeh_for_each_pe(root, pe) { + for (i = 0; i < 3; i++) + if (!eeh_unfreeze_pe(pe, false)) + break; + if (i >= 3) + return -EIO; } - - return NULL; -} - -static int eeh_clear_pe_frozen_state(struct eeh_pe *pe, - bool clear_sw_state) -{ - void *rc; - - rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, &clear_sw_state); - if (!rc) - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); - - return rc ? -EIO : 0; + eeh_pe_state_clear(root, EEH_PE_ISOLATED); + return 0; } int eeh_pe_reset_and_recover(struct eeh_pe *pe) @@ -643,7 +629,7 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) } /* Unfreeze the PE */ - ret = eeh_clear_pe_frozen_state(pe, true); + ret = eeh_clear_pe_frozen_state(pe); if (ret) { eeh_pe_state_clear(pe, EEH_PE_RECOVERING); return ret; @@ -716,7 +702,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, eeh_pe_restore_bars(pe); /* Clear frozen state */ - rc = eeh_clear_pe_frozen_state(pe, false); + rc = eeh_clear_pe_frozen_state(pe); if (rc) { pci_unlock_rescan_remove(); return rc; From 188fdea69fa91dcd674a3d40f060a5891d4bc45a Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Thu, 29 Nov 2018 14:16:38 +1100 Subject: [PATCH 042/200] powerpc/eeh: remove sw_state from eeh_unfreeze_pe() eeh_unfreeze_pe() performs two operations: unfreezing a PE (which may cause firmware to unfreeze child PEs as well) and de-isolating the PE and its children. To simplify this and support future work, separate out the de-isolation and perform it at the call sites (when necessary). There should be no change in behaviour.
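For reference, the split produces this caller-side pattern (as seen in the hunks below), with the thaw and the explicit de-isolation now as separate steps:

	ret = eeh_unfreeze_pe(pe);	/* thaw the PE only */
	if (!ret)
		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);	/* de-isolate at the call site */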
Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/eeh.h | 2 +- arch/powerpc/kernel/eeh.c | 18 ++++++++++-------- arch/powerpc/kernel/eeh_driver.c | 2 +- arch/powerpc/kernel/eeh_sysfs.c | 3 ++- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 8b596d096ebe..2ff123f745cc 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -293,7 +293,7 @@ void eeh_add_device_late(struct pci_dev *); void eeh_add_device_tree_late(struct pci_bus *); void eeh_add_sysfs_files(struct pci_bus *); void eeh_remove_device(struct pci_dev *); -int eeh_unfreeze_pe(struct eeh_pe *pe, bool sw_state); +int eeh_unfreeze_pe(struct eeh_pe *pe); int eeh_pe_reset_and_recover(struct eeh_pe *pe); int eeh_dev_open(struct pci_dev *pdev); void eeh_dev_release(struct pci_dev *pdev); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index ae05203eb4de..c56537d03017 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -823,7 +823,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat switch (state) { case pcie_deassert_reset: eeh_ops->reset(pe, EEH_RESET_DEACTIVATE); - eeh_unfreeze_pe(pe, false); + eeh_unfreeze_pe(pe); if (!(pe->type & EEH_PE_VF)) eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); eeh_pe_dev_traverse(pe, eeh_restore_dev_state, dev); @@ -1309,7 +1309,7 @@ void eeh_remove_device(struct pci_dev *dev) edev->mode &= ~EEH_DEV_SYSFS; } -int eeh_unfreeze_pe(struct eeh_pe *pe, bool sw_state) +int eeh_unfreeze_pe(struct eeh_pe *pe) { int ret; @@ -1327,10 +1327,6 @@ int eeh_unfreeze_pe(struct eeh_pe *pe, bool sw_state) return ret; } - /* Clear software isolated state */ - if (sw_state && (pe->state & EEH_PE_ISOLATED)) - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); - return ret; } @@ -1382,7 +1378,10 @@ static int eeh_pe_change_owner(struct eeh_pe *pe) } } - return eeh_unfreeze_pe(pe, true); + ret = eeh_unfreeze_pe(pe); + if (!ret) + eeh_pe_state_clear(pe, EEH_PE_ISOLATED); + return ret; } /** @@ -1639,7 +1638,10 @@ static int eeh_pe_reenable_devices(struct eeh_pe *pe) } /* The PE is still in frozen state */ - return eeh_unfreeze_pe(pe, true); + ret = eeh_unfreeze_pe(pe); + if (!ret) + eeh_pe_state_clear(pe, EEH_PE_ISOLATED); + return ret; } diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 3456d9c2d4da..5303429ac0e3 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -598,7 +598,7 @@ static int eeh_clear_pe_frozen_state(struct eeh_pe *root) eeh_for_each_pe(root, pe) { for (i = 0; i < 3; i++) - if (!eeh_unfreeze_pe(pe, false)) + if (!eeh_unfreeze_pe(pe)) break; if (i >= 3) return -EIO; diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c index deed906dd8f1..0731d2f01dd9 100644 --- a/arch/powerpc/kernel/eeh_sysfs.c +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -82,8 +82,9 @@ static ssize_t eeh_pe_state_store(struct device *dev, if (!(edev->pe->state & EEH_PE_ISOLATED)) return count; - if (eeh_unfreeze_pe(edev->pe, true)) + if (eeh_unfreeze_pe(edev->pe)) return -EIO; + eeh_pe_state_clear(edev->pe, EEH_PE_ISOLATED); return count; } From 9ed5ca66aa66e5ce2e1d8758250a4d740052c8cd Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Thu, 29 Nov 2018 14:16:39 +1100 Subject: [PATCH 043/200] powerpc/eeh: Add include_passed to eeh_pe_state_clear() Add a parameter to eeh_pe_state_clear() that allows passed-through PEs to be excluded. 
Update callers to always pass true so that there is no change in behaviour. Also refactor to use direct traversal, to allow the removal of some boilerplate. This is to prepare for follow-up work for passed-through devices. Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/ppc-pci.h | 2 +- arch/powerpc/kernel/eeh.c | 18 +++---- arch/powerpc/kernel/eeh_driver.c | 20 ++++---- arch/powerpc/kernel/eeh_pe.c | 76 +++++++++++++----------------- arch/powerpc/kernel/eeh_sysfs.c | 2 +- 5 files changed, 54 insertions(+), 64 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index f67da277d652..08e094eaeccf 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -59,7 +59,7 @@ int rtas_write_config(struct pci_dn *, int where, int size, u32 val); int rtas_read_config(struct pci_dn *, int where, int size, u32 *val); void eeh_pe_state_mark(struct eeh_pe *pe, int state); void eeh_pe_mark_isolated(struct eeh_pe *pe); -void eeh_pe_state_clear(struct eeh_pe *pe, int state); +void eeh_pe_state_clear(struct eeh_pe *pe, int state, bool include_passed); void eeh_pe_state_mark_with_cfg(struct eeh_pe *pe, int state); void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index c56537d03017..8d32587b07dc 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -825,13 +825,13 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat eeh_ops->reset(pe, EEH_RESET_DEACTIVATE); eeh_unfreeze_pe(pe); if (!(pe->type & EEH_PE_VF)) - eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true); eeh_pe_dev_traverse(pe, eeh_restore_dev_state, dev); - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); + eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true); break; case pcie_hot_reset: eeh_pe_mark_isolated(pe); - eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true); eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev); if (!(pe->type & EEH_PE_VF)) @@ -840,7 +840,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat break; case pcie_warm_reset: eeh_pe_mark_isolated(pe); - eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true); eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev); if (!(pe->type & EEH_PE_VF)) @@ -848,7 +848,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL); break; default: - eeh_pe_state_clear(pe, EEH_PE_ISOLATED | EEH_PE_CFG_BLOCKED); + eeh_pe_state_clear(pe, EEH_PE_ISOLATED | EEH_PE_CFG_BLOCKED, true); return -EINVAL; }; @@ -936,7 +936,7 @@ int eeh_pe_reset_full(struct eeh_pe *pe) __func__, state, pe->phb->global_number, pe->addr, (i + 1)); } - eeh_pe_state_clear(pe, reset_state); + eeh_pe_state_clear(pe, reset_state, true); return ret; } @@ -1380,7 +1380,7 @@ static int eeh_pe_change_owner(struct eeh_pe *pe) ret = eeh_unfreeze_pe(pe); if (!ret) - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); + eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true); return ret; } @@ -1640,7 +1640,7 @@ static int eeh_pe_reenable_devices(struct eeh_pe *pe) /* The PE is still in frozen state */ ret = eeh_unfreeze_pe(pe); if (!ret) - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); + eeh_pe_state_clear(pe, 
EEH_PE_ISOLATED, true); return ret; } @@ -1668,7 +1668,7 @@ int eeh_pe_reset(struct eeh_pe *pe, int option) switch (option) { case EEH_RESET_DEACTIVATE: ret = eeh_ops->reset(pe, option); - eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true); if (ret) break; diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 5303429ac0e3..997aba0fe593 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -603,7 +603,7 @@ static int eeh_clear_pe_frozen_state(struct eeh_pe *root) if (i >= 3) return -EIO; } - eeh_pe_state_clear(root, EEH_PE_ISOLATED); + eeh_pe_state_clear(root, EEH_PE_ISOLATED, true); return 0; } @@ -624,14 +624,14 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) /* Issue reset */ ret = eeh_pe_reset_full(pe); if (ret) { - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); return ret; } /* Unfreeze the PE */ ret = eeh_clear_pe_frozen_state(pe); if (ret) { - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); return ret; } @@ -639,7 +639,7 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL); /* Clear recovery mode */ - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); return 0; } @@ -730,11 +730,11 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, eeh_add_virt_device(edev); } else { if (!driver_eeh_aware) - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); pci_hp_add_devices(bus); } } - eeh_pe_state_clear(pe, EEH_PE_KEEP); + eeh_pe_state_clear(pe, EEH_PE_KEEP, true); pe->tstamp = tstamp; pe->freeze_count = cnt; @@ -886,7 +886,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) * is still in frozen state. Clear it before * resuming the PE. */ - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); + eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true); result = PCI_ERS_RESULT_RECOVERED; } } @@ -963,7 +963,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); } else { - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); pci_lock_rescan_remove(); @@ -973,7 +973,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) return; } } - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); } /** @@ -1055,7 +1055,7 @@ void eeh_handle_special_event(void) continue; /* Notify all devices to be down */ - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); eeh_set_channel_state(pe, pci_channel_io_perm_failure); eeh_pe_report( "error_detected(permanent failure)", pe, diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 6fa2032e0594..8b578891f27c 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -657,62 +657,52 @@ void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode) } /** - * __eeh_pe_state_clear - Clear state for the PE + * eeh_pe_state_clear - Clear state for the PE * @data: EEH PE - * @flag: state + * @state: state + * @include_passed: include passed-through devices? * * The function is used to clear the indicated state from the * given PE. Besides, we also clear the check count of the PE * as well. 
*/ -static void *__eeh_pe_state_clear(struct eeh_pe *pe, void *flag) +void eeh_pe_state_clear(struct eeh_pe *root, int state, bool include_passed) { - int state = *((int *)flag); + struct eeh_pe *pe; struct eeh_dev *edev, *tmp; struct pci_dev *pdev; - /* Keep the state of permanently removed PE intact */ - if (pe->state & EEH_PE_REMOVED) - return NULL; - - pe->state &= ~state; - - /* - * Special treatment on clearing isolated state. Clear - * check count since last isolation and put all affected - * devices to normal state. - */ - if (!(state & EEH_PE_ISOLATED)) - return NULL; - - pe->check_count = 0; - eeh_pe_for_each_dev(pe, edev, tmp) { - pdev = eeh_dev_to_pci_dev(edev); - if (!pdev) + eeh_for_each_pe(root, pe) { + /* Keep the state of permanently removed PE intact */ + if (pe->state & EEH_PE_REMOVED) continue; - pdev->error_state = pci_channel_io_normal; + if (!include_passed && eeh_pe_passed(pe)) + continue; + + pe->state &= ~state; + + /* + * Special treatment on clearing isolated state. Clear + * check count since last isolation and put all affected + * devices to normal state. + */ + if (!(state & EEH_PE_ISOLATED)) + continue; + + pe->check_count = 0; + eeh_pe_for_each_dev(pe, edev, tmp) { + pdev = eeh_dev_to_pci_dev(edev); + if (!pdev) + continue; + + pdev->error_state = pci_channel_io_normal; + } + + /* Unblock PCI config access if required */ + if (pe->state & EEH_PE_CFG_RESTRICTED) + pe->state &= ~EEH_PE_CFG_BLOCKED; } - - /* Unblock PCI config access if required */ - if (pe->state & EEH_PE_CFG_RESTRICTED) - pe->state &= ~EEH_PE_CFG_BLOCKED; - - return NULL; -} - -/** - * eeh_pe_state_clear - Clear state for the PE and its children - * @pe: PE - * @state: state to be cleared - * - * When the PE and its children has been recovered from error, - * we need clear the error state for that. The function is used - * for the purpose. - */ -void eeh_pe_state_clear(struct eeh_pe *pe, int state) -{ - eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); } /* diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c index 0731d2f01dd9..3fa04dda1737 100644 --- a/arch/powerpc/kernel/eeh_sysfs.c +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -84,7 +84,7 @@ static ssize_t eeh_pe_state_store(struct device *dev, if (eeh_unfreeze_pe(edev->pe)) return -EIO; - eeh_pe_state_clear(edev->pe, EEH_PE_ISOLATED); + eeh_pe_state_clear(edev->pe, EEH_PE_ISOLATED, true); return count; } From 4d8e325d9df32ef00136d7885f0c65bf124edd22 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Thu, 29 Nov 2018 14:16:40 +1100 Subject: [PATCH 044/200] powerpc/eeh: Add include_passed to eeh_clear_pe_frozen_state() Add a parameter to eeh_clear_pe_frozen_state() that allows passed-through PEs to be excluded. Update callers to always pass true so that there is no change in behaviour. This is to prepare for follow-up work for passed-through devices. Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh_driver.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 997aba0fe593..91629b3f3b74 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -591,19 +591,21 @@ static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) * PE reset (for 3 times), we try to clear the frozen state * for 3 times as well. 
*/ -static int eeh_clear_pe_frozen_state(struct eeh_pe *root) +static int eeh_clear_pe_frozen_state(struct eeh_pe *root, bool include_passed) { struct eeh_pe *pe; int i; eeh_for_each_pe(root, pe) { - for (i = 0; i < 3; i++) - if (!eeh_unfreeze_pe(pe)) - break; - if (i >= 3) - return -EIO; + if (include_passed || !eeh_pe_passed(pe)) { + for (i = 0; i < 3; i++) + if (!eeh_unfreeze_pe(pe)) + break; + if (i >= 3) + return -EIO; + } } - eeh_pe_state_clear(root, EEH_PE_ISOLATED, true); + eeh_pe_state_clear(root, EEH_PE_ISOLATED, include_passed); return 0; } @@ -629,7 +631,7 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) } /* Unfreeze the PE */ - ret = eeh_clear_pe_frozen_state(pe); + ret = eeh_clear_pe_frozen_state(pe, true); if (ret) { eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); return ret; @@ -702,7 +704,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, eeh_pe_restore_bars(pe); /* Clear frozen state */ - rc = eeh_clear_pe_frozen_state(pe); + rc = eeh_clear_pe_frozen_state(pe, true); if (rc) { pci_unlock_rescan_remove(); return rc; From 1ef52073fd25ea97090eaff2c8b528ebf401a12a Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Thu, 29 Nov 2018 14:16:41 +1100 Subject: [PATCH 045/200] powerpc/eeh: Improve recovery of passed-through devices Currently, the EEH recovery process considers passed-through devices as if they were not EEH-aware, which can cause them to be removed as part of recovery. Because device removal requires cooperation from the guest, this may lead to the process stalling or deadlocking. Also, if devices are removed on the host side, they will be removed from their IOMMU group, making recovery in the guest impossible. Therefore, alter the recovery process so that passed-through devices are not removed but are instead left frozen (and marked isolated) until the guest performs its own recovery. If firmware thaws a passed-through PE because its parent PE has been thawed (because it was not passed through), re-freeze it.
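Condensed from the eeh_pe_refreeze_passed() helper added below, the re-freeze pass amounts to:

	eeh_for_each_pe(root, pe) {
		if (!eeh_pe_passed(pe))
			continue;
		state = eeh_ops->get_state(pe, NULL);
		if (state & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_MMIO_ENABLED))
			eeh_pe_set_option(pe, EEH_OPT_FREEZE_PE);	/* re-freeze for the guest */
	}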
Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/eeh.h | 2 +- arch/powerpc/include/asm/ppc-pci.h | 2 +- arch/powerpc/kernel/eeh.c | 47 +++++++++++++++++++++++------- arch/powerpc/kernel/eeh_driver.c | 32 +++++++++----------- drivers/vfio/vfio_spapr_eeh.c | 6 ++-- 5 files changed, 55 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 2ff123f745cc..0b655810f32d 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -300,7 +300,7 @@ void eeh_dev_release(struct pci_dev *pdev); struct eeh_pe *eeh_iommu_group_to_pe(struct iommu_group *group); int eeh_pe_set_option(struct eeh_pe *pe, int option); int eeh_pe_get_state(struct eeh_pe *pe); -int eeh_pe_reset(struct eeh_pe *pe, int option); +int eeh_pe_reset(struct eeh_pe *pe, int option, bool include_passed); int eeh_pe_configure(struct eeh_pe *pe); int eeh_pe_inject_err(struct eeh_pe *pe, int type, int func, unsigned long addr, unsigned long mask); diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index 08e094eaeccf..f191ef0d2a0a 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -53,7 +53,7 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev); struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr); void eeh_slot_error_detail(struct eeh_pe *pe, int severity); int eeh_pci_enable(struct eeh_pe *pe, int function); -int eeh_pe_reset_full(struct eeh_pe *pe); +int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed); void eeh_save_bars(struct eeh_dev *edev); int rtas_write_config(struct pci_dn *, int where, int size, u32 val); int rtas_read_config(struct pci_dn *, int where, int size, u32 *val); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 8d32587b07dc..416d1ef49762 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -877,6 +877,24 @@ static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag) return NULL; } +static void eeh_pe_refreeze_passed(struct eeh_pe *root) +{ + struct eeh_pe *pe; + int state; + + eeh_for_each_pe(root, pe) { + if (eeh_pe_passed(pe)) { + state = eeh_ops->get_state(pe, NULL); + if (state & + (EEH_STATE_MMIO_ACTIVE | EEH_STATE_MMIO_ENABLED)) { + pr_info("EEH: Passed-through PE PHB#%x-PE#%x was thawed by reset, re-freezing for safety.\n", + pe->phb->global_number, pe->addr); + eeh_pe_set_option(pe, EEH_OPT_FREEZE_PE); + } + } + } +} + /** * eeh_pe_reset_full - Complete a full reset process on the indicated PE * @pe: EEH PE @@ -889,7 +907,7 @@ static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag) * * This function will attempt to reset a PE three times before failing. */ -int eeh_pe_reset_full(struct eeh_pe *pe) +int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed) { int reset_state = (EEH_PE_RESET | EEH_PE_CFG_BLOCKED); int type = EEH_RESET_HOT; @@ -911,11 +929,11 @@ int eeh_pe_reset_full(struct eeh_pe *pe) /* Make three attempts at resetting the bus */ for (i = 0; i < 3; i++) { - ret = eeh_pe_reset(pe, type); + ret = eeh_pe_reset(pe, type, include_passed); if (ret) break; - ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE); + ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, include_passed); if (ret) break; @@ -936,6 +954,12 @@ int eeh_pe_reset_full(struct eeh_pe *pe) __func__, state, pe->phb->global_number, pe->addr, (i + 1)); } + /* Resetting the PE may have unfrozen child PEs. 
If those PEs have been + * (potentially) passed through to a guest, re-freeze them: + */ + if (!include_passed) + eeh_pe_refreeze_passed(pe); + eeh_pe_state_clear(pe, reset_state, true); return ret; } @@ -1611,13 +1635,12 @@ int eeh_pe_get_state(struct eeh_pe *pe) } EXPORT_SYMBOL_GPL(eeh_pe_get_state); -static int eeh_pe_reenable_devices(struct eeh_pe *pe) +static int eeh_pe_reenable_devices(struct eeh_pe *pe, bool include_passed) { struct eeh_dev *edev, *tmp; struct pci_dev *pdev; int ret = 0; - /* Restore config space */ eeh_pe_restore_bars(pe); /* @@ -1638,9 +1661,13 @@ static int eeh_pe_reenable_devices(struct eeh_pe *pe) } /* The PE is still in frozen state */ - ret = eeh_unfreeze_pe(pe); + if (include_passed || !eeh_pe_passed(pe)) { + ret = eeh_unfreeze_pe(pe); + } else + pr_info("EEH: Note: Leaving passthrough PHB#%x-PE#%x frozen.\n", + pe->phb->global_number, pe->addr); if (!ret) - eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true); + eeh_pe_state_clear(pe, EEH_PE_ISOLATED, include_passed); return ret; } @@ -1654,7 +1681,7 @@ static int eeh_pe_reenable_devices(struct eeh_pe *pe) * indicated type, either fundamental reset or hot reset. * PE reset is the most important part for error recovery. */ -int eeh_pe_reset(struct eeh_pe *pe, int option) +int eeh_pe_reset(struct eeh_pe *pe, int option, bool include_passed) { int ret = 0; @@ -1668,11 +1695,11 @@ int eeh_pe_reset(struct eeh_pe *pe, int option) switch (option) { case EEH_RESET_DEACTIVATE: ret = eeh_ops->reset(pe, option); - eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, include_passed); if (ret) break; - ret = eeh_pe_reenable_devices(pe); + ret = eeh_pe_reenable_devices(pe, include_passed); break; case EEH_RESET_HOT: case EEH_RESET_FUNDAMENTAL: diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 91629b3f3b74..89623962c727 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -510,22 +510,11 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) * support EEH. So we just care about PCI devices for * simplicity here. */ - if (!dev || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) - return NULL; - - /* - * We rely on count-based pcibios_release_device() to - * detach permanently offlined PEs. Unfortunately, that's - * not reliable enough. We might have the permanently - * offlined PEs attached, but we needn't take care of - * them and their child devices. 
- */ - if (eeh_dev_removed(edev)) + if (!eeh_edev_actionable(edev) || + (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) return NULL; if (rmv_data) { - if (eeh_pe_passed(edev->pe)) - return NULL; driver = eeh_pcid_get(dev); if (driver) { if (driver->err_handler && @@ -539,8 +528,8 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) } /* Remove it from PCI subsystem */ - pr_debug("EEH: Removing %s without EEH sensitive driver\n", - pci_name(dev)); + pr_info("EEH: Removing %s without EEH sensitive driver\n", + pci_name(dev)); edev->mode |= EEH_DEV_DISCONNECTED; if (rmv_data) rmv_data->removed_dev_count++; @@ -624,7 +613,7 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL); /* Issue reset */ - ret = eeh_pe_reset_full(pe); + ret = eeh_pe_reset_full(pe, true); if (ret) { eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); return ret; @@ -664,6 +653,11 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, time64_t tstamp; int cnt, rc; struct eeh_dev *edev; + struct eeh_pe *tmp_pe; + bool any_passed = false; + + eeh_for_each_pe(pe, tmp_pe) + any_passed |= eeh_pe_passed(tmp_pe); /* pcibios will clear the counter; save the value */ cnt = pe->freeze_count; @@ -676,7 +670,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * into pci_hp_add_devices(). */ eeh_pe_state_mark(pe, EEH_PE_KEEP); - if (driver_eeh_aware || (pe->type & EEH_PE_VF)) { + if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) { eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data); } else { pci_lock_rescan_remove(); @@ -693,7 +687,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * config accesses. So we prefer to block them. However, controlled * PCI config accesses initiated from EEH itself are allowed. */ - rc = eeh_pe_reset_full(pe); + rc = eeh_pe_reset_full(pe, false); if (rc) return rc; @@ -704,7 +698,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, eeh_pe_restore_bars(pe); /* Clear frozen state */ - rc = eeh_clear_pe_frozen_state(pe, true); + rc = eeh_clear_pe_frozen_state(pe, false); if (rc) { pci_unlock_rescan_remove(); return rc; diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c index 38edeb4729a9..1a742fe8f6db 100644 --- a/drivers/vfio/vfio_spapr_eeh.c +++ b/drivers/vfio/vfio_spapr_eeh.c @@ -74,13 +74,13 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, ret = eeh_pe_get_state(pe); break; case VFIO_EEH_PE_RESET_DEACTIVATE: - ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE); + ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true); break; case VFIO_EEH_PE_RESET_HOT: - ret = eeh_pe_reset(pe, EEH_RESET_HOT); + ret = eeh_pe_reset(pe, EEH_RESET_HOT, true); break; case VFIO_EEH_PE_RESET_FUNDAMENTAL: - ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL); + ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true); break; case VFIO_EEH_PE_CONFIGURE: ret = eeh_pe_configure(pe); From 195482c3633c5ce03c099c3e6b3f283b0ae116d6 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Thu, 29 Nov 2018 14:16:42 +1100 Subject: [PATCH 046/200] powerpc/eeh: Correct retries in eeh_pe_reset_full() Currently, eeh_pe_reset_full() will only attempt to reset a PE more than once if activating the reset state and deactivating it both succeed, but later polling shows that it hasn't become active. 
Change this so that it will try up to three times for any reason other than an unrecoverable slot error and adjust the message generation so that it is clear whether the reset has ultimately succeeded or failed. This allows the reset to succeed in some situations where it would currently fail. Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 416d1ef49762..98d8755ac4c8 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -912,7 +912,7 @@ int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed) int reset_state = (EEH_PE_RESET | EEH_PE_CFG_BLOCKED); int type = EEH_RESET_HOT; unsigned int freset = 0; - int i, state, ret; + int i, state = 0, ret; /* * Determine the type of reset to perform - hot or fundamental. @@ -930,28 +930,32 @@ int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed) /* Make three attempts at resetting the bus */ for (i = 0; i < 3; i++) { ret = eeh_pe_reset(pe, type, include_passed); - if (ret) - break; - - ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, include_passed); - if (ret) - break; + if (!ret) + ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, + include_passed); + if (ret) { + ret = -EIO; + pr_warn("EEH: Failure %d resetting PHB#%x-PE#%x (attempt %d)\n\n", + state, pe->phb->global_number, pe->addr, i + 1); + continue; + } + if (i) + pr_warn("EEH: PHB#%x-PE#%x: Successful reset (attempt %d)\n", + pe->phb->global_number, pe->addr, i + 1); /* Wait until the PE is in a functioning state */ state = eeh_wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); if (state < 0) { - pr_warn("%s: Unrecoverable slot failure on PHB#%x-PE#%x", - __func__, pe->phb->global_number, pe->addr); + pr_warn("EEH: Unrecoverable slot failure on PHB#%x-PE#%x", + pe->phb->global_number, pe->addr); ret = -ENOTRECOVERABLE; break; } if (eeh_state_active(state)) break; - - /* Set error in case this is our last attempt */ - ret = -EIO; - pr_warn("%s: Failure %d resetting PHB#%x-PE#%x\n (%d)\n", - __func__, state, pe->phb->global_number, pe->addr, (i + 1)); + else + pr_warn("EEH: PHB#%x-PE#%x: Slot inactive after reset: 0x%x (attempt %d)\n", + pe->phb->global_number, pe->addr, state, i + 1); } /* Resetting the PE may have unfrozen child PEs. If those PEs have been From 98ecc6768e8fdba95da1fc1efa0ef2d769e7fe1c Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Wed, 14 Nov 2018 13:32:18 +1030 Subject: [PATCH 047/200] powerpc/32: Include .branch_lt in data section When building a 32 bit powerpc kernel with Binutils 2.31.1 this warning is emitted: powerpc-linux-gnu-ld: warning: orphan section `.branch_lt' from `arch/powerpc/kernel/head_44x.o' being placed in section `.branch_lt' As of binutils commit 2d7ad24e8726 ("Support PLT16 relocs against local symbols")[1], 32 bit targets can produce .branch_lt sections in their output. Include these symbols in the .data section as the ppc64 kernel does.
[1] https://sourceware.org/git/gitweb.cgi?p=binutils-gdb.git;a=commitdiff;h=2d7ad24e8726ba4c45c9e67be08223a146a837ce Signed-off-by: Joel Stanley Reviewed-by: Alan Modra Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/vmlinux.lds.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 3ae4c959f95b..c3efb972c8c1 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -319,6 +319,7 @@ SECTIONS *(.sdata2) *(.got.plt) *(.got) *(.plt) + *(.branch_lt) } #else .data : AT(ADDR(.data) - LOAD_OFFSET) { From ebb0e13ead2ddc186a80b1b0235deeefc5a1a667 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 30 Jan 2019 10:46:00 -0200 Subject: [PATCH 048/200] powerpc/ptrace: Mitigate potential Spectre v1 'regno' is directly controlled by user space, hence leading to a potential exploitation of the Spectre variant 1 vulnerability. On PTRACE_SETREGS and PTRACE_GETREGS requests, user space passes the register number that would be read or written. This register number is called 'regno' which is part of the 'addr' syscall parameter. This 'regno' value is checked against the maximum pt_regs structure size, and then used to dereference it, which matches the initial part of a Spectre v1 (and Spectre v1.1) attack. The dereferenced value, then, is returned to userspace in the GETREGS case. This patch sanitizes 'regno' before using it to dereference pt_regs. Notice that given that speculation windows are large, the policy is to kill the speculation on the first load and not worry if it can be completed with a dependent load/store [1]. [1] https://marc.info/?l=linux-kernel&m=152449131114778&w=2 Signed-off-by: Breno Leitao Acked-by: Gustavo A. R. Silva Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/ptrace.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index cdd5d1d3ae41..7535f89e08cd 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -274,6 +275,8 @@ static int set_user_trap(struct task_struct *task, unsigned long trap) */ int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data) { + unsigned int regs_max; + if ((task->thread.regs == NULL) || !data) return -EIO; @@ -297,7 +300,9 @@ int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data) } #endif - if (regno < (sizeof(struct user_pt_regs) / sizeof(unsigned long))) { + regs_max = sizeof(struct user_pt_regs) / sizeof(unsigned long); + if (regno < regs_max) { + regno = array_index_nospec(regno, regs_max); *data = ((unsigned long *)task->thread.regs)[regno]; return 0; } @@ -321,6 +326,7 @@ int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data) return set_user_dscr(task, data); if (regno <= PT_MAX_PUT_REG) { + regno = array_index_nospec(regno, PT_MAX_PUT_REG + 1); ((unsigned long *)task->thread.regs)[regno] = data; return 0; } From b174b4fb919d118d9ac546b99a69574dfa431f7f Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 1 Feb 2019 11:42:01 +1100 Subject: [PATCH 049/200] powerpc/powernv: Escalate reset when IODA reset fails The IODA reset is used to flush out any OS controlled state from the PHB. This reset can fail if a PHB fatal error has occurred in early boot, probably because of a bad device.
We already do a fundamental reset of the device in some cases, so this patch just adds a test to force a full reset if firmware reports an error when performing the IODA reset. Signed-off-by: Oliver O'Halloran Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 1d6406a051f1..53982f887a7f 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3943,9 +3943,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, * shutdown PCI devices correctly. We already got IODA table * cleaned out. So we have to issue PHB reset to stop all PCI * transactions from previous kernel. The ppc_pci_reset_phbs - * kernel parameter will force this reset too. + * kernel parameter will force this reset too. Additionally, + * if the IODA reset above failed then use a bigger hammer. + * This can happen if we get a PHB fatal error in very early + * boot. */ - if (is_kdump_kernel() || pci_reset_phbs) { + if (is_kdump_kernel() || pci_reset_phbs || rc) { pr_info(" Issue PHB reset ...\n"); pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL); pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE); From 74ebe3e733b791f37415b3a1b917ee5035bc7364 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:02 +0100 Subject: [PATCH 050/200] net: pasemi: set a 64-bit DMA mask on the DMA device The pasemi driver never set a DMA mask, and given that the powerpc DMA mapping routines never check it this worked ok so far. But the generic dma-direct code which I plan to switch on for powerpc checks the DMA mask and fails unsupported mapping requests, so we need to make sure the proper 64-bit mask is set. Reported-by: Christian Zigotzky Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- drivers/net/ethernet/pasemi/pasemi_mac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/pasemi/pasemi_mac.c b/drivers/net/ethernet/pasemi/pasemi_mac.c index d21041554507..a5bf46310f60 100644 --- a/drivers/net/ethernet/pasemi/pasemi_mac.c +++ b/drivers/net/ethernet/pasemi/pasemi_mac.c @@ -1716,6 +1716,7 @@ pasemi_mac_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err = -ENODEV; goto out; } + dma_set_mask(&mac->dma_pdev->dev, DMA_BIT_MASK(64)); mac->iob_pdev = pci_get_device(PCI_VENDOR_ID_PASEMI, 0xa001, NULL); if (!mac->iob_pdev) { From fbce251baa6e357441961c78796e5e9fad682675 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:03 +0100 Subject: [PATCH 051/200] dma-direct: we might need GFP_DMA for 32-bit dma masks If there is no ZONE_DMA32 we might need GFP_DMA to be able to allocate memory that satisfies a 32-bit DMA mask.
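The fallback amounts to the retry below, condensed from the hunk that follows: if an allocation has not yet tried GFP_DMA and the physical mask still cannot be satisfied, restart the allocation from ZONE_DMA.

	if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) {
		gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
		goto again;	/* retry the whole allocation from ZONE_DMA */
	}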
Reported-by: Christian Zigotzky Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- kernel/dma/direct.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 355d16acee6d..d5bb51cf27c6 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -132,8 +132,7 @@ again: goto again; } - if (IS_ENABLED(CONFIG_ZONE_DMA) && - phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) { + if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } From a20f507f577b04f286c88a4885ac528e69f6f308 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:04 +0100 Subject: [PATCH 052/200] powerpc/dma: untangle vio_dma_mapping_ops from dma_iommu_ops vio_dma_mapping_ops currently does a lot of indirect calls through dma_iommu_ops, which not only make the code harder to follow but are also expensive in the post-spectre world. Unwind the indirect calls by calling the ppc_iommu_* or iommu_* APIs directly where applicable, or just use the dma_iommu_* methods directly where we can. Signed-off-by: Christoph Hellwig Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 1 + arch/powerpc/kernel/dma-iommu.c | 2 +- arch/powerpc/platforms/pseries/vio.c | 87 ++++++++++++---------------- 3 files changed, 38 insertions(+), 52 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 17524d222a7b..bd069a6542ab 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -237,6 +237,7 @@ static inline void iommu_del_device(struct device *dev) } #endif /* !CONFIG_IOMMU_API */ +u64 dma_iommu_get_required_mask(struct device *dev); #else static inline void *get_iommu_table_base(struct device *dev) diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 9c9bcaae2f75..dd8601cd20df 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -92,7 +92,7 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask) return 1; } -static u64 dma_iommu_get_required_mask(struct device *dev) +u64 dma_iommu_get_required_mask(struct device *dev) { struct iommu_table *tbl = get_iommu_table_base(dev); u64 mask; diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 1fad4649735b..7870bf99168c 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -492,7 +492,9 @@ static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size, return NULL; } - ret = dma_iommu_ops.alloc(dev, size, dma_handle, flag, attrs); + ret = iommu_alloc_coherent(dev, get_iommu_table_base(dev), size, + dma_handle, dev->coherent_dma_mask, flag, + dev_to_node(dev)); if (unlikely(ret == NULL)) { vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE)); atomic_inc(&viodev->cmo.allocs_failed); @@ -507,8 +509,7 @@ static void vio_dma_iommu_free_coherent(struct device *dev, size_t size, { struct vio_dev *viodev = to_vio_dev(dev); - dma_iommu_ops.free(dev, size, vaddr, dma_handle, attrs); - + iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle); vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE)); } @@ -518,22 +519,22 @@ static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page, unsigned long attrs) { struct vio_dev *viodev = to_vio_dev(dev); - struct iommu_table *tbl; + struct iommu_table *tbl = get_iommu_table_base(dev); dma_addr_t ret = DMA_MAPPING_ERROR;
tbl = get_iommu_table_base(dev); - if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) { - atomic_inc(&viodev->cmo.allocs_failed); - return ret; - } - - ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs); - if (unlikely(dma_mapping_error(dev, ret))) { - vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))); - atomic_inc(&viodev->cmo.allocs_failed); - } - + if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) + goto out_fail; + ret = iommu_map_page(dev, tbl, page, offset, size, device_to_mask(dev), + direction, attrs); + if (unlikely(ret == DMA_MAPPING_ERROR)) + goto out_deallocate; return ret; + +out_deallocate: + vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))); +out_fail: + atomic_inc(&viodev->cmo.allocs_failed); + return DMA_MAPPING_ERROR; } static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, @@ -542,11 +543,9 @@ static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, unsigned long attrs) { struct vio_dev *viodev = to_vio_dev(dev); - struct iommu_table *tbl; - - tbl = get_iommu_table_base(dev); - dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs); + struct iommu_table *tbl = get_iommu_table_base(dev); + iommu_unmap_page(tbl, dma_handle, size, direction, attrs); vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))); } @@ -555,34 +554,32 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, unsigned long attrs) { struct vio_dev *viodev = to_vio_dev(dev); - struct iommu_table *tbl; + struct iommu_table *tbl = get_iommu_table_base(dev); struct scatterlist *sgl; int ret, count; size_t alloc_size = 0; - tbl = get_iommu_table_base(dev); for_each_sg(sglist, sgl, nelems, count) alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl)); - if (vio_cmo_alloc(viodev, alloc_size)) { - atomic_inc(&viodev->cmo.allocs_failed); - return 0; - } - - ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs); - - if (unlikely(!ret)) { - vio_cmo_dealloc(viodev, alloc_size); - atomic_inc(&viodev->cmo.allocs_failed); - return ret; - } + if (vio_cmo_alloc(viodev, alloc_size)) + goto out_fail; + ret = ppc_iommu_map_sg(dev, tbl, sglist, nelems, device_to_mask(dev), + direction, attrs); + if (unlikely(!ret)) + goto out_deallocate; for_each_sg(sglist, sgl, ret, count) alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); if (alloc_size) vio_cmo_dealloc(viodev, alloc_size); - return ret; + +out_deallocate: + vio_cmo_dealloc(viodev, alloc_size); +out_fail: + atomic_inc(&viodev->cmo.allocs_failed); + return 0; } static void vio_dma_iommu_unmap_sg(struct device *dev, @@ -591,30 +588,18 @@ static void vio_dma_iommu_unmap_sg(struct device *dev, unsigned long attrs) { struct vio_dev *viodev = to_vio_dev(dev); - struct iommu_table *tbl; + struct iommu_table *tbl = get_iommu_table_base(dev); struct scatterlist *sgl; size_t alloc_size = 0; int count; - tbl = get_iommu_table_base(dev); for_each_sg(sglist, sgl, nelems, count) alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); - dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs); - + ppc_iommu_unmap_sg(tbl, sglist, nelems, direction, attrs); vio_cmo_dealloc(viodev, alloc_size); } -static int vio_dma_iommu_dma_supported(struct device *dev, u64 mask) -{ - return dma_iommu_ops.dma_supported(dev, mask); -} - -static u64 vio_dma_get_required_mask(struct device *dev) -{ - return dma_iommu_ops.get_required_mask(dev); -} - static const struct dma_map_ops vio_dma_mapping_ops = { .alloc = 
vio_dma_iommu_alloc_coherent, .free = vio_dma_iommu_free_coherent, @@ -623,8 +608,8 @@ static const struct dma_map_ops vio_dma_mapping_ops = { .unmap_sg = vio_dma_iommu_unmap_sg, .map_page = vio_dma_iommu_map_page, .unmap_page = vio_dma_iommu_unmap_page, - .dma_supported = vio_dma_iommu_dma_supported, - .get_required_mask = vio_dma_get_required_mask, + .dma_supported = dma_iommu_dma_supported, + .get_required_mask = dma_iommu_get_required_mask, }; /** From 8617a5c5bc001e52c40d6b2ece78e8f332039217 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:05 +0100 Subject: [PATCH 053/200] powerpc/dma: handle iommu bypass in dma_iommu_ops Add a new iommu_bypass flag to struct dev_archdata so that the dma_iommu implementation can handle the direct mapping transparently instead of switching ops around. Setting of this flag is controlled by a new pci_controller_ops method. Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/device.h | 5 ++ arch/powerpc/include/asm/dma-mapping.h | 8 +++ arch/powerpc/include/asm/pci-bridge.h | 2 + arch/powerpc/kernel/dma-iommu.c | 70 +++++++++++++++++++++++--- arch/powerpc/kernel/dma.c | 19 +++---- 5 files changed, 87 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 0245bfcaac32..1aa53318b4bc 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -19,6 +19,11 @@ struct iommu_table; * drivers/macintosh/macio_asic.c */ struct dev_archdata { + /* + * Set to %true if the dma_iommu_ops are requested to use a direct + * window instead of dynamically mapping memory. + */ + bool iommu_bypass : 1; /* * These two used to be a union. However, with the hybrid ops we need * both so here we store both a DMA offset for direct mappings and diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index ebf66809f2d3..ff86b863eceb 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -29,6 +29,14 @@ extern int dma_nommu_mmap_coherent(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t handle, size_t size, unsigned long attrs); +int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction, + unsigned long attrs); +dma_addr_t dma_nommu_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, unsigned long attrs); +int dma_nommu_dma_supported(struct device *dev, u64 mask); +u64 dma_nommu_get_required_mask(struct device *dev); #ifdef CONFIG_NOT_COHERENT_CACHE /* diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index aee4fcc24990..d7492dca6599 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -19,6 +19,8 @@ struct device_node; struct pci_controller_ops { void (*dma_dev_setup)(struct pci_dev *pdev); void (*dma_bus_setup)(struct pci_bus *bus); + bool (*iommu_bypass_supported)(struct pci_dev *pdev, + u64 mask); int (*probe_mode)(struct pci_bus *bus); diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index dd8601cd20df..fda92156b194 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -6,12 +6,30 @@ * busses using the iommu infrastructure */ +#include +#include #include /* * Generic iommu implementation */ +/* + * The coherent mask may 
be smaller than the real mask, check if we can + * really use a direct window. + */ +static inline bool dma_iommu_alloc_bypass(struct device *dev) +{ + return dev->archdata.iommu_bypass && + dma_nommu_dma_supported(dev, dev->coherent_dma_mask); +} + +static inline bool dma_iommu_map_bypass(struct device *dev, + unsigned long attrs) +{ + return dev->archdata.iommu_bypass; +} + /* Allocates a contiguous real buffer and creates mappings over it. * Returns the virtual address of the buffer and sets dma_handle * to the dma address (mapping) of the first page. @@ -20,6 +38,9 @@ static void *dma_iommu_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { + if (dma_iommu_alloc_bypass(dev)) + return __dma_nommu_alloc_coherent(dev, size, dma_handle, flag, + attrs); return iommu_alloc_coherent(dev, get_iommu_table_base(dev), size, dma_handle, dev->coherent_dma_mask, flag, dev_to_node(dev)); @@ -29,7 +50,11 @@ static void dma_iommu_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { - iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle); + if (dma_iommu_alloc_bypass(dev)) + __dma_nommu_free_coherent(dev, size, vaddr, dma_handle, attrs); + else + iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, + dma_handle); } /* Creates TCEs for a user provided buffer. The user buffer must be @@ -42,6 +67,9 @@ static dma_addr_t dma_iommu_map_page(struct device *dev, struct page *page, enum dma_data_direction direction, unsigned long attrs) { + if (dma_iommu_map_bypass(dev, attrs)) + return dma_nommu_map_page(dev, page, offset, size, direction, + attrs); return iommu_map_page(dev, get_iommu_table_base(dev), page, offset, size, device_to_mask(dev), direction, attrs); } @@ -51,8 +79,9 @@ static void dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction, unsigned long attrs) { - iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size, direction, - attrs); + if (!dma_iommu_map_bypass(dev, attrs)) + iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size, + direction, attrs); } @@ -60,6 +89,8 @@ static int dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, enum dma_data_direction direction, unsigned long attrs) { + if (dma_iommu_map_bypass(dev, attrs)) + return dma_nommu_map_sg(dev, sglist, nelems, direction, attrs); return ppc_iommu_map_sg(dev, get_iommu_table_base(dev), sglist, nelems, device_to_mask(dev), direction, attrs); } @@ -68,10 +99,20 @@ static void dma_iommu_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, enum dma_data_direction direction, unsigned long attrs) { - ppc_iommu_unmap_sg(get_iommu_table_base(dev), sglist, nelems, + if (!dma_iommu_map_bypass(dev, attrs)) + ppc_iommu_unmap_sg(get_iommu_table_base(dev), sglist, nelems, direction, attrs); } +static bool dma_iommu_bypass_supported(struct device *dev, u64 mask) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct pci_controller *phb = pci_bus_to_host(pdev->bus); + + return phb->controller_ops.iommu_bypass_supported && + phb->controller_ops.iommu_bypass_supported(pdev, mask); +} + /* We support DMA to/from any memory page via the iommu */ int dma_iommu_dma_supported(struct device *dev, u64 mask) { @@ -83,22 +124,39 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask) return 0; } + if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) { + dev->archdata.iommu_bypass = true; + dev_dbg(dev, 
"iommu: 64-bit OK, using fixed ops\n"); + return 1; + } + if (tbl->it_offset > (mask >> tbl->it_page_shift)) { dev_info(dev, "Warning: IOMMU offset too big for device mask\n"); dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n", mask, tbl->it_offset << tbl->it_page_shift); return 0; - } else - return 1; + } + + dev_dbg(dev, "iommu: not 64-bit, using default ops\n"); + dev->archdata.iommu_bypass = false; + return 1; } u64 dma_iommu_get_required_mask(struct device *dev) { struct iommu_table *tbl = get_iommu_table_base(dev); u64 mask; + if (!tbl) return 0; + if (dev_is_pci(dev)) { + u64 bypass_mask = dma_nommu_get_required_mask(dev); + + if (dma_iommu_bypass_supported(dev, bypass_mask)) + return bypass_mask; + } + mask = 1ULL < (fls_long(tbl->it_offset + tbl->it_size) - 1); mask += mask - 1; diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index b1903ebb2e9c..e5db4d3f8bea 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -40,7 +40,7 @@ static u64 __maybe_unused get_pfn_limit(struct device *dev) return pfn; } -static int dma_nommu_dma_supported(struct device *dev, u64 mask) +int dma_nommu_dma_supported(struct device *dev, u64 mask) { #ifdef CONFIG_PPC64 u64 limit = get_dma_offset(dev) + (memblock_end_of_DRAM() - 1); @@ -178,9 +178,9 @@ int dma_nommu_mmap_coherent(struct device *dev, struct vm_area_struct *vma, vma->vm_page_prot); } -static int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction direction, - unsigned long attrs) +int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction, + unsigned long attrs) { struct scatterlist *sg; int i; @@ -209,7 +209,7 @@ static void dma_nommu_unmap_sg(struct device *dev, struct scatterlist *sgl, __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); } -static u64 dma_nommu_get_required_mask(struct device *dev) +u64 dma_nommu_get_required_mask(struct device *dev) { u64 end, mask; @@ -221,12 +221,9 @@ static u64 dma_nommu_get_required_mask(struct device *dev) return mask; } -static inline dma_addr_t dma_nommu_map_page(struct device *dev, - struct page *page, - unsigned long offset, - size_t size, - enum dma_data_direction dir, - unsigned long attrs) +dma_addr_t dma_nommu_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, unsigned long attrs) { if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) __dma_sync_page(page, offset, size, dir); From cd7c11ed3a3e466d9503266191d3b51703ea52f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:06 +0100 Subject: [PATCH 054/200] powerpc/pseries: unwind dma_get_required_mask_pSeriesLP a bit Call dma_get_required_mask_pSeriesLP directly instead of dma_iommu_ops to simplify the code a bit. 
Signed-off-by: Christoph Hellwig Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 8fc8fe0b9848..9f7ac75c5687 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -1278,7 +1278,7 @@ static u64 dma_get_required_mask_pSeriesLP(struct device *dev) return DMA_BIT_MASK(64); } - return dma_iommu_ops.get_required_mask(dev); + return dma_iommu_get_required_mask(dev); } static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, From 9ae2fddeda4cbf6fcdf08e16a1fb5533d70462ab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:07 +0100 Subject: [PATCH 055/200] powerpc/pseries: use the generic iommu bypass code Use the generic iommu bypass code instead of overriding set_dma_mask. Signed-off-by: Christoph Hellwig Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/iommu.c | 104 +++++++------------------ 1 file changed, 29 insertions(+), 75 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 9f7ac75c5687..37d2ce3f55a3 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -978,7 +978,7 @@ static phys_addr_t ddw_memory_hotplug_max(void) * pdn: the parent pe node with the ibm,dma_window property * Future: also check if we can remap the base window for our base page size * - * returns the dma offset for use by dma_set_mask + * returns the dma offset for use by the direct mapped DMA code. */ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) { @@ -1198,87 +1198,40 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) iommu_add_device(pci->table_group, &dev->dev); } -static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) +static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) { - bool ddw_enabled = false; - struct device_node *pdn, *dn; - struct pci_dev *pdev; + struct device_node *dn = pci_device_to_OF_node(pdev), *pdn; const __be32 *dma_window = NULL; u64 dma_offset; - if (!dev->dma_mask) - return -EIO; - - if (!dev_is_pci(dev)) - goto check_mask; - - pdev = to_pci_dev(dev); - /* only attempt to use a new window if 64-bit DMA is requested */ - if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) { - dn = pci_device_to_OF_node(pdev); - dev_dbg(dev, "node is %pOF\n", dn); + if (dma_mask < DMA_BIT_MASK(64)) + return false; - /* - * the device tree might contain the dma-window properties - * per-device and not necessarily for the bus. So we need to - * search upwards in the tree until we either hit a dma-window - * property, OR find a parent with a table already allocated. - */ - for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; - pdn = pdn->parent) { - dma_window = of_get_property(pdn, "ibm,dma-window", NULL); - if (dma_window) - break; - } - if (pdn && PCI_DN(pdn)) { - dma_offset = enable_ddw(pdev, pdn); - if (dma_offset != 0) { - dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset); - set_dma_offset(dev, dma_offset); - set_dma_ops(dev, &dma_nommu_ops); - ddw_enabled = true; - } + dev_dbg(&pdev->dev, "node is %pOF\n", dn); + + /* + * the device tree might contain the dma-window properties + * per-device and not necessarily for the bus. 
So we need to + * search upwards in the tree until we either hit a dma-window + * property, OR find a parent with a table already allocated. + */ + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; + pdn = pdn->parent) { + dma_window = of_get_property(pdn, "ibm,dma-window", NULL); + if (dma_window) + break; + } + + if (pdn && PCI_DN(pdn)) { + dma_offset = enable_ddw(pdev, pdn); + if (dma_offset != 0) { + set_dma_offset(&pdev->dev, dma_offset); + return true; } } - /* fall back on iommu ops */ - if (!ddw_enabled && get_dma_ops(dev) != &dma_iommu_ops) { - dev_info(dev, "Restoring 32-bit DMA via iommu\n"); - set_dma_ops(dev, &dma_iommu_ops); - } - -check_mask: - if (!dma_supported(dev, dma_mask)) - return -EIO; - - *dev->dma_mask = dma_mask; - return 0; -} - -static u64 dma_get_required_mask_pSeriesLP(struct device *dev) -{ - if (!dev->dma_mask) - return 0; - - if (!disable_ddw && dev_is_pci(dev)) { - struct pci_dev *pdev = to_pci_dev(dev); - struct device_node *dn; - - dn = pci_device_to_OF_node(pdev); - - /* search upwards for ibm,dma-window */ - for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group; - dn = dn->parent) - if (of_get_property(dn, "ibm,dma-window", NULL)) - break; - /* if there is a ibm,ddw-applicable property require 64 bits */ - if (dn && PCI_DN(dn) && - of_get_property(dn, "ibm,ddw-applicable", NULL)) - return DMA_BIT_MASK(64); - } - - return dma_iommu_get_required_mask(dev); + return false; } static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, @@ -1373,8 +1326,9 @@ void iommu_init_early_pSeries(void) if (firmware_has_feature(FW_FEATURE_LPAR)) { pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP; pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP; - ppc_md.dma_set_mask = dma_set_mask_pSeriesLP; - ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP; + if (!disable_ddw) + pseries_pci_controller_ops.iommu_bypass_supported = + iommu_bypass_supported_pSeriesLP; } else { pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries; pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries; From cc9c156db500bda1487e25b451f9ff4d8dbee2ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:08 +0100 Subject: [PATCH 056/200] powerpc/cell: move dma direct window setup out of dma_configure Configure the dma settings at device setup time, and stop playing games with get_pci_dma_ops. This prepares for using the common dma_configure code later on. Includes fixes from Michael Ellerman. 
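The practical effect, sketched below with a hypothetical driver (example_probe() is not code from this patch): because the fixed-window decision is now made when the device is set up, the driver side needs nothing Cell-specific:

	static int example_probe(struct platform_device *pdev)
	{
		/* any fixed-window DMA offset was already set by
		 * cell_dma_dev_setup() when the device was added */
		if (dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)))
			return -EIO;
		/* ... normal driver initialisation ... */
		return 0;
	}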
Signed-off-by: Christoph Hellwig Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/cell/iommu.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index af2a3c15e0ec..4c609c0db5af 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -656,14 +656,21 @@ static const struct dma_map_ops dma_iommu_fixed_ops = { .unmap_page = dma_fixed_unmap_page, }; +static u64 cell_iommu_get_fixed_address(struct device *dev); + static void cell_dma_dev_setup(struct device *dev) { - if (get_pci_dma_ops() == &dma_iommu_ops) + if (get_pci_dma_ops() == &dma_iommu_ops) { + u64 addr = cell_iommu_get_fixed_address(dev); + + if (addr != OF_BAD_ADDR) + set_dma_offset(dev, addr + dma_iommu_fixed_base); set_iommu_table_base(dev, cell_get_iommu_table(dev)); - else if (get_pci_dma_ops() == &dma_nommu_ops) + } else if (get_pci_dma_ops() == &dma_nommu_ops) { set_dma_offset(dev, cell_dma_nommu_offset); - else + } else { BUG(); + } } static void cell_pci_dma_dev_setup(struct pci_dev *dev) @@ -894,7 +901,11 @@ static u64 cell_iommu_get_fixed_address(struct device *dev) const u32 *ranges = NULL; int i, len, best, naddr, nsize, pna, range_size; + /* We can be called for platform devices that have no of_node */ np = of_node_get(dev->of_node); + if (!np) + goto out; + while (1) { naddr = of_n_addr_cells(np); nsize = of_n_size_cells(np); @@ -949,19 +960,14 @@ static int dma_suported_and_switch(struct device *dev, u64 dma_mask) { if (dma_mask == DMA_BIT_MASK(64) && cell_iommu_get_fixed_address(dev) != OF_BAD_ADDR) { - u64 addr = cell_iommu_get_fixed_address(dev) + - dma_iommu_fixed_base; dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n"); - dev_dbg(dev, "iommu: fixed addr = %llx\n", addr); set_dma_ops(dev, &dma_iommu_fixed_ops); - set_dma_offset(dev, addr); return 1; } if (dma_iommu_dma_supported(dev, dma_mask)) { dev_dbg(dev, "iommu: not 64-bit, using default ops\n"); - set_dma_ops(dev, get_pci_dma_ops()); - cell_dma_dev_setup(dev); + set_dma_ops(dev, &dma_iommu_ops); return 1; } From ba767b5283c06e1a2fcdd1835c33e42b8fccd09c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:09 +0100 Subject: [PATCH 057/200] powerpc/cell: use the generic iommu bypass code This gets rid of a lot of clumsy code and finally allows us to mark dma_iommu_ops const. Includes fixes from Michael Ellerman. 
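With the ops switching gone, the iommu_fixed=weak case is expressed per mapping through the existing DMA attribute. A hedged usage sketch, not taken from the patch:

	/* on a machine booted with iommu_fixed=weak, only mappings that
	 * explicitly tolerate weak ordering take the bypass window */
	addr = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, DMA_TO_DEVICE,
				  DMA_ATTR_WEAK_ORDERING);

	/* a plain dma_map_page() carries no such attribute and therefore
	 * goes through the iommu, preserving strong ordering */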
Signed-off-by: Christoph Hellwig Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/dma-mapping.h | 2 +- arch/powerpc/include/asm/iommu.h | 6 ++ arch/powerpc/kernel/dma-iommu.c | 7 +- arch/powerpc/platforms/cell/iommu.c | 140 ++----------------------- 4 files changed, 20 insertions(+), 135 deletions(-) diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index ff86b863eceb..1d80174db8a4 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -74,7 +74,7 @@ static inline unsigned long device_to_mask(struct device *dev) * Available generic sets of operations */ #ifdef CONFIG_PPC64 -extern struct dma_map_ops dma_iommu_ops; +extern const struct dma_map_ops dma_iommu_ops; #endif extern const struct dma_map_ops dma_nommu_ops; diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index bd069a6542ab..6f00a892ebdf 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -319,5 +319,11 @@ extern void iommu_release_ownership(struct iommu_table *tbl); extern enum dma_data_direction iommu_tce_direction(unsigned long tce); extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir); +#ifdef CONFIG_PPC_CELL_NATIVE +extern bool iommu_fixed_is_weak; +#else +#define iommu_fixed_is_weak false +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index fda92156b194..5a0b5e863b08 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -20,14 +20,15 @@ */ static inline bool dma_iommu_alloc_bypass(struct device *dev) { - return dev->archdata.iommu_bypass && + return dev->archdata.iommu_bypass && !iommu_fixed_is_weak && dma_nommu_dma_supported(dev, dev->coherent_dma_mask); } static inline bool dma_iommu_map_bypass(struct device *dev, unsigned long attrs) { - return dev->archdata.iommu_bypass; + return dev->archdata.iommu_bypass && + (!iommu_fixed_is_weak || (attrs & DMA_ATTR_WEAK_ORDERING)); } /* Allocates a contiguous real buffer and creates mappings over it. 
@@ -163,7 +164,7 @@ u64 dma_iommu_get_required_mask(struct device *dev) return mask; } -struct dma_map_ops dma_iommu_ops = { +const struct dma_map_ops dma_iommu_ops = { .alloc = dma_iommu_alloc_coherent, .free = dma_iommu_free_coherent, .mmap = dma_nommu_mmap_coherent, diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 4c609c0db5af..6663cd3e6bb6 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -546,7 +546,7 @@ static unsigned long cell_dma_nommu_offset; static unsigned long dma_iommu_fixed_base; /* iommu_fixed_is_weak is set if booted with iommu_fixed=weak */ -static int iommu_fixed_is_weak; +bool iommu_fixed_is_weak; static struct iommu_table *cell_get_iommu_table(struct device *dev) { @@ -568,94 +568,6 @@ static struct iommu_table *cell_get_iommu_table(struct device *dev) return &window->table; } -/* A coherent allocation implies strong ordering */ - -static void *dma_fixed_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs) -{ - if (iommu_fixed_is_weak) - return iommu_alloc_coherent(dev, cell_get_iommu_table(dev), - size, dma_handle, - device_to_mask(dev), flag, - dev_to_node(dev)); - else - return dma_nommu_ops.alloc(dev, size, dma_handle, flag, - attrs); -} - -static void dma_fixed_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs) -{ - if (iommu_fixed_is_weak) - iommu_free_coherent(cell_get_iommu_table(dev), size, vaddr, - dma_handle); - else - dma_nommu_ops.free(dev, size, vaddr, dma_handle, attrs); -} - -static dma_addr_t dma_fixed_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction, - unsigned long attrs) -{ - if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING)) - return dma_nommu_ops.map_page(dev, page, offset, size, - direction, attrs); - else - return iommu_map_page(dev, cell_get_iommu_table(dev), page, - offset, size, device_to_mask(dev), - direction, attrs); -} - -static void dma_fixed_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction direction, - unsigned long attrs) -{ - if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING)) - dma_nommu_ops.unmap_page(dev, dma_addr, size, direction, - attrs); - else - iommu_unmap_page(cell_get_iommu_table(dev), dma_addr, size, - direction, attrs); -} - -static int dma_fixed_map_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction, - unsigned long attrs) -{ - if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING)) - return dma_nommu_ops.map_sg(dev, sg, nents, direction, attrs); - else - return ppc_iommu_map_sg(dev, cell_get_iommu_table(dev), sg, - nents, device_to_mask(dev), - direction, attrs); -} - -static void dma_fixed_unmap_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction, - unsigned long attrs) -{ - if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING)) - dma_nommu_ops.unmap_sg(dev, sg, nents, direction, attrs); - else - ppc_iommu_unmap_sg(cell_get_iommu_table(dev), sg, nents, - direction, attrs); -} - -static int dma_suported_and_switch(struct device *dev, u64 dma_mask); - -static const struct dma_map_ops dma_iommu_fixed_ops = { - .alloc = dma_fixed_alloc_coherent, - .free = dma_fixed_free_coherent, - .map_sg = dma_fixed_map_sg, - .unmap_sg = dma_fixed_unmap_sg, - .dma_supported = dma_suported_and_switch, - .map_page = 
dma_fixed_map_page, - .unmap_page = dma_fixed_unmap_page, -}; - static u64 cell_iommu_get_fixed_address(struct device *dev); static void cell_dma_dev_setup(struct device *dev) @@ -956,22 +868,10 @@ out: return dev_addr; } -static int dma_suported_and_switch(struct device *dev, u64 dma_mask) +static bool cell_pci_iommu_bypass_supported(struct pci_dev *pdev, u64 mask) { - if (dma_mask == DMA_BIT_MASK(64) && - cell_iommu_get_fixed_address(dev) != OF_BAD_ADDR) { - dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n"); - set_dma_ops(dev, &dma_iommu_fixed_ops); - return 1; - } - - if (dma_iommu_dma_supported(dev, dma_mask)) { - dev_dbg(dev, "iommu: not 64-bit, using default ops\n"); - set_dma_ops(dev, &dma_iommu_ops); - return 1; - } - - return 0; + return mask == DMA_BIT_MASK(64) && + cell_iommu_get_fixed_address(&pdev->dev) != OF_BAD_ADDR; } static void insert_16M_pte(unsigned long addr, unsigned long *ptab, @@ -1125,9 +1025,8 @@ static int __init cell_iommu_fixed_mapping_init(void) cell_iommu_setup_window(iommu, np, dbase, dsize, 0); } - dma_iommu_ops.dma_supported = dma_suported_and_switch; - set_pci_dma_ops(&dma_iommu_ops); - + cell_pci_controller_ops.iommu_bypass_supported = + cell_pci_iommu_bypass_supported; return 0; } @@ -1148,7 +1047,7 @@ static int __init setup_iommu_fixed(char *str) pciep = of_find_node_by_type(NULL, "pcie-endpoint"); if (strcmp(str, "weak") == 0 || (pciep && strcmp(str, "strong") != 0)) - iommu_fixed_is_weak = DMA_ATTR_WEAK_ORDERING; + iommu_fixed_is_weak = true; of_node_put(pciep); @@ -1156,26 +1055,6 @@ static int __init setup_iommu_fixed(char *str) } __setup("iommu_fixed=", setup_iommu_fixed); -static u64 cell_dma_get_required_mask(struct device *dev) -{ - const struct dma_map_ops *dma_ops; - - if (!dev->dma_mask) - return 0; - - if (!iommu_fixed_disabled && - cell_iommu_get_fixed_address(dev) != OF_BAD_ADDR) - return DMA_BIT_MASK(64); - - dma_ops = get_dma_ops(dev); - if (dma_ops->get_required_mask) - return dma_ops->get_required_mask(dev); - - WARN_ONCE(1, "no get_required_mask in %p ops", dma_ops); - - return DMA_BIT_MASK(64); -} - static int __init cell_iommu_init(void) { struct device_node *np; @@ -1192,10 +1071,9 @@ static int __init cell_iommu_init(void) /* Setup various callbacks */ cell_pci_controller_ops.dma_dev_setup = cell_pci_dma_dev_setup; - ppc_md.dma_get_required_mask = cell_dma_get_required_mask; if (!iommu_fixed_disabled && cell_iommu_fixed_mapping_init() == 0) - goto bail; + goto done; /* Create an iommu for each /axon node. */ for_each_node_by_name(np, "axon") { @@ -1212,7 +1090,7 @@ static int __init cell_iommu_init(void) continue; cell_iommu_init_one(np, SPIDER_DMA_OFFSET); } - + done: /* Setup default PCI iommu ops */ set_pci_dma_ops(&dma_iommu_ops); From ee69049e00c2b2ade9b8e3d4d0e69ccf00af91df Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:10 +0100 Subject: [PATCH 058/200] powerpc/dart: remove dead cleanup code in iommu_init_early_dart If dart_init failed we didn't have a chance to set up dma or controller ops yet, so there is no point in resetting them. 
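Condensed, the init path after this change reads roughly as follows (a sketch with details elided, not the literal function; dn is the DART device node looked up earlier in the function):

	void __init iommu_init_early_dart(struct pci_controller_ops *ops)
	{
		/* nothing DMA-related has been touched yet, so a failed
		 * dart_init() needs no unwinding at all */
		if (dart_init(dn) != 0)
			return;

		/* only reached on success */
		ops->dma_dev_setup = pci_dma_dev_setup_dart;
		ops->dma_bus_setup = pci_dma_bus_setup_dart;
		set_pci_dma_ops(&dma_iommu_ops);
	}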
Signed-off-by: Christoph Hellwig Signed-off-by: Michael Ellerman --- arch/powerpc/sysdev/dart_iommu.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index a5b40d1460f1..283ce04c5844 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -428,7 +428,7 @@ void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops) /* Initialize the DART HW */ if (dart_init(dn) != 0) - goto bail; + return; /* Setup bypass if supported */ if (dart_is_u4) @@ -439,15 +439,6 @@ void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops) /* Setup pci_dma ops */ set_pci_dma_ops(&dma_iommu_ops); - return; - - bail: - /* If init failed, use direct iommu and null setup functions */ - controller_ops->dma_dev_setup = NULL; - controller_ops->dma_bus_setup = NULL; - - /* Setup pci_dma ops */ - set_pci_dma_ops(&dma_nommu_ops); } #ifdef CONFIG_PM From 9f4a68d464a35166dcc84e54c7076f5f4ae5503a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:11 +0100 Subject: [PATCH 059/200] powerpc/dart: use the generic iommu bypass code Use the generic iommu bypass code instead of overriding set_dma_mask. Signed-off-by: Christoph Hellwig Signed-off-by: Michael Ellerman --- arch/powerpc/sysdev/dart_iommu.c | 47 ++++++++++++-------------------- 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index 283ce04c5844..d42ba645d51d 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -360,13 +360,6 @@ static void iommu_table_dart_setup(void) set_bit(iommu_table_dart.it_size - 1, iommu_table_dart.it_map); } -static void pci_dma_dev_setup_dart(struct pci_dev *dev) -{ - if (dart_is_u4) - set_dma_offset(&dev->dev, DART_U4_BYPASS_BASE); - set_iommu_table_base(&dev->dev, &iommu_table_dart); -} - static void pci_dma_bus_setup_dart(struct pci_bus *bus) { if (!iommu_table_dart_inited) { @@ -390,27 +383,18 @@ static bool dart_device_on_pcie(struct device *dev) return false; } -static int dart_dma_set_mask(struct device *dev, u64 dma_mask) +static void pci_dma_dev_setup_dart(struct pci_dev *dev) { - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - return -EIO; + if (dart_is_u4 && dart_device_on_pcie(&dev->dev)) + set_dma_offset(&dev->dev, DART_U4_BYPASS_BASE); + set_iommu_table_base(&dev->dev, &iommu_table_dart); +} - /* U4 supports a DART bypass, we use it for 64-bit capable - * devices to improve performances. However, that only works - * for devices connected to U4 own PCIe interface, not bridged - * through hypertransport. We need the device to support at - * least 40 bits of addresses. 
- */ - if (dart_device_on_pcie(dev) && dma_mask >= DMA_BIT_MASK(40)) { - dev_info(dev, "Using 64-bit DMA iommu bypass\n"); - set_dma_ops(dev, &dma_nommu_ops); - } else { - dev_info(dev, "Using 32-bit DMA via iommu\n"); - set_dma_ops(dev, &dma_iommu_ops); - } - - *dev->dma_mask = dma_mask; - return 0; +static bool iommu_bypass_supported_dart(struct pci_dev *dev, u64 mask) +{ + return dart_is_u4 && + dart_device_on_pcie(&dev->dev) && + mask >= DMA_BIT_MASK(40); } void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops) @@ -430,12 +414,15 @@ void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops) if (dart_init(dn) != 0) return; - /* Setup bypass if supported */ - if (dart_is_u4) - ppc_md.dma_set_mask = dart_dma_set_mask; - + /* + * U4 supports a DART bypass, we use it for 64-bit capable devices to + * improve performance. However, that only works for devices connected + * to the U4 own PCIe interface, not bridged through hypertransport. + * We need the device to support at least 40 bits of addresses. + */ controller_ops->dma_dev_setup = pci_dma_dev_setup_dart; controller_ops->dma_bus_setup = pci_dma_bus_setup_dart; + controller_ops->iommu_bypass_supported = iommu_bypass_supported_dart; /* Setup pci_dma ops */ set_pci_dma_ops(&dma_iommu_ops); From 661fcb450b535931f694873a9c16b07ee6b529d4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:12 +0100 Subject: [PATCH 060/200] powerpc/powernv: remove pnv_pci_ioda_pe_single_vendor This function is completely bogus - the fact that two PCIe devices come from the same vendor has absolutely nothing to say about the DMA capabilities and characteristics. Signed-off-by: Christoph Hellwig Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 28 ++--------------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 1d6406a051f1..fce7c6fe2970 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1756,31 +1756,6 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev */ } -static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe) -{ - unsigned short vendor = 0; - struct pci_dev *pdev; - - if (pe->device_count == 1) - return true; - - /* pe->pdev should be set if it's a single device, pe->pbus if not */ - if (!pe->pbus) - return true; - - list_for_each_entry(pdev, &pe->pbus->devices, bus_list) { - if (!vendor) { - vendor = pdev->vendor; - continue; - } - - if (pdev->vendor != vendor) - return false; - } - - return true; -} - /* * Reconfigure TVE#0 to be usable as 64-bit DMA space. * @@ -1881,7 +1856,8 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) */ if (dma_mask >> 32 && dma_mask > (memory_hotplug_max() + (1ULL << 32)) && - pnv_pci_ioda_pe_single_vendor(pe) && + /* pe->pdev should be set if it's a single device, pe->pbus if not */ + (pe->device_count == 1 || !pe->pbus) && phb->model == PNV_PHB_MODEL_PHB3) { /* Configure the bypass mode */ rc = pnv_pci_ioda_dma_64bit_bypass(pe); From 6248ac9441b0b8a628480eb3a15777fc0da7d22f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:13 +0100 Subject: [PATCH 061/200] powerpc/powernv: remove pnv_npu_dma_set_mask These devices are not PCIe devices and do not have associated dma map ops, so this is just dead code. 
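For reference, the only call site that could have reached this hook was the PCI branch of the powerpc dma_set_mask(), quoted here in condensed form (it is itself deleted by patch 064 later in this series):

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);
		struct pci_controller *phb = pci_bus_to_host(pdev->bus);

		if (phb->controller_ops.dma_set_mask)
			return phb->controller_ops.dma_set_mask(pdev, dma_mask);
	}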
Signed-off-by: Christoph Hellwig Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index fce7c6fe2970..8ecc50aeb29d 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3646,14 +3646,6 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .shutdown = pnv_pci_ioda_shutdown, }; -static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask) -{ - dev_err_once(&npdev->dev, - "%s operation unsupported for NVLink devices\n", - __func__); - return -EPERM; -} - static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { .dma_dev_setup = pnv_pci_dma_dev_setup, .setup_msi_irqs = pnv_setup_msi_irqs, @@ -3661,7 +3653,6 @@ static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { .enable_device_hook = pnv_pci_enable_device_hook, .window_alignment = pnv_pci_window_alignment, .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_npu_dma_set_mask, .shutdown = pnv_pci_ioda_shutdown, .disable_device = pnv_npu_disable_device, }; From 2d6ad41b2c210667b803585b62f9c1943bf02c91 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:14 +0100 Subject: [PATCH 062/200] powerpc/powernv: use the generic iommu bypass code Use the generic iommu bypass code instead of overriding set_dma_mask. Signed-off-by: Christoph Hellwig Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 95 ++++++----------------- 1 file changed, 25 insertions(+), 70 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 8ecc50aeb29d..2de7fcf54c40 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1825,89 +1825,45 @@ err: return -EIO; } -static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) +static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev, + u64 dma_mask) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); struct pnv_phb *phb = hose->private_data; struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; - uint64_t top; - bool bypass = false; - s64 rc; if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) return -ENODEV; pe = &phb->ioda.pe_array[pdn->pe_number]; if (pe->tce_bypass_enabled) { - top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; - bypass = (dma_mask >= top); + u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; + if (dma_mask >= top) + return true; } - if (bypass) { - dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n"); - set_dma_ops(&pdev->dev, &dma_nommu_ops); - } else { - /* - * If the device can't set the TCE bypass bit but still wants - * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to - * bypass the 32-bit region and be usable for 64-bit DMAs. - * The device needs to be able to address all of this space. 
- */ - if (dma_mask >> 32 && - dma_mask > (memory_hotplug_max() + (1ULL << 32)) && - /* pe->pdev should be set if it's a single device, pe->pbus if not */ - (pe->device_count == 1 || !pe->pbus) && - phb->model == PNV_PHB_MODEL_PHB3) { - /* Configure the bypass mode */ - rc = pnv_pci_ioda_dma_64bit_bypass(pe); - if (rc) - return rc; - /* 4GB offset bypasses 32-bit space */ - set_dma_offset(&pdev->dev, (1ULL << 32)); - set_dma_ops(&pdev->dev, &dma_nommu_ops); - } else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) { - /* - * Fail the request if a DMA mask between 32 and 64 bits - * was requested but couldn't be fulfilled. Ideally we - * would do this for 64-bits but historically we have - * always fallen back to 32-bits. - */ - return -ENOMEM; - } else { - dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); - set_dma_ops(&pdev->dev, &dma_iommu_ops); - } + /* + * If the device can't set the TCE bypass bit but still wants + * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to + * bypass the 32-bit region and be usable for 64-bit DMAs. + * The device needs to be able to address all of this space. + */ + if (dma_mask >> 32 && + dma_mask > (memory_hotplug_max() + (1ULL << 32)) && + /* pe->pdev should be set if it's a single device, pe->pbus if not */ + (pe->device_count == 1 || !pe->pbus) && + phb->model == PNV_PHB_MODEL_PHB3) { + /* Configure the bypass mode */ + s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe); + if (rc) + return rc; + /* 4GB offset bypasses 32-bit space */ + set_dma_offset(&pdev->dev, (1ULL << 32)); + return true; } - *pdev->dev.dma_mask = dma_mask; - /* Update peer npu devices */ - pnv_npu_try_dma_set_bypass(pdev, bypass); - - return 0; -} - -static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - struct pci_dn *pdn = pci_get_pdn(pdev); - struct pnv_ioda_pe *pe; - u64 end, mask; - - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return 0; - - pe = &phb->ioda.pe_array[pdn->pe_number]; - if (!pe->tce_bypass_enabled) - return __dma_get_required_mask(&pdev->dev); - - - end = pe->tce_bypass_base + memblock_end_of_DRAM(); - mask = 1ULL << (fls64(end) - 1); - mask += mask - 1; - - return mask; + return false; } static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) @@ -3634,6 +3590,7 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose) static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .dma_dev_setup = pnv_pci_dma_dev_setup, .dma_bus_setup = pnv_pci_dma_bus_setup, + .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported, .setup_msi_irqs = pnv_setup_msi_irqs, .teardown_msi_irqs = pnv_teardown_msi_irqs, .enable_device_hook = pnv_pci_enable_device_hook, @@ -3641,8 +3598,6 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .window_alignment = pnv_pci_window_alignment, .setup_bridge = pnv_pci_setup_bridge, .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_pci_ioda_dma_set_mask, - .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask, .shutdown = pnv_pci_ioda_shutdown, }; From ffe3dfd4e3598651a87651f3d59f144ee31f60fb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:15 +0100 Subject: [PATCH 063/200] powerpc/dma: stop overriding dma_get_required_mask The ppc_md and pci_controller_ops methods are unused now and can be removed. 
The dma_nommu implementation is identical to the generic one except for using max_pfn instead of calling into the memblock API, and all other dma_map_ops instances implement a method of their own. Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/device.h | 2 -- arch/powerpc/include/asm/dma-mapping.h | 2 -- arch/powerpc/include/asm/machdep.h | 2 -- arch/powerpc/include/asm/pci-bridge.h | 1 - arch/powerpc/kernel/dma.c | 29 -------------------------- kernel/dma/mapping.c | 2 -- 6 files changed, 38 deletions(-) diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 1aa53318b4bc..3814e1c2d4bc 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -59,6 +59,4 @@ struct pdev_archdata { u64 dma_mask; }; -#define ARCH_HAS_DMA_GET_REQUIRED_MASK - #endif /* _ASM_POWERPC_DEVICE_H */ diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 1d80174db8a4..dc7f7bcdf65d 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -112,7 +112,5 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off) #define HAVE_ARCH_DMA_SET_MASK 1 -extern u64 __dma_get_required_mask(struct device *dev); - #endif /* __KERNEL__ */ #endif /* _ASM_DMA_MAPPING_H */ diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 8311869005fa..7b70dcbce1b9 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -47,9 +47,7 @@ struct machdep_calls { #endif #endif /* CONFIG_PPC64 */ - /* Platform set_dma_mask and dma_get_required_mask overrides */ int (*dma_set_mask)(struct device *dev, u64 dma_mask); - u64 (*dma_get_required_mask)(struct device *dev); int (*probe)(void); void (*setup_arch)(void); /* Optional, may be NULL */ diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index d7492dca6599..236a7460b6ec 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -46,7 +46,6 @@ struct pci_controller_ops { #endif int (*dma_set_mask)(struct pci_dev *pdev, u64 dma_mask); - u64 (*dma_get_required_mask)(struct pci_dev *pdev); void (*shutdown)(struct pci_controller *hose); }; diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index e5db4d3f8bea..0d52107b90f0 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -318,35 +318,6 @@ int dma_set_mask(struct device *dev, u64 dma_mask) } EXPORT_SYMBOL(dma_set_mask); -u64 __dma_get_required_mask(struct device *dev) -{ - const struct dma_map_ops *dma_ops = get_dma_ops(dev); - - if (unlikely(dma_ops == NULL)) - return 0; - - if (dma_ops->get_required_mask) - return dma_ops->get_required_mask(dev); - - return DMA_BIT_MASK(8 * sizeof(dma_addr_t)); -} - -u64 dma_get_required_mask(struct device *dev) -{ - if (ppc_md.dma_get_required_mask) - return ppc_md.dma_get_required_mask(dev); - - if (dev_is_pci(dev)) { - struct pci_dev *pdev = to_pci_dev(dev); - struct pci_controller *phb = pci_bus_to_host(pdev->bus); - if (phb->controller_ops.dma_get_required_mask) - return phb->controller_ops.dma_get_required_mask(pdev); - } - - return __dma_get_required_mask(dev); -} -EXPORT_SYMBOL_GPL(dma_get_required_mask); - static int __init dma_init(void) { #ifdef CONFIG_IBMVIO diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index a11006b6d8e8..40c0af744692 100644 --- a/kernel/dma/mapping.c +++ 
b/kernel/dma/mapping.c @@ -207,7 +207,6 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, } EXPORT_SYMBOL(dma_mmap_attrs); -#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK static u64 dma_default_get_required_mask(struct device *dev) { u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT); @@ -238,7 +237,6 @@ u64 dma_get_required_mask(struct device *dev) return dma_default_get_required_mask(dev); } EXPORT_SYMBOL_GPL(dma_get_required_mask); -#endif #ifndef arch_dma_alloc_attrs #define arch_dma_alloc_attrs(dev) (true) From 662acad4067a2d2de8864c1231630945321aeef1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:16 +0100 Subject: [PATCH 064/200] powerpc/pci: remove the dma_set_mask pci_controller ops methods Unused now. Signed-off-by: Christoph Hellwig Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/pci-bridge.h | 2 -- arch/powerpc/kernel/dma.c | 7 ------- 2 files changed, 9 deletions(-) diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 236a7460b6ec..98e8b46aff97 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -45,8 +45,6 @@ struct pci_controller_ops { void (*teardown_msi_irqs)(struct pci_dev *pdev); #endif - int (*dma_set_mask)(struct pci_dev *pdev, u64 dma_mask); - void (*shutdown)(struct pci_controller *hose); }; diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 0d52107b90f0..5eca02315b2e 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -304,13 +304,6 @@ int dma_set_mask(struct device *dev, u64 dma_mask) if (ppc_md.dma_set_mask) return ppc_md.dma_set_mask(dev, dma_mask); - if (dev_is_pci(dev)) { - struct pci_dev *pdev = to_pci_dev(dev); - struct pci_controller *phb = pci_bus_to_host(pdev->bus); - if (phb->controller_ops.dma_set_mask) - return phb->controller_ops.dma_set_mask(pdev, dma_mask); - } - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) return -EIO; *dev->dma_mask = dma_mask; From e72849827aa24c7cf275ac081db52f3dc5a7cf89 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:17 +0100 Subject: [PATCH 065/200] powerpc/dma: remove the iommu fallback for coherent allocations All iommu capable platforms now always use the iommu code with the internal bypass, so there is no need for this magic anymore.
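A sketch of the resulting coherent-allocation flow; this condenses patches 053 and 065 together and is commentary, not code from either patch:

	buf = dma_alloc_coherent(dev, size, &handle, GFP_KERNEL);
	/* -> dma_iommu_alloc_coherent()
	 *      -> __dma_nommu_alloc_coherent()  if dma_iommu_alloc_bypass()
	 *         says the direct window covers the coherent mask
	 *      -> iommu_alloc_coherent()        otherwise
	 * dma_nommu_ops itself now points straight at the __dma_nommu_*
	 * helpers, with no iommu escape hatch left */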
Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 4 --- arch/powerpc/kernel/dma.c | 68 ++------------------------------------- 2 files changed, 2 insertions(+), 70 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2890d36eb531..b238c63a75cc 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -119,9 +119,6 @@ config GENERIC_HWEIGHT bool default y -config ARCH_HAS_DMA_SET_COHERENT_MASK - bool - config PPC bool default y @@ -130,7 +127,6 @@ config PPC # select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED - select ARCH_HAS_DMA_SET_COHERENT_MASK select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 5eca02315b2e..9def69c8f602 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -115,51 +115,6 @@ void __dma_nommu_free_coherent(struct device *dev, size_t size, } #endif /* !CONFIG_NOT_COHERENT_CACHE */ -static void *dma_nommu_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs) -{ - struct iommu_table *iommu; - - /* The coherent mask may be smaller than the real mask, check if - * we can really use the direct ops - */ - if (dma_nommu_dma_supported(dev, dev->coherent_dma_mask)) - return __dma_nommu_alloc_coherent(dev, size, dma_handle, - flag, attrs); - - /* Ok we can't ... do we have an iommu ? If not, fail */ - iommu = get_iommu_table_base(dev); - if (!iommu) - return NULL; - - /* Try to use the iommu */ - return iommu_alloc_coherent(dev, iommu, size, dma_handle, - dev->coherent_dma_mask, flag, - dev_to_node(dev)); -} - -static void dma_nommu_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs) -{ - struct iommu_table *iommu; - - /* See comments in dma_nommu_alloc_coherent() */ - if (dma_nommu_dma_supported(dev, dev->coherent_dma_mask)) - return __dma_nommu_free_coherent(dev, size, vaddr, dma_handle, - attrs); - /* Maybe we used an iommu ... */ - iommu = get_iommu_table_base(dev); - - /* If we hit that we should have never allocated in the first - * place so how come we are freeing ? - */ - if (WARN_ON(!iommu)) - return; - iommu_free_coherent(iommu, size, vaddr, dma_handle); -} - int dma_nommu_mmap_coherent(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t handle, size_t size, unsigned long attrs) @@ -262,8 +217,8 @@ static inline void dma_nommu_sync_single(struct device *dev, #endif const struct dma_map_ops dma_nommu_ops = { - .alloc = dma_nommu_alloc_coherent, - .free = dma_nommu_free_coherent, + .alloc = __dma_nommu_alloc_coherent, + .free = __dma_nommu_free_coherent, .mmap = dma_nommu_mmap_coherent, .map_sg = dma_nommu_map_sg, .unmap_sg = dma_nommu_unmap_sg, @@ -280,25 +235,6 @@ const struct dma_map_ops dma_nommu_ops = { }; EXPORT_SYMBOL(dma_nommu_ops); -int dma_set_coherent_mask(struct device *dev, u64 mask) -{ - if (!dma_supported(dev, mask)) { - /* - * We need to special case the direct DMA ops which can - * support a fallback for coherent allocations. 
There - * is no dma_op->set_coherent_mask() so we have to do - * things the hard way: - */ - if (get_dma_ops(dev) != &dma_nommu_ops || - get_iommu_table_base(dev) == NULL || - !dma_iommu_dma_supported(dev, mask)) - return -EIO; - } - dev->coherent_dma_mask = mask; - return 0; -} -EXPORT_SYMBOL(dma_set_coherent_mask); - int dma_set_mask(struct device *dev, u64 dma_mask) { if (ppc_md.dma_set_mask) From 7c1013b48778e203d4b17ea49ef0e450dd921664 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:18 +0100 Subject: [PATCH 066/200] powerpc/dma: remove get_pci_dma_ops This function is only used by the Cell iommu code, which can keep track of whether it is using the iommu internally just as well. Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/pci.h | 2 -- arch/powerpc/kernel/pci-common.c | 6 ------ arch/powerpc/platforms/cell/iommu.c | 17 ++++++++--------- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 0c72f1897063..6a1861a6301e 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -52,10 +52,8 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) #ifdef CONFIG_PCI extern void set_pci_dma_ops(const struct dma_map_ops *dma_ops); -extern const struct dma_map_ops *get_pci_dma_ops(void); #else /* CONFIG_PCI */ #define set_pci_dma_ops(d) -#define get_pci_dma_ops() NULL #endif #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 88e4f69a09e5..a84707680525 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -69,12 +69,6 @@ void set_pci_dma_ops(const struct dma_map_ops *dma_ops) pci_dma_ops = dma_ops; } -const struct dma_map_ops *get_pci_dma_ops(void) -{ - return pci_dma_ops; -} -EXPORT_SYMBOL(get_pci_dma_ops); - /* * This function should run under locking protection, specifically * hose_spinlock. 
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 6663cd3e6bb6..a3c4057a8f65 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -544,6 +544,7 @@ static struct cbe_iommu *cell_iommu_for_node(int nid) static unsigned long cell_dma_nommu_offset; static unsigned long dma_iommu_fixed_base; +static bool cell_iommu_enabled; /* iommu_fixed_is_weak is set if booted with iommu_fixed=weak */ bool iommu_fixed_is_weak; @@ -572,16 +573,14 @@ static u64 cell_iommu_get_fixed_address(struct device *dev); static void cell_dma_dev_setup(struct device *dev) { - if (get_pci_dma_ops() == &dma_iommu_ops) { + if (cell_iommu_enabled) { u64 addr = cell_iommu_get_fixed_address(dev); if (addr != OF_BAD_ADDR) set_dma_offset(dev, addr + dma_iommu_fixed_base); set_iommu_table_base(dev, cell_get_iommu_table(dev)); - } else if (get_pci_dma_ops() == &dma_nommu_ops) { - set_dma_offset(dev, cell_dma_nommu_offset); } else { - BUG(); + set_dma_offset(dev, cell_dma_nommu_offset); } } @@ -599,11 +598,11 @@ static int cell_of_bus_notify(struct notifier_block *nb, unsigned long action, if (action != BUS_NOTIFY_ADD_DEVICE) return 0; - /* We use the PCI DMA ops */ - dev->dma_ops = get_pci_dma_ops(); - + if (cell_iommu_enabled) + dev->dma_ops = &dma_iommu_ops; + else + dev->dma_ops = &dma_nommu_ops; cell_dma_dev_setup(dev); - return 0; } @@ -1093,7 +1092,7 @@ static int __init cell_iommu_init(void) done: /* Setup default PCI iommu ops */ set_pci_dma_ops(&dma_iommu_ops); - + cell_iommu_enabled = true; bail: /* Register callbacks on OF platform device addition/removal * to handle linking them to the right DMA operations From 391133fd5adaba319795cd96882d1ea405c41cf6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:19 +0100 Subject: [PATCH 067/200] powerpc/dma: move pci_dma_dev_setup_swiotlb to fsl_pci.c pci_dma_dev_setup_swiotlb is only used by the fsl_pci code, and closely related to it, so fsl_pci.c seems like a better place for it. 
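To make the limit computed by the moved helper concrete, a worked example with hypothetical window values (illustrative only, see the diff below for the actual code):

	/* an inbound window covering the first 2 GiB of RAM:
	 *   hose->dma_window_base_cur = 0x00000000
	 *   hose->dma_window_size     = 0x80000000
	 * gives
	 *   max_direct_dma_addr       = 0x80000000
	 * so any DMA that would touch memory at or above 2 GiB has to
	 * be bounced through swiotlb */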
Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/swiotlb.h | 2 -- arch/powerpc/kernel/dma-swiotlb.c | 11 ----------- arch/powerpc/sysdev/fsl_pci.c | 9 +++++++++ 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h index f65ecf57b66c..26a0f12b835b 100644 --- a/arch/powerpc/include/asm/swiotlb.h +++ b/arch/powerpc/include/asm/swiotlb.h @@ -18,8 +18,6 @@ extern const struct dma_map_ops powerpc_swiotlb_dma_ops; extern unsigned int ppc_swiotlb_enable; int __init swiotlb_setup_bus_notifier(void); -extern void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev); - #ifdef CONFIG_SWIOTLB void swiotlb_detect_4g(void); #else diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index 7d5fc9751622..42badc4bf536 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -62,17 +62,6 @@ const struct dma_map_ops powerpc_swiotlb_dma_ops = { .get_required_mask = swiotlb_powerpc_get_required, }; -void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) -{ - struct pci_controller *hose; - struct dev_archdata *sd; - - hose = pci_bus_to_host(pdev->bus); - sd = &pdev->dev.archdata; - sd->max_direct_dma_addr = - hose->dma_window_base_cur + hose->dma_window_size; -} - static int ppc_swiotlb_bus_notify(struct notifier_block *nb, unsigned long action, void *data) { diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 918be816b097..561f97d698cc 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -114,6 +114,15 @@ static struct pci_ops fsl_indirect_pcie_ops = static u64 pci64_dma_offset; #ifdef CONFIG_SWIOTLB +static void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct dev_archdata *sd = &pdev->dev.archdata; + + sd->max_direct_dma_addr = + hose->dma_window_base_cur + hose->dma_window_size; +} + static void setup_swiotlb_ops(struct pci_controller *hose) { if (ppc_swiotlb_enable) { From 74194cdaac41f6dfaacd9433f739dcbd83125d0b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:20 +0100 Subject: [PATCH 068/200] powerpc/dma: remove max_direct_dma_addr The max_direct_dma_addr duplicates the bus_dma_mask field in struct device. Use the generic field instead. 
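Restated as a sketch, the addressability check then needs no arch-private state. This mirrors the dma_capable() hunk in the diff below; note that the boundary comparison still carries the off-by-one that patch 069 fixes:

	static inline bool example_dma_capable(struct device *dev,
					       dma_addr_t addr, size_t size)
	{
		/* the bus limit now comes from the generic field */
		if (dev->bus_dma_mask && addr + size > dev->bus_dma_mask)
			return false;
		return addr + size - 1 <= *dev->dma_mask;
	}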
Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/device.h | 3 --- arch/powerpc/include/asm/dma-direct.h | 4 +--- arch/powerpc/kernel/dma-swiotlb.c | 21 +-------------------- arch/powerpc/kernel/dma.c | 5 ++--- arch/powerpc/sysdev/fsl_pci.c | 4 ++-- 5 files changed, 6 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 3814e1c2d4bc..a130be13ee83 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -38,9 +38,6 @@ struct dev_archdata { #ifdef CONFIG_IOMMU_API void *iommu_domain; #endif -#ifdef CONFIG_SWIOTLB - dma_addr_t max_direct_dma_addr; -#endif #ifdef CONFIG_PPC64 struct pci_dn *pci_data; #endif diff --git a/arch/powerpc/include/asm/dma-direct.h b/arch/powerpc/include/asm/dma-direct.h index 7702875aabb7..e00ab5d0612d 100644 --- a/arch/powerpc/include/asm/dma-direct.h +++ b/arch/powerpc/include/asm/dma-direct.h @@ -5,9 +5,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { #ifdef CONFIG_SWIOTLB - struct dev_archdata *sd = &dev->archdata; - - if (sd->max_direct_dma_addr && addr + size > sd->max_direct_dma_addr) + if (dev->bus_dma_mask && addr + size > dev->bus_dma_mask) return false; #endif diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index 42badc4bf536..0e21c318300e 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -24,21 +24,6 @@ unsigned int ppc_swiotlb_enable; -static u64 swiotlb_powerpc_get_required(struct device *dev) -{ - u64 end, mask, max_direct_dma_addr = dev->archdata.max_direct_dma_addr; - - end = memblock_end_of_DRAM(); - if (max_direct_dma_addr && end > max_direct_dma_addr) - end = max_direct_dma_addr; - end += get_dma_offset(dev); - - mask = 1ULL << (fls64(end) - 1); - mask += mask - 1; - - return mask; -} - /* * At the moment, all platforms that use this code only require * swiotlb to be used if we're operating on HIGHMEM. 
Since @@ -59,22 +44,18 @@ const struct dma_map_ops powerpc_swiotlb_dma_ops = { .sync_single_for_device = dma_direct_sync_single_for_device, .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu, .sync_sg_for_device = dma_direct_sync_sg_for_device, - .get_required_mask = swiotlb_powerpc_get_required, + .get_required_mask = dma_direct_get_required_mask, }; static int ppc_swiotlb_bus_notify(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; - struct dev_archdata *sd; /* We are only intereted in device addition */ if (action != BUS_NOTIFY_ADD_DEVICE) return 0; - sd = &dev->archdata; - sd->max_direct_dma_addr = 0; - /* May need to bounce if the device can't address all of DRAM */ if ((dma_get_mask(dev) + 1) < memblock_end_of_DRAM()) set_dma_ops(dev, &powerpc_swiotlb_dma_ops); diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 9def69c8f602..1e191eb3f0ec 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -30,11 +30,10 @@ static u64 __maybe_unused get_pfn_limit(struct device *dev) { u64 pfn = (dev->coherent_dma_mask >> PAGE_SHIFT) + 1; - struct dev_archdata __maybe_unused *sd = &dev->archdata; #ifdef CONFIG_SWIOTLB - if (sd->max_direct_dma_addr && dev->dma_ops == &powerpc_swiotlb_dma_ops) - pfn = min_t(u64, pfn, sd->max_direct_dma_addr >> PAGE_SHIFT); + if (dev->bus_dma_mask && dev->dma_ops == &powerpc_swiotlb_dma_ops) + pfn = min_t(u64, pfn, dev->bus_dma_mask >> PAGE_SHIFT); #endif return pfn; diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 561f97d698cc..b710cee023a2 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -117,9 +117,8 @@ static u64 pci64_dma_offset; static void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct dev_archdata *sd = &pdev->dev.archdata; - sd->max_direct_dma_addr = + pdev->dev.bus_dma_mask = hose->dma_window_base_cur + hose->dma_window_size; } @@ -144,6 +143,7 @@ static int fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask) * mapping that allows addressing any RAM address from across PCI. */ if (dev_is_pci(dev) && dma_mask >= pci64_dma_offset * 2 - 1) { + dev->bus_dma_mask = 0; set_dma_ops(dev, &dma_nommu_ops); set_dma_offset(dev, pci64_dma_offset); } From 9b18114c0be0193ebe772e45b3731602f056d54e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:21 +0100 Subject: [PATCH 069/200] powerpc/dma: fix an off-by-one in dma_capable We need to compare the last byte in the dma range and not the one after it for the bus_dma_mask, just like we do for the regular dma_mask. Fix this cleanly by merging the two comparisons into one.
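A worked example with hypothetical numbers makes the off-by-one visible. With bus_dma_mask = 0xffffffff and a buffer at addr = 0xfffff000 of size = 0x1000, the buffer spans 0xfffff000..0xffffffff, so its last byte is addressable; yet the old test rejected it:

	addr + size > dev->bus_dma_mask	/* 0x100000000 > 0xffffffff: false negative */

The merged form below compares last bytes against both limits at once:

	addr + size - 1 <= min_not_zero(*dev->dma_mask, dev->bus_dma_mask)

and min_not_zero() keeps devices without a bus_dma_mask (the field is zero) behaving as before.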
Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/dma-direct.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/dma-direct.h b/arch/powerpc/include/asm/dma-direct.h index e00ab5d0612d..92d8aed86422 100644 --- a/arch/powerpc/include/asm/dma-direct.h +++ b/arch/powerpc/include/asm/dma-direct.h @@ -4,15 +4,11 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { -#ifdef CONFIG_SWIOTLB - if (dev->bus_dma_mask && addr + size > dev->bus_dma_mask) - return false; -#endif - if (!dev->dma_mask) return false; - return addr + size - 1 <= *dev->dma_mask; + return addr + size - 1 <= + min_not_zero(*dev->dma_mask, dev->bus_dma_mask); } static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) From 11ddce15451eb5e3cb2c951dc5c8d86a2802017a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:22 +0100 Subject: [PATCH 070/200] dma-mapping, powerpc: simplify the arch dma_set_mask override Instead of letting the architecture supply all of dma_set_mask just give it an additional hook selected by Kconfig. Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/dma-mapping.h | 2 -- arch/powerpc/include/asm/machdep.h | 2 +- arch/powerpc/kernel/Makefile | 1 + arch/powerpc/kernel/dma-mask.c | 12 ++++++++++++ arch/powerpc/kernel/dma.c | 12 ------------ arch/powerpc/sysdev/fsl_pci.c | 8 +------- kernel/dma/Kconfig | 3 +++ kernel/dma/mapping.c | 9 +++++++-- 9 files changed, 26 insertions(+), 24 deletions(-) create mode 100644 arch/powerpc/kernel/dma-mask.c diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b238c63a75cc..39d07c02f7d8 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -886,6 +886,7 @@ config FSL_SOC config FSL_PCI bool + select ARCH_HAS_DMA_SET_MASK select PPC_INDIRECT_PCI select PCI_QUIRKS diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index dc7f7bcdf65d..16d45518d9bb 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -110,7 +110,5 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off) dev->archdata.dma_offset = off; } -#define HAVE_ARCH_DMA_SET_MASK 1 - #endif /* __KERNEL__ */ #endif /* _ASM_DMA_MAPPING_H */ diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 7b70dcbce1b9..2f0ca6560e47 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -47,7 +47,7 @@ struct machdep_calls { #endif #endif /* CONFIG_PPC64 */ - int (*dma_set_mask)(struct device *dev, u64 dma_mask); + void (*dma_set_mask)(struct device *dev, u64 dma_mask); int (*probe)(void); void (*setup_arch)(void); /* Optional, may be NULL */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index cb7f0bb9ee71..9bb12cd642ef 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -105,6 +105,7 @@ obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o +obj-$(CONFIG_ARCH_HAS_DMA_SET_MASK) += dma-mask.o pci64-$(CONFIG_PPC64) += pci_dn.o pci-hotplug.o isa-bridge.o obj-$(CONFIG_PCI) += pci_$(BITS).o $(pci64-y) \ diff --git a/arch/powerpc/kernel/dma-mask.c b/arch/powerpc/kernel/dma-mask.c 
new file mode 100644 index 000000000000..ffbbbc432612 --- /dev/null +++ b/arch/powerpc/kernel/dma-mask.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +void arch_dma_set_mask(struct device *dev, u64 dma_mask) +{ + if (ppc_md.dma_set_mask) + ppc_md.dma_set_mask(dev, dma_mask); +} +EXPORT_SYMBOL(arch_dma_set_mask); diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 1e191eb3f0ec..e422ca65d1cf 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -234,18 +234,6 @@ const struct dma_map_ops dma_nommu_ops = { }; EXPORT_SYMBOL(dma_nommu_ops); -int dma_set_mask(struct device *dev, u64 dma_mask) -{ - if (ppc_md.dma_set_mask) - return ppc_md.dma_set_mask(dev, dma_mask); - - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - return -EIO; - *dev->dma_mask = dma_mask; - return 0; -} -EXPORT_SYMBOL(dma_set_mask); - static int __init dma_init(void) { #ifdef CONFIG_IBMVIO diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index b710cee023a2..0c6510f340cb 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -133,11 +133,8 @@ static void setup_swiotlb_ops(struct pci_controller *hose) static inline void setup_swiotlb_ops(struct pci_controller *hose) {} #endif -static int fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask) +static void fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask) { - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - return -EIO; - /* * Fix up PCI devices that are able to DMA to the large inbound * mapping that allows addressing any RAM address from across PCI. @@ -147,9 +144,6 @@ static int fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask) set_dma_ops(dev, &dma_nommu_ops); set_dma_offset(dev, pci64_dma_offset); } - - *dev->dma_mask = dma_mask; - return 0; } static int setup_one_atmu(struct ccsr_pci __iomem *pci, diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index ca88b867e7fe..0711d18645de 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -16,6 +16,9 @@ config ARCH_DMA_ADDR_T_64BIT config ARCH_HAS_DMA_COHERENCE_H bool +config ARCH_HAS_DMA_SET_MASK + bool + config HAVE_GENERIC_DMA_COHERENT bool diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 40c0af744692..ef2aba503467 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -316,18 +316,23 @@ int dma_supported(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_supported); -#ifndef HAVE_ARCH_DMA_SET_MASK +#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK +void arch_dma_set_mask(struct device *dev, u64 mask); +#else +#define arch_dma_set_mask(dev, mask) do { } while (0) +#endif + int dma_set_mask(struct device *dev, u64 mask) { if (!dev->dma_mask || !dma_supported(dev, mask)) return -EIO; + arch_dma_set_mask(dev, mask); dma_check_mask(dev, mask); *dev->dma_mask = mask; return 0; } EXPORT_SYMBOL(dma_set_mask); -#endif #ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK int dma_set_coherent_mask(struct device *dev, u64 mask) From 18b53a2d479f3906b15edcabeb4135c8f22a11ec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:23 +0100 Subject: [PATCH 071/200] powerpc/dma: use phys_to_dma instead of get_dma_offset Use the standard portable helper instead of the powerpc specific one, which is about to go away. 
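The two spellings compute the same thing: __phys_to_dma() just adds the per-device offset that get_dma_offset() returns, roughly:

	static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr)
	{
		return paddr + get_dma_offset(dev);	/* archdata.dma_offset, or PCI_DRAM_OFFSET */
	}

So, for example, on a platform with archdata.dma_offset = 0x80000000, RAM at physical address 0x1000 is presented to the device as bus address 0x80001000 by either helper; only the name of the helper changes here, not the generated addresses.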
Signed-off-by: Christoph Hellwig Acked-by: Benjamin Herrenschmidt Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/dma.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index e422ca65d1cf..10fa4e18b4e9 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -6,7 +6,7 @@ */ #include -#include +#include #include #include #include @@ -42,7 +42,7 @@ static u64 __maybe_unused get_pfn_limit(struct device *dev) int dma_nommu_dma_supported(struct device *dev, u64 mask) { #ifdef CONFIG_PPC64 - u64 limit = get_dma_offset(dev) + (memblock_end_of_DRAM() - 1); + u64 limit = phys_to_dma(dev, (memblock_end_of_DRAM() - 1)); /* Limit fits in the mask, we are good */ if (mask >= limit) @@ -101,7 +101,7 @@ void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, return NULL; ret = page_address(page); memset(ret, 0, size); - *dma_handle = __pa(ret) + get_dma_offset(dev); + *dma_handle = phys_to_dma(dev,__pa(ret)); return ret; } @@ -140,7 +140,7 @@ int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl, int i; for_each_sg(sgl, sg, nents, i) { - sg->dma_address = sg_phys(sg) + get_dma_offset(dev); + sg->dma_address = phys_to_dma(dev, sg_phys(sg)); sg->dma_length = sg->length; if (attrs & DMA_ATTR_SKIP_CPU_SYNC) @@ -182,7 +182,7 @@ dma_addr_t dma_nommu_map_page(struct device *dev, struct page *page, if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) __dma_sync_page(page, offset, size, dir); - return page_to_phys(page) + offset + get_dma_offset(dev); + return phys_to_dma(dev, page_to_phys(page)) + offset; } static inline void dma_nommu_unmap_page(struct device *dev, From 6666cc17d7802b7dcbb073e7be1eee2cf6fa64d9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:24 +0100 Subject: [PATCH 072/200] powerpc/dma: remove dma_nommu_mmap_coherent The coherent cache version of this function is already functionally identical to the default version, and by defining the arch_dma_coherent_to_pfn hook the same is true for the noncoherent version as well.
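To see why the .mmap methods can go, here is a simplified sketch (not the literal common-code source) of what the generic mmap path does once the hook exists:

	unsigned long pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr);

	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);	/* noncoherent case */
	return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff,
			       vma->vm_end - vma->vm_start, vma->vm_page_prot);

which is step for step what the removed dma_nommu_mmap_coherent() did.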
Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/dma-mapping.h | 4 ---- arch/powerpc/kernel/dma-iommu.c | 1 - arch/powerpc/kernel/dma-swiotlb.c | 1 - arch/powerpc/kernel/dma.c | 19 ------------------- arch/powerpc/mm/dma-noncoherent.c | 7 +++++-- arch/powerpc/platforms/Kconfig.cputype | 1 + arch/powerpc/platforms/pseries/vio.c | 1 - 7 files changed, 6 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 16d45518d9bb..f19c486e7b3f 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -25,10 +25,6 @@ extern void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, extern void __dma_nommu_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs); -extern int dma_nommu_mmap_coherent(struct device *dev, - struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t handle, - size_t size, unsigned long attrs); int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction direction, unsigned long attrs); diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 5a0b5e863b08..ed8b60829a90 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -167,7 +167,6 @@ u64 dma_iommu_get_required_mask(struct device *dev) const struct dma_map_ops dma_iommu_ops = { .alloc = dma_iommu_alloc_coherent, .free = dma_iommu_free_coherent, - .mmap = dma_nommu_mmap_coherent, .map_sg = dma_iommu_map_sg, .unmap_sg = dma_iommu_unmap_sg, .dma_supported = dma_iommu_dma_supported, diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index 0e21c318300e..d5950a0cb758 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -34,7 +34,6 @@ unsigned int ppc_swiotlb_enable; const struct dma_map_ops powerpc_swiotlb_dma_ops = { .alloc = __dma_nommu_alloc_coherent, .free = __dma_nommu_free_coherent, - .mmap = dma_nommu_mmap_coherent, .map_sg = dma_direct_map_sg, .unmap_sg = dma_direct_unmap_sg, .dma_supported = swiotlb_dma_supported, diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 10fa4e18b4e9..841c43355a7e 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -114,24 +114,6 @@ void __dma_nommu_free_coherent(struct device *dev, size_t size, } #endif /* !CONFIG_NOT_COHERENT_CACHE */ -int dma_nommu_mmap_coherent(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t handle, size_t size, - unsigned long attrs) -{ - unsigned long pfn; - -#ifdef CONFIG_NOT_COHERENT_CACHE - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - pfn = __dma_get_coherent_pfn((unsigned long)cpu_addr); -#else - pfn = page_to_pfn(virt_to_page(cpu_addr)); -#endif - return remap_pfn_range(vma, vma->vm_start, - pfn + vma->vm_pgoff, - vma->vm_end - vma->vm_start, - vma->vm_page_prot); -} - int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction direction, unsigned long attrs) @@ -218,7 +200,6 @@ static inline void dma_nommu_sync_single(struct device *dev, const struct dma_map_ops dma_nommu_ops = { .alloc = __dma_nommu_alloc_coherent, .free = __dma_nommu_free_coherent, - .mmap = dma_nommu_mmap_coherent, .map_sg = dma_nommu_map_sg, .unmap_sg = dma_nommu_unmap_sg, .dma_supported = dma_nommu_dma_supported, diff --git a/arch/powerpc/mm/dma-noncoherent.c 
b/arch/powerpc/mm/dma-noncoherent.c index e955539686a4..ee95da19c82d 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -400,14 +401,16 @@ EXPORT_SYMBOL(__dma_sync_page); /* * Return the PFN for a given cpu virtual address returned by - * __dma_nommu_alloc_coherent. This is used by dma_mmap_coherent() + * __dma_nommu_alloc_coherent. */ -unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr) +long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr, + dma_addr_t dma_addr) { /* This should always be populated, so we don't test every * level. If that fails, we'll have a nice crash which * will be as good as a BUG_ON() */ + unsigned long cpu_addr = (unsigned long)vaddr; pgd_t *pgd = pgd_offset_k(cpu_addr); pud_t *pud = pud_offset(pgd, cpu_addr); pmd_t *pmd = pmd_offset(pud, cpu_addr); diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 8c7464c3f27f..48cd5aa90ad2 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -402,6 +402,7 @@ config NOT_COHERENT_CACHE bool depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \ GAMECUBE_COMMON || AMIGAONE + select ARCH_HAS_DMA_COHERENT_TO_PFN default n if PPC_47x default y diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 7870bf99168c..b7dc8bd41fd0 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -603,7 +603,6 @@ static void vio_dma_iommu_unmap_sg(struct device *dev, static const struct dma_map_ops vio_dma_mapping_ops = { .alloc = vio_dma_iommu_alloc_coherent, .free = vio_dma_iommu_free_coherent, - .mmap = dma_nommu_mmap_coherent, .map_sg = vio_dma_iommu_map_sg, .unmap_sg = vio_dma_iommu_unmap_sg, .map_page = vio_dma_iommu_map_page, From 5a47910d76f26e5fe6e9837872efdf8282ea76fb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:25 +0100 Subject: [PATCH 073/200] powerpc/dma: remove dma_nommu_get_required_mask This function is identical to the generic dma_direct_get_required_mask, except that the generic version also takes the bus_dma_mask into account, which could lead to incorrect results in the powerpc version.
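The arithmetic in the helper being removed is easiest to check with concrete numbers: on a machine whose DMA-visible memory ends at 4 GiB, end = 0x100000000 and fls64(end) = 33, so

	mask = 1ULL << (fls64(end) - 1);	/* 0x100000000 */
	mask += mask - 1;			/* 0x1ffffffff */

i.e. the smallest all-ones mask covering the highest DMA address. The generic helper performs the same computation, with the bus_dma_mask handling noted above on top.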
Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/dma-mapping.h | 1 - arch/powerpc/kernel/dma-iommu.c | 2 +- arch/powerpc/kernel/dma.c | 14 +------------- 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index f19c486e7b3f..af9a32d4fcf8 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -32,7 +32,6 @@ dma_addr_t dma_nommu_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs); int dma_nommu_dma_supported(struct device *dev, u64 mask); -u64 dma_nommu_get_required_mask(struct device *dev); #ifdef CONFIG_NOT_COHERENT_CACHE /* diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index ed8b60829a90..4377b69a9d42 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -152,7 +152,7 @@ u64 dma_iommu_get_required_mask(struct device *dev) return 0; if (dev_is_pci(dev)) { - u64 bypass_mask = dma_nommu_get_required_mask(dev); + u64 bypass_mask = dma_direct_get_required_mask(dev); if (dma_iommu_bypass_supported(dev, bypass_mask)) return bypass_mask; diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 841c43355a7e..46afc66cc271 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -145,18 +145,6 @@ static void dma_nommu_unmap_sg(struct device *dev, struct scatterlist *sgl, __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); } -u64 dma_nommu_get_required_mask(struct device *dev) -{ - u64 end, mask; - - end = memblock_end_of_DRAM() + get_dma_offset(dev); - - mask = 1ULL << (fls64(end) - 1); - mask += mask - 1; - - return mask; -} - dma_addr_t dma_nommu_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -205,7 +193,7 @@ const struct dma_map_ops dma_nommu_ops = { .dma_supported = dma_nommu_dma_supported, .map_page = dma_nommu_map_page, .unmap_page = dma_nommu_unmap_page, - .get_required_mask = dma_nommu_get_required_mask, + .get_required_mask = dma_direct_get_required_mask, #ifdef CONFIG_NOT_COHERENT_CACHE .sync_single_for_cpu = dma_nommu_sync_single, .sync_single_for_device = dma_nommu_sync_single, From 65a21b71f948406201e4f62e41f06513350ca390 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:26 +0100 Subject: [PATCH 074/200] powerpc/dma: remove dma_nommu_dma_supported This function is largely identical to the generic version used everywhere else. Replace it with the generic version. 
Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/dma-mapping.h | 1 - arch/powerpc/kernel/dma-iommu.c | 2 +- arch/powerpc/kernel/dma.c | 25 +------------------------ 3 files changed, 2 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index af9a32d4fcf8..cdf70aaeafeb 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -31,7 +31,6 @@ int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl, dma_addr_t dma_nommu_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs); -int dma_nommu_dma_supported(struct device *dev, u64 mask); #ifdef CONFIG_NOT_COHERENT_CACHE /* diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 4377b69a9d42..67fbfaa4e3b2 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -21,7 +21,7 @@ static inline bool dma_iommu_alloc_bypass(struct device *dev) { return dev->archdata.iommu_bypass && !iommu_fixed_is_weak && - dma_nommu_dma_supported(dev, dev->coherent_dma_mask); + dma_direct_supported(dev, dev->coherent_dma_mask); } static inline bool dma_iommu_map_bypass(struct device *dev, diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 46afc66cc271..a3546a82f6d7 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -39,29 +39,6 @@ static u64 __maybe_unused get_pfn_limit(struct device *dev) return pfn; } -int dma_nommu_dma_supported(struct device *dev, u64 mask) -{ -#ifdef CONFIG_PPC64 - u64 limit = phys_to_dma(dev, (memblock_end_of_DRAM() - 1)); - - /* Limit fits in the mask, we are good */ - if (mask >= limit) - return 1; - -#ifdef CONFIG_FSL_SOC - /* - * Freescale gets another chance via ZONE_DMA, however - * that will have to be refined if/when they support iommus - */ - return 1; -#endif - /* Sorry ... */ - return 0; -#else - return 1; -#endif -} - #ifndef CONFIG_NOT_COHERENT_CACHE void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, @@ -190,7 +167,7 @@ const struct dma_map_ops dma_nommu_ops = { .free = __dma_nommu_free_coherent, .map_sg = dma_nommu_map_sg, .unmap_sg = dma_nommu_unmap_sg, - .dma_supported = dma_nommu_dma_supported, + .dma_supported = dma_direct_supported, .map_page = dma_nommu_map_page, .unmap_page = dma_nommu_unmap_page, .get_required_mask = dma_direct_get_required_mask, From feee96440c9c5fdf47f8c8079c104fc8082924a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:27 +0100 Subject: [PATCH 075/200] swiotlb: remove swiotlb_dma_supported The only user left is powerpc, but even there the generic dma-direct version works just as well, given that we guarantee that the swiotlb buffer must always be addressable. 
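The guarantee can be read off the check deleted below: swiotlb_dma_supported() only verified that the last byte of the bounce buffer is reachable,

	__phys_to_dma(hwdev, io_tlb_end - 1) <= mask

and since the bounce buffer is allocated in low, always-addressable memory, any mask that passes the generic dma_direct_supported() check necessarily satisfies this too (a sketch of the argument, not a formal one).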
Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/dma-swiotlb.c | 2 +- include/linux/swiotlb.h | 3 --- kernel/dma/swiotlb.c | 12 ------------ 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index d5950a0cb758..6d2677b2daa6 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -36,7 +36,7 @@ const struct dma_map_ops powerpc_swiotlb_dma_ops = { .free = __dma_nommu_free_coherent, .map_sg = dma_direct_map_sg, .unmap_sg = dma_direct_unmap_sg, - .dma_supported = swiotlb_dma_supported, + .dma_supported = dma_direct_supported, .map_page = dma_direct_map_page, .unmap_page = dma_direct_unmap_page, .sync_single_for_cpu = dma_direct_sync_single_for_cpu, diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 7c007ed7505f..54254388899e 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -60,9 +60,6 @@ extern void swiotlb_tbl_sync_single(struct device *hwdev, size_t size, enum dma_data_direction dir, enum dma_sync_target target); -extern int -swiotlb_dma_supported(struct device *hwdev, u64 mask); - #ifdef CONFIG_SWIOTLB extern enum swiotlb_force swiotlb_force; extern phys_addr_t io_tlb_start, io_tlb_end; diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d6361776dc5c..cbf3498a46f9 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -648,15 +648,3 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, return true; } - -/* - * Return whether the given device DMA address mask can be supported - * properly. For example, if your device can only drive the low 24-bits - * during bus mastering, then you would pass 0x00ffffff as the mask to - * this function. - */ -int -swiotlb_dma_supported(struct device *hwdev, u64 mask) -{ - return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask; -} From 31f940afda6add7a7bb182adde97e615e5355c6d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:28 +0100 Subject: [PATCH 076/200] powerpc/dma: use the dma-direct allocator for coherent platforms The generic code allows a few nice things such as node local allocations and dipping into the CMA area. The lookup of the right zone for a given dma mask works a little differently, but the results should be the same.
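The different lookup works off the mask instead of walking max_zone_pfns; as a rough sketch of the generic allocator's zone choice (simplified, details differ across kernel versions):

	/* pick a GFP zone that the coherent mask can address */
	if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
		gfp |= GFP_DMA;		/* GFP_DMA32 where that zone exists */

	page = alloc_pages_node(dev_to_node(dev), gfp, get_order(size));

which ends up in the same zone the removed dma_pfn_limit_to_zone() table walk (see the hunks below) would have picked for the same device.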
Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/pgtable.h | 1 - arch/powerpc/kernel/dma-iommu.c | 5 +-- arch/powerpc/kernel/dma-swiotlb.c | 4 +- arch/powerpc/kernel/dma.c | 69 +++--------------------------- arch/powerpc/mm/mem.c | 22 ---------- 5 files changed, 9 insertions(+), 92 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index dad1d27e196d..505550fb2935 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -66,7 +66,6 @@ extern unsigned long empty_zero_page[]; extern pgd_t swapper_pg_dir[]; -int dma_pfn_limit_to_zone(u64 pfn_limit); extern void paging_init(void); /* diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 67fbfaa4e3b2..c75ba4e3a50c 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -40,8 +40,7 @@ static void *dma_iommu_alloc_coherent(struct device *dev, size_t size, unsigned long attrs) { if (dma_iommu_alloc_bypass(dev)) - return __dma_nommu_alloc_coherent(dev, size, dma_handle, flag, - attrs); + return dma_direct_alloc(dev, size, dma_handle, flag, attrs); return iommu_alloc_coherent(dev, get_iommu_table_base(dev), size, dma_handle, dev->coherent_dma_mask, flag, dev_to_node(dev)); @@ -52,7 +51,7 @@ static void dma_iommu_free_coherent(struct device *dev, size_t size, unsigned long attrs) { if (dma_iommu_alloc_bypass(dev)) - __dma_nommu_free_coherent(dev, size, vaddr, dma_handle, attrs); + dma_direct_free(dev, size, vaddr, dma_handle, attrs); else iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle); diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index 6d2677b2daa6..3a15a7d945e9 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -32,8 +32,8 @@ unsigned int ppc_swiotlb_enable; * for everything else. */ const struct dma_map_ops powerpc_swiotlb_dma_ops = { - .alloc = __dma_nommu_alloc_coherent, - .free = __dma_nommu_free_coherent, + .alloc = dma_direct_alloc, + .free = dma_direct_free, .map_sg = dma_direct_map_sg, .unmap_sg = dma_direct_unmap_sg, .dma_supported = dma_direct_supported, diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index a3546a82f6d7..f983f8d435a6 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -27,70 +27,6 @@ * default the offset is PCI_DRAM_OFFSET. */ -static u64 __maybe_unused get_pfn_limit(struct device *dev) -{ - u64 pfn = (dev->coherent_dma_mask >> PAGE_SHIFT) + 1; - -#ifdef CONFIG_SWIOTLB - if (dev->bus_dma_mask && dev->dma_ops == &powerpc_swiotlb_dma_ops) - pfn = min_t(u64, pfn, dev->bus_dma_mask >> PAGE_SHIFT); -#endif - - return pfn; -} - -#ifndef CONFIG_NOT_COHERENT_CACHE -void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs) -{ - void *ret; - struct page *page; - int node = dev_to_node(dev); -#ifdef CONFIG_FSL_SOC - u64 pfn = get_pfn_limit(dev); - int zone; - - /* - * This code should be OK on other platforms, but we have drivers that - * don't set coherent_dma_mask. As a workaround we just ifdef it. This - * whole routine needs some serious cleanup. 
- */ - - zone = dma_pfn_limit_to_zone(pfn); - if (zone < 0) { - dev_err(dev, "%s: No suitable zone for pfn %#llx\n", - __func__, pfn); - return NULL; - } - - switch (zone) { -#ifdef CONFIG_ZONE_DMA - case ZONE_DMA: - flag |= GFP_DMA; - break; -#endif - }; -#endif /* CONFIG_FSL_SOC */ - - page = alloc_pages_node(node, flag, get_order(size)); - if (page == NULL) - return NULL; - ret = page_address(page); - memset(ret, 0, size); - *dma_handle = phys_to_dma(dev,__pa(ret)); - - return ret; -} - -void __dma_nommu_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs) -{ - free_pages((unsigned long)vaddr, get_order(size)); -} -#endif /* !CONFIG_NOT_COHERENT_CACHE */ - int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction direction, unsigned long attrs) @@ -163,8 +99,13 @@ static inline void dma_nommu_sync_single(struct device *dev, #endif const struct dma_map_ops dma_nommu_ops = { +#ifdef CONFIG_NOT_COHERENT_CACHE .alloc = __dma_nommu_alloc_coherent, .free = __dma_nommu_free_coherent, +#else + .alloc = dma_direct_alloc, + .free = dma_direct_free, +#endif .map_sg = dma_nommu_map_sg, .unmap_sg = dma_nommu_unmap_sg, .dma_supported = dma_direct_supported, diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 33cc6f676fa6..a10ee3645a6c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -69,15 +69,12 @@ pte_t *kmap_pte; EXPORT_SYMBOL(kmap_pte); pgprot_t kmap_prot; EXPORT_SYMBOL(kmap_prot); -#define TOP_ZONE ZONE_HIGHMEM static inline pte_t *virt_to_kpte(unsigned long vaddr) { return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), vaddr), vaddr); } -#else -#define TOP_ZONE ZONE_NORMAL #endif int page_is_ram(unsigned long pfn) @@ -261,25 +258,6 @@ static int __init mark_nonram_nosave(void) */ static unsigned long max_zone_pfns[MAX_NR_ZONES]; -/* - * Find the least restrictive zone that is entirely below the - * specified pfn limit. Returns < 0 if no suitable zone is found. - * - * pfn_limit must be u64 because it can exceed 32 bits even on 32-bit - * systems -- the DMA limit can be higher than any possible real pfn. - */ -int dma_pfn_limit_to_zone(u64 pfn_limit) -{ - int i; - - for (i = TOP_ZONE; i >= 0; i--) { - if (max_zone_pfns[i] <= pfn_limit) - return i; - } - - return -EPERM; -} - /* * paging_init() sets up the page tables - in fact we've already done this. */ From 461db2bdbf3c978e76dd10a04a63fa06bb29114f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:29 +0100 Subject: [PATCH 077/200] powerpc/dma: use the dma_direct mapping routines Switch the streaming DMA mapping and ownership transfer methods to the functionally identical dma_direct_ versions. Factor the cache maintenance helpers into the form expected by the common code for that.
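The form expected by the common code means hooks keyed on a physical address, invoked by the generic streaming ops; a simplified sketch of the caller side (assuming the dma-direct code of this era, exact details may vary):

	dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
			unsigned long offset, size_t size,
			enum dma_data_direction dir, unsigned long attrs)
	{
		phys_addr_t phys = page_to_phys(page) + offset;

		if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
			arch_sync_dma_for_device(dev, phys, size, dir);
		return phys_to_dma(dev, phys);
	}

so the architecture supplies only the cache maintenance itself, not the map_page/map_sg boilerplate.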
Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/dma-mapping.h | 30 --------- arch/powerpc/kernel/dma-iommu.c | 4 +- arch/powerpc/kernel/dma.c | 87 +++----------------------- arch/powerpc/mm/dma-noncoherent.c | 29 ++++++--- arch/powerpc/platforms/Kconfig.cputype | 2 + 5 files changed, 32 insertions(+), 120 deletions(-) diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index cdf70aaeafeb..4de9d4ee23c1 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -25,36 +25,6 @@ extern void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, extern void __dma_nommu_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs); -int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction direction, - unsigned long attrs); -dma_addr_t dma_nommu_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs); - -#ifdef CONFIG_NOT_COHERENT_CACHE -/* - * DMA-consistent mapping functions for PowerPCs that don't support - * cache snooping. These allocate/free a region of uncached mapped - * memory space for use with DMA devices. Alternatively, you could - * allocate the space "normally" and use the cache management functions - * to ensure it is consistent. - */ -struct device; -extern void __dma_sync(void *vaddr, size_t size, int direction); -extern void __dma_sync_page(struct page *page, unsigned long offset, - size_t size, int direction); -extern unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr); - -#else /* ! CONFIG_NOT_COHERENT_CACHE */ -/* - * Cache coherent cores. - */ - -#define __dma_sync(addr, size, rw) ((void)0) -#define __dma_sync_page(pg, off, sz, rw) ((void)0) - -#endif /* ! CONFIG_NOT_COHERENT_CACHE */ static inline unsigned long device_to_mask(struct device *dev) { diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index c75ba4e3a50c..09231ef06d01 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -68,7 +68,7 @@ static dma_addr_t dma_iommu_map_page(struct device *dev, struct page *page, unsigned long attrs) { if (dma_iommu_map_bypass(dev, attrs)) - return dma_nommu_map_page(dev, page, offset, size, direction, + return dma_direct_map_page(dev, page, offset, size, direction, attrs); return iommu_map_page(dev, get_iommu_table_base(dev), page, offset, size, device_to_mask(dev), direction, attrs); @@ -90,7 +90,7 @@ static int dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, unsigned long attrs) { if (dma_iommu_map_bypass(dev, attrs)) - return dma_nommu_map_sg(dev, sglist, nelems, direction, attrs); + return dma_direct_map_sg(dev, sglist, nelems, direction, attrs); return ppc_iommu_map_sg(dev, get_iommu_table_base(dev), sglist, nelems, device_to_mask(dev), direction, attrs); } diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index f983f8d435a6..b9f7283e7224 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -27,77 +27,6 @@ * default the offset is PCI_DRAM_OFFSET. 
*/ -int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction direction, - unsigned long attrs) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) { - sg->dma_address = phys_to_dma(dev, sg_phys(sg)); - sg->dma_length = sg->length; - - if (attrs & DMA_ATTR_SKIP_CPU_SYNC) - continue; - - __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); - } - - return nents; -} - -static void dma_nommu_unmap_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction direction, - unsigned long attrs) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) - __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); -} - -dma_addr_t dma_nommu_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - __dma_sync_page(page, offset, size, dir); - - return phys_to_dma(dev, page_to_phys(page)) + offset; -} - -static inline void dma_nommu_unmap_page(struct device *dev, - dma_addr_t dma_address, - size_t size, - enum dma_data_direction direction, - unsigned long attrs) -{ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - __dma_sync(bus_to_virt(dma_address), size, direction); -} - -#ifdef CONFIG_NOT_COHERENT_CACHE -static inline void dma_nommu_sync_sg(struct device *dev, - struct scatterlist *sgl, int nents, - enum dma_data_direction direction) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) - __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); -} - -static inline void dma_nommu_sync_single(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - __dma_sync(bus_to_virt(dma_handle), size, direction); -} -#endif - const struct dma_map_ops dma_nommu_ops = { #ifdef CONFIG_NOT_COHERENT_CACHE .alloc = __dma_nommu_alloc_coherent, @@ -106,17 +35,17 @@ const struct dma_map_ops dma_nommu_ops = { .alloc = dma_direct_alloc, .free = dma_direct_free, #endif - .map_sg = dma_nommu_map_sg, - .unmap_sg = dma_nommu_unmap_sg, + .map_sg = dma_direct_map_sg, .dma_supported = dma_direct_supported, - .map_page = dma_nommu_map_page, - .unmap_page = dma_nommu_unmap_page, + .map_page = dma_direct_map_page, .get_required_mask = dma_direct_get_required_mask, #ifdef CONFIG_NOT_COHERENT_CACHE - .sync_single_for_cpu = dma_nommu_sync_single, - .sync_single_for_device = dma_nommu_sync_single, - .sync_sg_for_cpu = dma_nommu_sync_sg, - .sync_sg_for_device = dma_nommu_sync_sg, + .unmap_sg = dma_direct_unmap_sg, + .unmap_page = dma_direct_unmap_page, + .sync_single_for_cpu = dma_direct_sync_single_for_cpu, + .sync_single_for_device = dma_direct_sync_single_for_device, + .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu, + .sync_sg_for_device = dma_direct_sync_sg_for_device, #endif }; EXPORT_SYMBOL(dma_nommu_ops); diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index ee95da19c82d..c3d15d718a58 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -314,7 +314,7 @@ void __dma_nommu_free_coherent(struct device *dev, size_t size, void *vaddr, /* * make an area consistent. 
*/ -void __dma_sync(void *vaddr, size_t size, int direction) +static void __dma_sync(void *vaddr, size_t size, int direction) { unsigned long start = (unsigned long)vaddr; unsigned long end = start + size; @@ -340,7 +340,6 @@ void __dma_sync(void *vaddr, size_t size, int direction) break; } } -EXPORT_SYMBOL(__dma_sync); #ifdef CONFIG_HIGHMEM /* @@ -387,21 +386,33 @@ static inline void __dma_sync_page_highmem(struct page *page, * __dma_sync_page makes memory consistent. identical to __dma_sync, but * takes a struct page instead of a virtual address */ -void __dma_sync_page(struct page *page, unsigned long offset, - size_t size, int direction) +static void __dma_sync_page(phys_addr_t paddr, size_t size, int dir) { + struct page *page = pfn_to_page(paddr >> PAGE_SHIFT); + unsigned offset = paddr & ~PAGE_MASK; + #ifdef CONFIG_HIGHMEM - __dma_sync_page_highmem(page, offset, size, direction); + __dma_sync_page_highmem(page, offset, size, dir); #else unsigned long start = (unsigned long)page_address(page) + offset; - __dma_sync((void *)start, size, direction); + __dma_sync((void *)start, size, dir); #endif } -EXPORT_SYMBOL(__dma_sync_page); + +void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, + size_t size, enum dma_data_direction dir) +{ + __dma_sync_page(paddr, size, dir); +} + +void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, + size_t size, enum dma_data_direction dir) +{ + __dma_sync_page(paddr, size, dir); +} /* - * Return the PFN for a given cpu virtual address returned by - * __dma_nommu_alloc_coherent. + * Return the PFN for a given cpu virtual address returned by arch_dma_alloc. */ long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr, dma_addr_t dma_addr) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 48cd5aa90ad2..47db4934c1cf 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -403,6 +403,8 @@ config NOT_COHERENT_CACHE depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \ GAMECUBE_COMMON || AMIGAONE select ARCH_HAS_DMA_COHERENT_TO_PFN + select ARCH_HAS_SYNC_DMA_FOR_DEVICE + select ARCH_HAS_SYNC_DMA_FOR_CPU default n if PPC_47x default y From 68005b67d15a1ee5b5ddff965175728e65fa73e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:30 +0100 Subject: [PATCH 078/200] powerpc/dma: use the generic direct mapping bypass Now that we've switched all the powerpc nommu and swiotlb methods to use the generic dma_direct_* calls we can remove these ops vectors entirely and rely on the common direct mapping bypass that avoids indirect function calls. This also allows us to remove a whole lot of boilerplate code related to setting up these operations.
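The bypass is the NULL-ops convention in the common code; in outline (simplified):

	const struct dma_map_ops *ops = get_dma_ops(dev);

	if (dma_is_direct(ops))		/* ops == NULL means direct mapping */
		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
	else
		addr = ops->map_page(dev, page, offset, size, dir, attrs);

Because the direct branch is a plain function call, every set_dma_ops(dev, &dma_nommu_ops) assignment and bus notifier deleted below only restated the default, hence the large removal diff.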
Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/dma-mapping.h | 9 --- arch/powerpc/include/asm/swiotlb.h | 3 - arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/dma-swiotlb.c | 58 ----------------- arch/powerpc/kernel/dma.c | 62 ------------------- arch/powerpc/kernel/pci-common.c | 2 +- arch/powerpc/kernel/setup-common.c | 1 - arch/powerpc/mm/dma-noncoherent.c | 6 +- arch/powerpc/platforms/44x/warp.c | 2 +- arch/powerpc/platforms/85xx/corenet_generic.c | 4 -- arch/powerpc/platforms/85xx/ge_imp3a.c | 2 - arch/powerpc/platforms/85xx/mpc8536_ds.c | 2 - arch/powerpc/platforms/85xx/mpc85xx_ds.c | 4 -- arch/powerpc/platforms/85xx/mpc85xx_mds.c | 4 -- arch/powerpc/platforms/85xx/p1010rdb.c | 1 - arch/powerpc/platforms/85xx/p1022_ds.c | 2 - arch/powerpc/platforms/85xx/p1022_rdk.c | 2 - arch/powerpc/platforms/86xx/mpc86xx_hpcn.c | 1 - arch/powerpc/platforms/cell/iommu.c | 3 - arch/powerpc/platforms/pasemi/iommu.c | 2 +- arch/powerpc/platforms/pasemi/setup.c | 51 --------------- arch/powerpc/platforms/powernv/npu-dma.c | 2 +- arch/powerpc/platforms/pseries/vio.c | 7 +++ arch/powerpc/sysdev/fsl_pci.c | 5 +- drivers/misc/cxl/vphb.c | 1 - 25 files changed, 16 insertions(+), 222 deletions(-) delete mode 100644 arch/powerpc/kernel/dma.c diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 4de9d4ee23c1..93e57e28be28 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -18,14 +18,6 @@ #include #include -/* Some dma direct funcs must be visible for use in other dma_ops */ -extern void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs); -extern void __dma_nommu_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs); - static inline unsigned long device_to_mask(struct device *dev) { if (dev->dma_mask && *dev->dma_mask) @@ -40,7 +32,6 @@ static inline unsigned long device_to_mask(struct device *dev) #ifdef CONFIG_PPC64 extern const struct dma_map_ops dma_iommu_ops; #endif -extern const struct dma_map_ops dma_nommu_ops; static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h index 26a0f12b835b..b7d082c0ec25 100644 --- a/arch/powerpc/include/asm/swiotlb.h +++ b/arch/powerpc/include/asm/swiotlb.h @@ -13,10 +13,7 @@ #include -extern const struct dma_map_ops powerpc_swiotlb_dma_ops; - extern unsigned int ppc_swiotlb_enable; -int __init swiotlb_setup_bus_notifier(void); #ifdef CONFIG_SWIOTLB void swiotlb_detect_4g(void); diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 9bb12cd642ef..8809e287b80d 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -36,7 +36,7 @@ obj-y := cputable.o ptrace.o syscalls.o \ process.o systbl.o idle.o \ signal.o sysfs.o cacheinfo.o time.o \ prom.o traps.o setup-common.o \ - udbg.o misc.o io.o dma.o misc_$(BITS).o \ + udbg.o misc.o io.o misc_$(BITS).o \ of_platform.o prom_parse.o obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index 3a15a7d945e9..132d61c91629 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -10,70 +10,12 @@ * option) any later version. 
* */ - -#include #include -#include -#include -#include -#include - #include #include -#include unsigned int ppc_swiotlb_enable; -/* - * At the moment, all platforms that use this code only require - * swiotlb to be used if we're operating on HIGHMEM. Since - * we don't ever call anything other than map_sg, unmap_sg, - * map_page, and unmap_page on highmem, use normal dma_ops - * for everything else. - */ -const struct dma_map_ops powerpc_swiotlb_dma_ops = { - .alloc = dma_direct_alloc, - .free = dma_direct_free, - .map_sg = dma_direct_map_sg, - .unmap_sg = dma_direct_unmap_sg, - .dma_supported = dma_direct_supported, - .map_page = dma_direct_map_page, - .unmap_page = dma_direct_unmap_page, - .sync_single_for_cpu = dma_direct_sync_single_for_cpu, - .sync_single_for_device = dma_direct_sync_single_for_device, - .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu, - .sync_sg_for_device = dma_direct_sync_sg_for_device, - .get_required_mask = dma_direct_get_required_mask, -}; - -static int ppc_swiotlb_bus_notify(struct notifier_block *nb, - unsigned long action, void *data) -{ - struct device *dev = data; - - /* We are only intereted in device addition */ - if (action != BUS_NOTIFY_ADD_DEVICE) - return 0; - - /* May need to bounce if the device can't address all of DRAM */ - if ((dma_get_mask(dev) + 1) < memblock_end_of_DRAM()) - set_dma_ops(dev, &powerpc_swiotlb_dma_ops); - - return NOTIFY_DONE; -} - -static struct notifier_block ppc_swiotlb_plat_bus_notifier = { - .notifier_call = ppc_swiotlb_bus_notify, - .priority = 0, -}; - -int __init swiotlb_setup_bus_notifier(void) -{ - bus_register_notifier(&platform_bus_type, - &ppc_swiotlb_plat_bus_notifier); - return 0; -} - void __init swiotlb_detect_4g(void) { if ((memblock_end_of_DRAM() - 1) > 0xffffffff) diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c deleted file mode 100644 index b9f7283e7224..000000000000 --- a/arch/powerpc/kernel/dma.c +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corporation - * - * Provide default implementations of the DMA mapping callbacks for - * directly mapped busses. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Generic direct DMA implementation - * - * This implementation supports a per-device offset that can be applied if - * the address at which memory is visible to devices is not 0. Platform code - * can set archdata.dma_data to an unsigned long holding the offset. By - * default the offset is PCI_DRAM_OFFSET. 
- */ - -const struct dma_map_ops dma_nommu_ops = { -#ifdef CONFIG_NOT_COHERENT_CACHE - .alloc = __dma_nommu_alloc_coherent, - .free = __dma_nommu_free_coherent, -#else - .alloc = dma_direct_alloc, - .free = dma_direct_free, -#endif - .map_sg = dma_direct_map_sg, - .dma_supported = dma_direct_supported, - .map_page = dma_direct_map_page, - .get_required_mask = dma_direct_get_required_mask, -#ifdef CONFIG_NOT_COHERENT_CACHE - .unmap_sg = dma_direct_unmap_sg, - .unmap_page = dma_direct_unmap_page, - .sync_single_for_cpu = dma_direct_sync_single_for_cpu, - .sync_single_for_device = dma_direct_sync_single_for_device, - .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu, - .sync_sg_for_device = dma_direct_sync_sg_for_device, -#endif -}; -EXPORT_SYMBOL(dma_nommu_ops); - -static int __init dma_init(void) -{ -#ifdef CONFIG_IBMVIO - dma_debug_add_bus(&vio_bus_type); -#endif - - return 0; -} -fs_initcall(dma_init); - diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index a84707680525..23989175349c 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -62,7 +62,7 @@ resource_size_t isa_mem_base; EXPORT_SYMBOL(isa_mem_base); -static const struct dma_map_ops *pci_dma_ops = &dma_nommu_ops; +static const struct dma_map_ops *pci_dma_ops; void set_pci_dma_ops(const struct dma_map_ops *dma_ops) { diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index ca00fbb97cf8..fa606aa98f6d 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -791,7 +791,6 @@ void arch_setup_pdev_archdata(struct platform_device *pdev) { pdev->archdata.dma_mask = DMA_BIT_MASK(32); pdev->dev.dma_mask = &pdev->archdata.dma_mask; - set_dma_ops(&pdev->dev, &dma_nommu_ops); } static __init void print_system_info(void) diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index c3d15d718a58..b5d2658c26af 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -152,8 +152,8 @@ static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsi * Allocate DMA-coherent memory space and return both the kernel remapped * virtual and bus address for that space. */ -void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) +void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, unsigned long attrs) { struct page *page; struct ppc_vm_region *c; @@ -254,7 +254,7 @@ void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, /* * free a page as defined by the above mapping. 
*/ -void __dma_nommu_free_coherent(struct device *dev, size_t size, void *vaddr, +void arch_dma_free(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { struct ppc_vm_region *c; diff --git a/arch/powerpc/platforms/44x/warp.c b/arch/powerpc/platforms/44x/warp.c index f467247fd1c4..18422dbd061a 100644 --- a/arch/powerpc/platforms/44x/warp.c +++ b/arch/powerpc/platforms/44x/warp.c @@ -47,7 +47,7 @@ static int __init warp_probe(void) if (!of_machine_is_compatible("pika,warp")) return 0; - /* For __dma_nommu_alloc_coherent */ + /* For arch_dma_alloc */ ISA_DMA_THRESHOLD = ~0L; return 1; diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c index b0dac307bebf..808da1e9c0a7 100644 --- a/arch/powerpc/platforms/85xx/corenet_generic.c +++ b/arch/powerpc/platforms/85xx/corenet_generic.c @@ -223,7 +223,3 @@ define_machine(corenet_generic) { }; machine_arch_initcall(corenet_generic, corenet_gen_publish_devices); - -#ifdef CONFIG_SWIOTLB -machine_arch_initcall(corenet_generic, swiotlb_setup_bus_notifier); -#endif diff --git a/arch/powerpc/platforms/85xx/ge_imp3a.c b/arch/powerpc/platforms/85xx/ge_imp3a.c index f29c6f0909f3..c64fa2483ea9 100644 --- a/arch/powerpc/platforms/85xx/ge_imp3a.c +++ b/arch/powerpc/platforms/85xx/ge_imp3a.c @@ -202,8 +202,6 @@ static int __init ge_imp3a_probe(void) machine_arch_initcall(ge_imp3a, mpc85xx_common_publish_devices); -machine_arch_initcall(ge_imp3a, swiotlb_setup_bus_notifier); - define_machine(ge_imp3a) { .name = "GE_IMP3A", .probe = ge_imp3a_probe, diff --git a/arch/powerpc/platforms/85xx/mpc8536_ds.c b/arch/powerpc/platforms/85xx/mpc8536_ds.c index 94a7f92c858f..94194bad4954 100644 --- a/arch/powerpc/platforms/85xx/mpc8536_ds.c +++ b/arch/powerpc/platforms/85xx/mpc8536_ds.c @@ -57,8 +57,6 @@ static void __init mpc8536_ds_setup_arch(void) machine_arch_initcall(mpc8536_ds, mpc85xx_common_publish_devices); -machine_arch_initcall(mpc8536_ds, swiotlb_setup_bus_notifier); - /* * Called very early, device-tree isn't unflattened */ diff --git a/arch/powerpc/platforms/85xx/mpc85xx_ds.c b/arch/powerpc/platforms/85xx/mpc85xx_ds.c index dc9e035cc637..b7e29ce1f266 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_ds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_ds.c @@ -174,10 +174,6 @@ machine_arch_initcall(mpc8544_ds, mpc85xx_common_publish_devices); machine_arch_initcall(mpc8572_ds, mpc85xx_common_publish_devices); machine_arch_initcall(p2020_ds, mpc85xx_common_publish_devices); -machine_arch_initcall(mpc8544_ds, swiotlb_setup_bus_notifier); -machine_arch_initcall(mpc8572_ds, swiotlb_setup_bus_notifier); -machine_arch_initcall(p2020_ds, swiotlb_setup_bus_notifier); - /* * Called very early, device-tree isn't unflattened */ diff --git a/arch/powerpc/platforms/85xx/mpc85xx_mds.c b/arch/powerpc/platforms/85xx/mpc85xx_mds.c index d7e440e6dba3..80939a425de5 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_mds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_mds.c @@ -367,10 +367,6 @@ machine_arch_initcall(mpc8568_mds, mpc85xx_publish_devices); machine_arch_initcall(mpc8569_mds, mpc85xx_publish_devices); machine_arch_initcall(p1021_mds, mpc85xx_common_publish_devices); -machine_arch_initcall(mpc8568_mds, swiotlb_setup_bus_notifier); -machine_arch_initcall(mpc8569_mds, swiotlb_setup_bus_notifier); -machine_arch_initcall(p1021_mds, swiotlb_setup_bus_notifier); - static void __init mpc85xx_mds_pic_init(void) { struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN | diff --git 
a/arch/powerpc/platforms/85xx/p1010rdb.c b/arch/powerpc/platforms/85xx/p1010rdb.c index 78d13b364cd6..33ca373322e1 100644 --- a/arch/powerpc/platforms/85xx/p1010rdb.c +++ b/arch/powerpc/platforms/85xx/p1010rdb.c @@ -55,7 +55,6 @@ static void __init p1010_rdb_setup_arch(void) } machine_arch_initcall(p1010_rdb, mpc85xx_common_publish_devices); -machine_arch_initcall(p1010_rdb, swiotlb_setup_bus_notifier); /* * Called very early, device-tree isn't unflattened diff --git a/arch/powerpc/platforms/85xx/p1022_ds.c b/arch/powerpc/platforms/85xx/p1022_ds.c index 9fb57f78cdbe..1f1af0557470 100644 --- a/arch/powerpc/platforms/85xx/p1022_ds.c +++ b/arch/powerpc/platforms/85xx/p1022_ds.c @@ -548,8 +548,6 @@ static void __init p1022_ds_setup_arch(void) machine_arch_initcall(p1022_ds, mpc85xx_common_publish_devices); -machine_arch_initcall(p1022_ds, swiotlb_setup_bus_notifier); - /* * Called very early, device-tree isn't unflattened */ diff --git a/arch/powerpc/platforms/85xx/p1022_rdk.c b/arch/powerpc/platforms/85xx/p1022_rdk.c index 276e00ab3dde..fd9e3e7ef234 100644 --- a/arch/powerpc/platforms/85xx/p1022_rdk.c +++ b/arch/powerpc/platforms/85xx/p1022_rdk.c @@ -128,8 +128,6 @@ static void __init p1022_rdk_setup_arch(void) machine_arch_initcall(p1022_rdk, mpc85xx_common_publish_devices); -machine_arch_initcall(p1022_rdk, swiotlb_setup_bus_notifier); - /* * Called very early, device-tree isn't unflattened */ diff --git a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c index 17c6cd3d02e6..775a92353c83 100644 --- a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c +++ b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c @@ -121,7 +121,6 @@ static int __init declare_of_platform_devices(void) return 0; } machine_arch_initcall(mpc86xx_hpcn, declare_of_platform_devices); -machine_arch_initcall(mpc86xx_hpcn, swiotlb_setup_bus_notifier); define_machine(mpc86xx_hpcn) { .name = "MPC86xx HPCN", diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index a3c4057a8f65..06abd432b830 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -600,8 +600,6 @@ static int cell_of_bus_notify(struct notifier_block *nb, unsigned long action, if (cell_iommu_enabled) dev->dma_ops = &dma_iommu_ops; - else - dev->dma_ops = &dma_nommu_ops; cell_dma_dev_setup(dev); return 0; } @@ -727,7 +725,6 @@ static int __init cell_iommu_init_disabled(void) unsigned long base = 0, size; /* When no iommu is present, we use direct DMA ops */ - set_pci_dma_ops(&dma_nommu_ops); /* First make sure all IOC translation is turned off */ cell_disable_iommus(); diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c index f2971522fb4a..bbeb6a1b0393 100644 --- a/arch/powerpc/platforms/pasemi/iommu.c +++ b/arch/powerpc/platforms/pasemi/iommu.c @@ -186,7 +186,7 @@ static void pci_dma_dev_setup_pasemi(struct pci_dev *dev) */ if (dev->vendor == 0x1959 && dev->device == 0xa007 && !firmware_has_feature(FW_FEATURE_LPAR)) { - dev->dev.dma_ops = &dma_nommu_ops; + dev->dev.dma_ops = NULL; /* * Set the coherent DMA mask to prevent the iommu * being used unnecessarily diff --git a/arch/powerpc/platforms/pasemi/setup.c b/arch/powerpc/platforms/pasemi/setup.c index c0532999f854..46dd463faaa7 100644 --- a/arch/powerpc/platforms/pasemi/setup.c +++ b/arch/powerpc/platforms/pasemi/setup.c @@ -411,55 +411,6 @@ out: return !!(srr1 & 0x2); } -#ifdef CONFIG_PCMCIA -static int pcmcia_notify(struct notifier_block *nb, unsigned long action, - void *data) 
-{ - struct device *dev = data; - struct device *parent; - struct pcmcia_device *pdev = to_pcmcia_dev(dev); - - /* We are only intereted in device addition */ - if (action != BUS_NOTIFY_ADD_DEVICE) - return 0; - - parent = pdev->socket->dev.parent; - - /* We know electra_cf devices will always have of_node set, since - * electra_cf is an of_platform driver. - */ - if (!parent->of_node) - return 0; - - if (!of_device_is_compatible(parent->of_node, "electra-cf")) - return 0; - - /* We use the direct ops for localbus */ - dev->dma_ops = &dma_nommu_ops; - - return 0; -} - -static struct notifier_block pcmcia_notifier = { - .notifier_call = pcmcia_notify, -}; - -static inline void pasemi_pcmcia_init(void) -{ - extern struct bus_type pcmcia_bus_type; - - bus_register_notifier(&pcmcia_bus_type, &pcmcia_notifier); -} - -#else - -static inline void pasemi_pcmcia_init(void) -{ -} - -#endif - - static const struct of_device_id pasemi_bus_ids[] = { /* Unfortunately needed for legacy firmwares */ { .type = "localbus", }, @@ -472,8 +423,6 @@ static const struct of_device_id pasemi_bus_ids[] = { static int __init pasemi_publish_devices(void) { - pasemi_pcmcia_init(); - /* Publish OF platform devices for SDC and other non-PCI devices */ of_platform_bus_probe(NULL, pasemi_bus_ids, NULL); diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index d7f742ed48ba..4e87e13fa0fc 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -220,7 +220,7 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe) * their parent device so drivers shouldn't be doing DMA * operations directly on these devices. */ - set_dma_ops(&npe->pdev->dev, NULL); + set_dma_ops(&npe->pdev->dev, &dma_dummy_ops); } /* diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index b7dc8bd41fd0..141795275ccb 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1699,3 +1699,10 @@ int vio_disable_interrupts(struct vio_dev *dev) } EXPORT_SYMBOL(vio_disable_interrupts); #endif /* CONFIG_PPC_PSERIES */ + +static int __init vio_init(void) +{ + dma_debug_add_bus(&vio_bus_type); + return 0; +} +fs_initcall(vio_init); diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 0c6510f340cb..23000ca7f688 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -124,10 +124,8 @@ static void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) static void setup_swiotlb_ops(struct pci_controller *hose) { - if (ppc_swiotlb_enable) { + if (ppc_swiotlb_enable) hose->controller_ops.dma_dev_setup = pci_dma_dev_setup_swiotlb; - set_pci_dma_ops(&powerpc_swiotlb_dma_ops); - } } #else static inline void setup_swiotlb_ops(struct pci_controller *hose) {} @@ -141,7 +139,6 @@ static void fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask) */ if (dev_is_pci(dev) && dma_mask >= pci64_dma_offset * 2 - 1) { dev->bus_dma_mask = 0; - set_dma_ops(dev, &dma_nommu_ops); set_dma_offset(dev, pci64_dma_offset); } } diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c index 49da2f744bbf..b64adc0f0865 100644 --- a/drivers/misc/cxl/vphb.c +++ b/drivers/misc/cxl/vphb.c @@ -43,7 +43,6 @@ static bool cxl_pci_enable_device_hook(struct pci_dev *dev) return false; } - set_dma_ops(&dev->dev, &dma_nommu_ops); set_dma_offset(&dev->dev, PAGE_OFFSET); /* From 7610fdf5e056ad5764d19f39db49b11608334610 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 
08:01:31 +0100 Subject: [PATCH 079/200] powerpc/dma: remove get_dma_offset Just fold the calculation into __phys_to_dma/__dma_to_phys as those are the only places that should know about it. Signed-off-by: Christoph Hellwig Acked-by: Benjamin Herrenschmidt Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/dma-direct.h | 8 ++++++-- arch/powerpc/include/asm/dma-mapping.h | 16 ---------------- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/dma-direct.h b/arch/powerpc/include/asm/dma-direct.h index 92d8aed86422..a2912b47102c 100644 --- a/arch/powerpc/include/asm/dma-direct.h +++ b/arch/powerpc/include/asm/dma-direct.h @@ -13,11 +13,15 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) { - return paddr + get_dma_offset(dev); + if (!dev) + return paddr + PCI_DRAM_OFFSET; + return paddr + dev->archdata.dma_offset; } static inline phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr) { - return daddr - get_dma_offset(dev); + if (!dev) + return daddr - PCI_DRAM_OFFSET; + return daddr - dev->archdata.dma_offset; } #endif /* ASM_POWERPC_DMA_DIRECT_H */ diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 93e57e28be28..c70f55d2f5e0 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -43,22 +43,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return NULL; } -/* - * get_dma_offset() - * - * Get the dma offset on configurations where the dma address can be determined - * from the physical address by looking at a simple offset. Direct dma and - * swiotlb use this function, but it is typically not used by implementations - * with an iommu. - */ -static inline dma_addr_t get_dma_offset(struct device *dev) -{ - if (dev) - return dev->archdata.dma_offset; - - return PCI_DRAM_OFFSET; -} - static inline void set_dma_offset(struct device *dev, dma_addr_t off) { if (dev) From 0617fc0ca412b535c0ab0e5e7b03180067f0f7fd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:32 +0100 Subject: [PATCH 080/200] powerpc/dma: remove set_dma_offset There is no good reason for this helper, just opencode it. 
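As a sketch of what the opencoding looks like at a typical call site (illustrative only; the real call sites are in the hunks that follow):

    /* before: the helper hides a simple field assignment */
    set_dma_offset(&pdev->dev, pe->tce_bypass_base);

    /* after: the per-device offset is assigned directly */
    pdev->dev.archdata.dma_offset = pe->tce_bypass_base;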
Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/dma-mapping.h | 6 ------ arch/powerpc/kernel/pci-common.c | 2 +- arch/powerpc/platforms/cell/iommu.c | 4 ++-- arch/powerpc/platforms/powernv/pci-ioda.c | 6 +++--- arch/powerpc/platforms/pseries/iommu.c | 7 ++----- arch/powerpc/sysdev/dart_iommu.c | 2 +- arch/powerpc/sysdev/fsl_pci.c | 2 +- drivers/misc/cxl/vphb.c | 2 +- 8 files changed, 11 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index c70f55d2f5e0..a59c42879194 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -43,11 +43,5 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return NULL; } -static inline void set_dma_offset(struct device *dev, dma_addr_t off) -{ - if (dev) - dev->archdata.dma_offset = off; -} - #endif /* __KERNEL__ */ #endif /* _ASM_DMA_MAPPING_H */ diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 23989175349c..cbdf13d86227 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -966,7 +966,7 @@ static void pcibios_setup_device(struct pci_dev *dev) /* Hook up default DMA ops */ set_dma_ops(&dev->dev, pci_dma_ops); - set_dma_offset(&dev->dev, PCI_DRAM_OFFSET); + dev->dev.archdata.dma_offset = PCI_DRAM_OFFSET; /* Additional platform DMA/iommu setup */ phb = pci_bus_to_host(dev->bus); diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 06abd432b830..54e012e1f720 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -577,10 +577,10 @@ static void cell_dma_dev_setup(struct device *dev) u64 addr = cell_iommu_get_fixed_address(dev); if (addr != OF_BAD_ADDR) - set_dma_offset(dev, addr + dma_iommu_fixed_base); + dev->archdata.dma_offset = addr + dma_iommu_fixed_base; set_iommu_table_base(dev, cell_get_iommu_table(dev)); } else { - set_dma_offset(dev, cell_dma_nommu_offset); + dev->archdata.dma_offset = cell_dma_nommu_offset; } } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 2de7fcf54c40..6438f38235e8 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1746,7 +1746,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev pe = &phb->ioda.pe_array[pdn->pe_number]; WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); - set_dma_offset(&pdev->dev, pe->tce_bypass_base); + pdev->dev.archdata.dma_offset = pe->tce_bypass_base; set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); /* * Note: iommu_add_device() will fail here as @@ -1859,7 +1859,7 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev, if (rc) return rc; /* 4GB offset bypasses 32-bit space */ - set_dma_offset(&pdev->dev, (1ULL << 32)); + pdev->dev.archdata.dma_offset = (1ULL << 32); return true; } @@ -1872,7 +1872,7 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) list_for_each_entry(dev, &bus->devices, bus_list) { set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); - set_dma_offset(&dev->dev, pe->tce_bypass_base); + dev->dev.archdata.dma_offset = pe->tce_bypass_base; if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) pnv_ioda_setup_bus_dma(pe, dev->subordinate); diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c index 37d2ce3f55a3..36eb1ddbac69 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -1202,7 +1202,6 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) { struct device_node *dn = pci_device_to_OF_node(pdev), *pdn; const __be32 *dma_window = NULL; - u64 dma_offset; /* only attempt to use a new window if 64-bit DMA is requested */ if (dma_mask < DMA_BIT_MASK(64)) @@ -1224,11 +1223,9 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) } if (pdn && PCI_DN(pdn)) { - dma_offset = enable_ddw(pdev, pdn); - if (dma_offset != 0) { - set_dma_offset(&pdev->dev, dma_offset); + pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn); + if (pdev->dev.archdata.dma_offset) return true; - } } return false; diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index d42ba645d51d..809797dbe169 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -386,7 +386,7 @@ static bool dart_device_on_pcie(struct device *dev) static void pci_dma_dev_setup_dart(struct pci_dev *dev) { if (dart_is_u4 && dart_device_on_pcie(&dev->dev)) - set_dma_offset(&dev->dev, DART_U4_BYPASS_BASE); + dev->dev.archdata.dma_offset = DART_U4_BYPASS_BASE; set_iommu_table_base(&dev->dev, &iommu_table_dart); } diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 23000ca7f688..a04c6dde6ed0 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -139,7 +139,7 @@ static void fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask) */ if (dev_is_pci(dev) && dma_mask >= pci64_dma_offset * 2 - 1) { dev->bus_dma_mask = 0; - set_dma_offset(dev, pci64_dma_offset); + dev->archdata.dma_offset = pci64_dma_offset; } } diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c index b64adc0f0865..631c5df246d4 100644 --- a/drivers/misc/cxl/vphb.c +++ b/drivers/misc/cxl/vphb.c @@ -43,7 +43,7 @@ static bool cxl_pci_enable_device_hook(struct pci_dev *dev) return false; } - set_dma_offset(&dev->dev, PAGE_OFFSET); + dev->dev.archdata.dma_offset = PAGE_OFFSET; /* * Allocate a context to do cxl things too. If we eventually do real From 4a605e2d1a69f5aea06da10d81e22802a90812a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Feb 2019 08:01:33 +0100 Subject: [PATCH 081/200] powerpc/dma: trim the fat from <asm/dma-mapping.h> There is no need to provide anything but get_arch_dma_ops to <asm/dma-mapping.h>. Move the remaining declarations to <asm/iommu.h> and drop all the includes. Signed-off-by: Christoph Hellwig Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/dma-mapping.h | 29 ------------------- arch/powerpc/include/asm/iommu.h | 10 +++++++ arch/powerpc/platforms/44x/ppc476.c | 1 + arch/powerpc/platforms/85xx/corenet_generic.c | 1 + arch/powerpc/platforms/85xx/qemu_e500.c | 1 + arch/powerpc/sysdev/fsl_pci.c | 1 + 6 files changed, 14 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index a59c42879194..565d6f74b189 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -1,37 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2004 IBM - * - * Implements the generic device dma API for powerpc.
- * the pci and vio busses */ #ifndef _ASM_DMA_MAPPING_H #define _ASM_DMA_MAPPING_H -#ifdef __KERNEL__ - -#include -#include -/* need struct page definitions */ -#include -#include -#include -#include -#include - -static inline unsigned long device_to_mask(struct device *dev) -{ - if (dev->dma_mask && *dev->dma_mask) - return *dev->dma_mask; - /* Assume devices without mask can take 32 bit addresses */ - return 0xfffffffful; -} - -/* - * Available generic sets of operations - */ -#ifdef CONFIG_PPC64 -extern const struct dma_map_ops dma_iommu_ops; -#endif static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { @@ -43,5 +15,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return NULL; } -#endif /* __KERNEL__ */ #endif /* _ASM_DMA_MAPPING_H */ diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 6f00a892ebdf..0ac52392ed99 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -325,5 +325,15 @@ extern bool iommu_fixed_is_weak; #define iommu_fixed_is_weak false #endif +extern const struct dma_map_ops dma_iommu_ops; + +static inline unsigned long device_to_mask(struct device *dev) +{ + if (dev->dma_mask && *dev->dma_mask) + return *dev->dma_mask; + /* Assume devices without mask can take 32 bit addresses */ + return 0xfffffffful; +} + #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/platforms/44x/ppc476.c b/arch/powerpc/platforms/44x/ppc476.c index e55933f9cd55..a5e61e5c16e2 100644 --- a/arch/powerpc/platforms/44x/ppc476.c +++ b/arch/powerpc/platforms/44x/ppc476.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c index 808da1e9c0a7..785e9641220d 100644 --- a/arch/powerpc/platforms/85xx/corenet_generic.c +++ b/arch/powerpc/platforms/85xx/corenet_generic.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/platforms/85xx/qemu_e500.c b/arch/powerpc/platforms/85xx/qemu_e500.c index 27631c607f3d..c52c8f9e8385 100644 --- a/arch/powerpc/platforms/85xx/qemu_e500.c +++ b/arch/powerpc/platforms/85xx/qemu_e500.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include "smp.h" diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index a04c6dde6ed0..f49aec251a5a 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include From d976f6807ea613c54fcb74bd7ae68a43fdd62e1f Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 20 Feb 2019 19:55:00 +1100 Subject: [PATCH 082/200] KVM: PPC: Book3S HV: Context switch AMR on Power9 kvmhv_p9_guest_entry() implements a fast-path guest entry for Power9 when guest and host are both running with the Radix MMU. Currently in that path we don't save the host AMR (Authority Mask Register) value, and we always restore 0 on return to the host. That is OK at the moment because the AMR is not used for storage keys with the Radix MMU. However we plan to start using the AMR on Radix to prevent the kernel from reading/writing to userspace outside of copy_to/from_user(). In order to make that work we need to save/restore the AMR value. We only restore the value if it is different from the guest value, which is already in the register when we exit to the host. 
This should mean we rarely need to actually restore the value when running a modern Linux as a guest, because it will be using the same value as us. Signed-off-by: Michael Ellerman Tested-by: Russell Currey --- arch/powerpc/kvm/book3s_hv.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 5a066fc299e1..105a3f78a760 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3455,6 +3455,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long host_dscr = mfspr(SPRN_DSCR); unsigned long host_tidr = mfspr(SPRN_TIDR); unsigned long host_iamr = mfspr(SPRN_IAMR); + unsigned long host_amr = mfspr(SPRN_AMR); s64 dec; u64 tb; int trap, save_pmu; @@ -3571,13 +3572,15 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, mtspr(SPRN_PSPB, 0); mtspr(SPRN_WORT, 0); - mtspr(SPRN_AMR, 0); mtspr(SPRN_UAMOR, 0); mtspr(SPRN_DSCR, host_dscr); mtspr(SPRN_TIDR, host_tidr); mtspr(SPRN_IAMR, host_iamr); mtspr(SPRN_PSPB, 0); + if (host_amr != vcpu->arch.amr) + mtspr(SPRN_AMR, host_amr); + msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); store_fp_state(&vcpu->arch.fp); #ifdef CONFIG_ALTIVEC From 884dfb722db899e36d8c382783347aab57f96caa Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 21 Feb 2019 13:38:49 +1100 Subject: [PATCH 083/200] KVM: PPC: Book3S HV: Simplify machine check handling This makes the handling of machine check interrupts that occur inside a guest simpler and more robust, with less done in assembler code and in real mode. Now, when a machine check occurs inside a guest, we always get the machine check event struct and put a copy in the vcpu struct for the vcpu where the machine check occurred. We no longer call machine_check_queue_event() from kvmppc_realmode_mc_power7(), because on POWER8, when a vcpu is running on an offline secondary thread and we call machine_check_queue_event(), that calls irq_work_queue(), which doesn't work because the CPU is offline, but instead triggers the WARN_ON(lazy_irq_pending()) in pnv_smp_cpu_kill_self() (which fires again and again because nothing clears the condition). All that machine_check_queue_event() actually does is to cause the event to be printed to the console. For a machine check occurring in the guest, we now print the event in kvmppc_handle_exit_hv() instead. The assembly code at label machine_check_realmode now just calls C code and then continues exiting the guest. We no longer either synthesize a machine check for the guest in assembly code or return to the guest without a machine check. The code in kvmppc_handle_exit_hv() is extended to handle the case where the guest is not FWNMI-capable. In that case we now always synthesize a machine check interrupt for the guest. Previously, if the host thinks it has recovered the machine check fully, it would return to the guest without any notification that the machine check had occurred. If the machine check was caused by some action of the guest (such as creating duplicate SLB entries), it is much better to tell the guest that it has caused a problem. Therefore we now always generate a machine check interrupt for guests that are not FWNMI-capable. 
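In outline, the reworked exit path looks like this (a condensed sketch of the kvmppc_handle_exit_hv() hunk below, not a verbatim excerpt):

    case BOOK3S_INTERRUPT_MACHINE_CHECK:
        /* Print the MCE event to host console. */
        machine_check_print_event_info(&vcpu->arch.mce_evt, false);
        if (!vcpu->kvm->arch.fwnmi_enabled) {
            /* old QEMU: synthesize a machine check for the guest */
            ulong flags = vcpu->arch.shregs.msr & 0x083c0000;
            kvmppc_core_queue_machine_check(vcpu, flags);
            r = RESUME_GUEST;
        } else {
            /* FWNMI-capable guest: exit to userspace with KVM_EXIT_NMI */
            run->exit_reason = KVM_EXIT_NMI;
            r = RESUME_HOST;
        }
        break;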
Reviewed-by: Aravinda Prasad Reviewed-by: Mahesh Salgaonkar Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_ppc.h | 3 +- arch/powerpc/kvm/book3s.c | 7 +++ arch/powerpc/kvm/book3s_hv.c | 18 +++++++- arch/powerpc/kvm/book3s_hv_ras.c | 58 ++++++------------------- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 40 ++--------------- 5 files changed, 42 insertions(+), 84 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index eb0d79f0ca45..a6c8548ed9fa 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -141,6 +141,7 @@ extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu); extern int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu); extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu); +extern void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong flags); extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags); extern void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu); @@ -632,7 +633,7 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target, unsigned int yield_count); long kvmppc_h_random(struct kvm_vcpu *vcpu); void kvmhv_commence_exit(int trap); -long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu); +void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu); void kvmppc_subcore_enter_guest(void); void kvmppc_subcore_exit_guest(void); long kvmppc_realmode_hmi_handler(void); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index bd1a677dd9e4..9a7dadbe1f17 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -192,6 +192,13 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) } EXPORT_SYMBOL_GPL(kvmppc_book3s_queue_irqprio); +void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong flags) +{ + /* might as well deliver this straight away */ + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_MACHINE_CHECK, flags); +} +EXPORT_SYMBOL_GPL(kvmppc_core_queue_machine_check); + void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) { /* might as well deliver this straight away */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 105a3f78a760..53b202415395 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1215,6 +1215,22 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_MACHINE_CHECK: + /* Print the MCE event to host console. */ + machine_check_print_event_info(&vcpu->arch.mce_evt, false); + + /* + * If the guest can do FWNMI, exit to userspace so it can + * deliver a FWNMI to the guest. + * Otherwise we synthesize a machine check for the guest + * so that it knows that the machine check occurred. + */ + if (!vcpu->kvm->arch.fwnmi_enabled) { + ulong flags = vcpu->arch.shregs.msr & 0x083c0000; + kvmppc_core_queue_machine_check(vcpu, flags); + r = RESUME_GUEST; + break; + } + /* Exit to guest with KVM_EXIT_NMI as exit reason */ run->exit_reason = KVM_EXIT_NMI; run->hw.hardware_exit_reason = vcpu->arch.trap; @@ -1227,8 +1243,6 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV; r = RESUME_HOST; - /* Print the MCE event to host console. 
*/ - machine_check_print_event_info(&vcpu->arch.mce_evt, false); break; case BOOK3S_INTERRUPT_PROGRAM: { diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index 0787f12c1a1b..8c24c3bea0bf 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -66,10 +66,8 @@ static void reload_slb(struct kvm_vcpu *vcpu) /* * On POWER7, see if we can handle a machine check that occurred inside * the guest in real mode, without switching to the host partition. - * - * Returns: 0 => exit guest, 1 => deliver machine check to guest */ -static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) +static void kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) { unsigned long srr1 = vcpu->arch.shregs.msr; struct machine_check_event mce_evt; @@ -111,52 +109,24 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) } /* - * See if we have already handled the condition in the linux host. - * We assume that if the condition is recovered then linux host - * will have generated an error log event that we will pick - * up and log later. - * Don't release mce event now. We will queue up the event so that - * we can log the MCE event info on host console. + * Now get the event and stash it in the vcpu struct so it can + * be handled by the primary thread in virtual mode. We can't + * call machine_check_queue_event() here if we are running on + * an offline secondary thread. */ - if (!get_mce_event(&mce_evt, MCE_EVENT_DONTRELEASE)) - goto out; + if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) { + if (handled && mce_evt.version == MCE_V1) + mce_evt.disposition = MCE_DISPOSITION_RECOVERED; + } else { + memset(&mce_evt, 0, sizeof(mce_evt)); + } - if (mce_evt.version == MCE_V1 && - (mce_evt.severity == MCE_SEV_NO_ERROR || - mce_evt.disposition == MCE_DISPOSITION_RECOVERED)) - handled = 1; - -out: - /* - * For guest that supports FWNMI capability, hook the MCE event into - * vcpu structure. We are going to exit the guest with KVM_EXIT_NMI - * exit reason. On our way to exit we will pull this event from vcpu - * structure and print it from thread 0 of the core/subcore. - * - * For guest that does not support FWNMI capability (old QEMU): - * We are now going enter guest either through machine check - * interrupt (for unhandled errors) or will continue from - * current HSRR0 (for handled errors) in guest. Hence - * queue up the event so that we can log it from host console later. - */ - if (vcpu->kvm->arch.fwnmi_enabled) { - /* - * Hook up the mce event on to vcpu structure. - * First clear the old event. - */ - memset(&vcpu->arch.mce_evt, 0, sizeof(vcpu->arch.mce_evt)); - if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) { - vcpu->arch.mce_evt = mce_evt; - } - } else - machine_check_queue_event(); - - return handled; + vcpu->arch.mce_evt = mce_evt; } -long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) +void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) { - return kvmppc_realmode_mc_power7(vcpu); + kvmppc_realmode_mc_power7(vcpu); } /* Check if dynamic split is in force and return subcore size accordingly. 
*/ diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 9b8d50a7cbaf..f24f6a2f8eb5 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2826,49 +2826,15 @@ kvm_cede_exit: #endif /* CONFIG_KVM_XICS */ 3: b guest_exit_cont - /* Try to handle a machine check in real mode */ + /* Try to do machine check recovery in real mode */ machine_check_realmode: mr r3, r9 /* get vcpu pointer */ bl kvmppc_realmode_machine_check nop + /* all machine checks go to virtual mode for further handling */ ld r9, HSTATE_KVM_VCPU(r13) li r12, BOOK3S_INTERRUPT_MACHINE_CHECK - /* - * For the guest that is FWNMI capable, deliver all the MCE errors - * (handled/unhandled) by exiting the guest with KVM_EXIT_NMI exit - * reason. This new approach injects machine check errors in guest - * address space to guest with additional information in the form - * of RTAS event, thus enabling guest kernel to suitably handle - * such errors. - * - * For the guest that is not FWNMI capable (old QEMU) fallback - * to old behaviour for backward compatibility: - * Deliver unhandled/fatal (e.g. UE) MCE errors to guest either - * through machine check interrupt (set HSRR0 to 0x200). - * For handled errors (no-fatal), just go back to guest execution - * with current HSRR0. - * if we receive machine check with MSR(RI=0) then deliver it to - * guest as machine check causing guest to crash. - */ - ld r11, VCPU_MSR(r9) - rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */ - bne guest_exit_cont /* if so, exit to host */ - /* Check if guest is capable of handling NMI exit */ - ld r10, VCPU_KVM(r9) - lbz r10, KVM_FWNMI(r10) - cmpdi r10, 1 /* FWNMI capable? */ - beq guest_exit_cont /* if so, exit with KVM_EXIT_NMI. */ - - /* if not, fall through for backward compatibility. */ - andi. r10, r11, MSR_RI /* check for unrecoverable exception */ - beq 1f /* Deliver a machine check to guest */ - ld r10, VCPU_PC(r9) - cmpdi r3, 0 /* Did we handle MCE ? */ - bne 2f /* Continue guest execution. */ - /* If not, deliver a machine check. SRR0/1 are already set */ -1: li r10, BOOK3S_INTERRUPT_MACHINE_CHECK - bl kvmppc_msr_interrupt -2: b fast_interrupt_c_return + b guest_exit_cont /* * Call C code to handle a HMI in real mode. From c05772018491e5294f55d63b239ab0d532e96616 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 21 Feb 2019 13:40:20 +1100 Subject: [PATCH 084/200] powerpc/64s: Better printing of machine check info for guest MCEs This adds an "in_guest" parameter to machine_check_print_event_info() so that we can avoid trying to translate guest NIP values into symbolic form using the host kernel's symbol table. 
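The new parameter at the two kinds of call sites (a sketch; the full hunks follow):

    /* host MCE: the NIP may be resolved against the host symbol table */
    machine_check_print_event_info(&evt, user_mode(regs), false);

    /* guest MCE: print the raw guest NIP instead */
    machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);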
Reviewed-by: Aravinda Prasad Reviewed-by: Mahesh Salgaonkar Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mce.h | 2 +- arch/powerpc/kernel/mce.c | 8 +++++--- arch/powerpc/kvm/book3s_hv.c | 4 ++-- arch/powerpc/platforms/powernv/opal.c | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index a8b8903e1844..17996bc9382b 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -209,7 +209,7 @@ extern int get_mce_event(struct machine_check_event *mce, bool release); extern void release_mce_event(void); extern void machine_check_queue_event(void); extern void machine_check_print_event_info(struct machine_check_event *evt, - bool user_mode); + bool user_mode, bool in_guest); #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); #endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index bd933a75f0bc..d501b48f287e 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -301,13 +301,13 @@ static void machine_check_process_queued_event(struct irq_work *work) while (__this_cpu_read(mce_queue_count) > 0) { index = __this_cpu_read(mce_queue_count) - 1; evt = this_cpu_ptr(&mce_event_queue[index]); - machine_check_print_event_info(evt, false); + machine_check_print_event_info(evt, false, false); __this_cpu_dec(mce_queue_count); } } void machine_check_print_event_info(struct machine_check_event *evt, - bool user_mode) + bool user_mode, bool in_guest) { const char *level, *sevstr, *subtype; static const char *mc_ue_types[] = { @@ -387,7 +387,9 @@ void machine_check_print_event_info(struct machine_check_event *evt, evt->disposition == MCE_DISPOSITION_RECOVERED ? "Recovered" : "Not recovered"); - if (user_mode) { + if (in_guest) { + printk("%s Guest NIP: %016llx\n", level, evt->srr0); + } else if (user_mode) { printk("%s NIP: [%016llx] PID: %d Comm: %s\n", level, evt->srr0, current->pid, current->comm); } else { diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 53b202415395..a3d5318f5d1e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1216,7 +1216,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOK3S_INTERRUPT_MACHINE_CHECK: /* Print the MCE event to host console. */ - machine_check_print_event_info(&vcpu->arch.mce_evt, false); + machine_check_print_event_info(&vcpu->arch.mce_evt, false, true); /* * If the guest can do FWNMI, exit to userspace so it can @@ -1406,7 +1406,7 @@ static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) /* Pass the machine check to the L1 guest */ r = RESUME_HOST; /* Print the MCE event to host console. 
- machine_check_print_event_info(&vcpu->arch.mce_evt, false); + machine_check_print_event_info(&vcpu->arch.mce_evt, false, true); break; /* * We get these next two if the guest accesses a page which it thinks diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 79586f127521..05c85be0370f 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -587,7 +587,7 @@ int opal_machine_check(struct pt_regs *regs) evt.version); return 0; } - machine_check_print_event_info(&evt, user_mode(regs)); + machine_check_print_event_info(&evt, user_mode(regs), false); if (opal_recover_mce(regs, &evt)) return 1; From 1b58a975be36994a572ae3b3fb3e023272bc299f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 31 Jan 2019 18:30:22 +0800 Subject: [PATCH 085/200] powerpc/powernv/npu: Remove redundant change_pte() hook The change_pte() notifier was designed to be used as a quick path to update secondary MMU PTEs on write permission changes or PFN changes. For KVM, it can reduce vm-exits when a vcpu faults on pages that were touched up by KSM. It is not used to do cache invalidations; note that the notifier is called before the real PTE update (see set_pte_at_notify(), where set_pte_at() is called afterwards). All the necessary cache invalidation is already done in invalidate_range(). Signed-off-by: Peter Xu Reviewed-by: Andrea Arcangeli Reviewed-by: Alistair Popple Reviewed-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/npu-dma.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index ccf186bb60ec..dc23d9d2a7d9 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -913,15 +913,6 @@ static void pnv_npu2_mn_release(struct mmu_notifier *mn, mmio_invalidate(npu_context, 0, ~0UL); } -static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address, - pte_t pte) -{ - struct npu_context *npu_context = mn_to_npu_context(mn); - mmio_invalidate(npu_context, address, PAGE_SIZE); -} - static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) @@ -932,7 +923,6 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { .release = pnv_npu2_mn_release, - .change_pte = pnv_npu2_mn_change_pte, .invalidate_range = pnv_npu2_mn_invalidate_range, }; From ca6d5149d2ad0a8d2f9c28cbe379802260a0a5e0 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 14 Feb 2019 11:08:29 +1100 Subject: [PATCH 086/200] powerpc/ptrace: Simplify vr_get/set() to avoid GCC warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC 8 warns about the logic in vr_get/set(), which with -Werror breaks the build: In function ‘user_regset_copyin’, inlined from ‘vr_set’ at arch/powerpc/kernel/ptrace.c:628:9: include/linux/regset.h:295:4: error: ‘memcpy’ offset [-527, -529] is out of the bounds [0, 16] of object ‘vrsave’ with type ‘union ’ [-Werror=array-bounds] arch/powerpc/kernel/ptrace.c: In function ‘vr_set’: arch/powerpc/kernel/ptrace.c:623:5: note: ‘vrsave’ declared here } vrsave; This has been identified as a regression in GCC, see GCC bug 88273. However we can avoid the warning and also simplify the logic and make it more robust.
Currently we pass -1 as end_pos to user_regset_copyout(). This says "copy up to the end of the regset". The definition of the regset is: [REGSET_VMX] = { .core_note_type = NT_PPC_VMX, .n = 34, .size = sizeof(vector128), .align = sizeof(vector128), .active = vr_active, .get = vr_get, .set = vr_set }, The end is calculated as (n * size), i.e. 34 * sizeof(vector128). In vr_get/set() we pass start_pos as 33 * sizeof(vector128), meaning we can copy up to sizeof(vector128) into/out-of vrsave. The on-stack vrsave is defined as: union { elf_vrreg_t reg; u32 word; } vrsave; And elf_vrreg_t is: typedef __vector128 elf_vrreg_t; So there is no bug, but we rely on all those sizes lining up, otherwise we would have a kernel stack exposure/overwrite on our hands. Rather than relying on that, we can pass an explicit end_pos based on the sizeof(vrsave). The result should be exactly the same but it's more obviously not over-reading/writing the stack and it avoids the compiler warning. Reported-by: Meelis Roos Reported-by: Mathieu Malaterre Cc: stable@vger.kernel.org Tested-by: Mathieu Malaterre Tested-by: Meelis Roos Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/ptrace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 7535f89e08cd..d9ac7d94656e 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -567,6 +567,7 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset, /* * Copy out only the low-order word of vrsave. */ + int start, end; union { elf_vrreg_t reg; u32 word; @@ -575,8 +576,10 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset, vrsave.word = target->thread.vrsave; + start = 33 * sizeof(vector128); + end = start + sizeof(vrsave); ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave, - 33 * sizeof(vector128), -1); + start, end); } return ret; @@ -614,6 +617,7 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset, /* * We use only the first word of vrsave. */ + int start, end; union { elf_vrreg_t reg; u32 word; @@ -622,8 +626,10 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset, vrsave.word = target->thread.vrsave; + start = 33 * sizeof(vector128); + end = start + sizeof(vrsave); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave, - 33 * sizeof(vector128), -1); + start, end); if (!ret) target->thread.vrsave = vrsave.word; } From 3d8810e02b7f811be3bc9ad2f433be4e245e8267 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 13 Feb 2019 16:45:09 +0530 Subject: [PATCH 087/200] powerpc/mm/hash: Increase vmalloc space to 512T with hash MMU This patch updates the kernel non-linear virtual map to 512TB when we're built with 64K page size and are using the hash MMU. We allocate one context for the vmalloc region and hence the max virtual area size is limited by the context map size (512TB for 64K and 64TB for 4K page size). This patch fixes boot failures with large amounts of system RAM where we need large vmalloc space to handle per cpu allocations.
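For reference, the arithmetic behind the new limits (assuming MAX_EA_BITS_PER_CONTEXT is 49 with 64K pages and 46 with 4K pages, which is what the 512TB/64TB figures in the hunk below imply):

    H_KERN_VIRT_SIZE = 1UL << 49 = 512TB        (64K pages)
    H_VMALLOC_SIZE   = 512TB - 8TB (IO region)  = 504TB
    H_KERN_VIRT_SIZE = 1UL << 46 = 64TB         (4K pages)
    H_VMALLOC_SIZE   = 64TB - 8TB (IO region)   = 56TB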
Signed-off-by: Michael Ellerman Signed-off-by: Aneesh Kumar K.V Tested-by: Aneesh Kumar K.V --- arch/powerpc/include/asm/book3s/64/hash.h | 32 ++++++++++++++++------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 247aff9cc6ba..54b7af6cd27f 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -40,22 +40,36 @@ #else #define H_PUD_CACHE_INDEX (H_PUD_INDEX_SIZE) #endif + /* - * Define the address range of the kernel non-linear virtual area + * Define the address range of the kernel non-linear virtual area. In contrast + * to the linear mapping, this is managed using the kernel page tables and then + * inserted into the hash page table to actually take effect, similarly to user + * mappings. */ #define H_KERN_VIRT_START ASM_CONST(0xD000000000000000) -#define H_KERN_VIRT_SIZE ASM_CONST(0x0000400000000000) /* 64T */ /* - * The vmalloc space starts at the beginning of that region, and - * occupies half of it on hash CPUs and a quarter of it on Book3E - * (we keep a quarter for the virtual memmap) + * Allow virtual mapping of one context size. + * 512TB for 64K page size + * 64TB for 4K page size */ -#define H_VMALLOC_START H_KERN_VIRT_START -#define H_VMALLOC_SIZE ASM_CONST(0x380000000000) /* 56T */ -#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) +#define H_KERN_VIRT_SIZE (1UL << MAX_EA_BITS_PER_CONTEXT) -#define H_KERN_IO_START H_VMALLOC_END +/* + * 8TB IO mapping size + */ +#define H_KERN_IO_SIZE ASM_CONST(0x80000000000) /* 8T */ + +/* + * The vmalloc space starts at the beginning of the kernel non-linear virtual + * region, and occupies 504T (64K) or 56T (4K) + */ +#define H_VMALLOC_START H_KERN_VIRT_START +#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE - H_KERN_IO_SIZE) +#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) + +#define H_KERN_IO_START H_VMALLOC_END /* * Region IDs From 7104dccfd052fde51eecc9972dad9c40bd3e0d11 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 15 Feb 2019 20:20:20 +1000 Subject: [PATCH 088/200] powerpc/64s/hash: Fix assert_slb_presence() use of the slbfee. instruction The slbfee. instruction must have bit 24 of RB clear; failure to do so can result in false negatives that lead to incorrect assertions. This is not obvious from the ISA v3.0B document, which only says: The hardware ignores the contents of RB 36:38 40:63 -- p.1032 This patch fixes the bug and also clears all other bits from PPC bits 36-63, which is good practice when dealing with reserved or ignored bits. Fixes: e15a4fea4dee ("powerpc/64s/hash: Add some SLB debugging tests") Cc: stable@vger.kernel.org # v4.20+ Reported-by: Aneesh Kumar K.V Tested-by: Aneesh Kumar K.V Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index bc3914d54e26..5986df48359b 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -69,6 +69,11 @@ static void assert_slb_presence(bool present, unsigned long ea) if (!cpu_has_feature(CPU_FTR_ARCH_206)) return; + /* + * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware + * ignores all other bits from 0-27, so just clear them all.
+ */ + ea &= ~((1UL << 28) - 1); asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0"); WARN_ON(present == (tmp == 0)); From d065ee93aab6ef4c2a5af5c455b5044bd5136547 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 15 Feb 2019 10:32:02 +0000 Subject: [PATCH 089/200] powerpc: drop unused GENERIC_CSUM Kconfig item Commit d4fde568a34a ("powerpc/64: Use optimized checksum routines on little-endian") converted the last powerpc user of GENERIC_CSUM. This patch does a final cleanup dropping the Kconfig GENERIC_CSUM option which is always 'n', and the associated piece of code in asm/checksum.h Fixes: d4fde568a34a ("powerpc/64: Use optimized checksum routines on little-endian") Reported-by: Christoph Hellwig Signed-off-by: Christophe Leroy Reviewed-by: Christoph Hellwig Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 3 --- arch/powerpc/include/asm/checksum.h | 4 ---- 2 files changed, 7 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8af6a7d93148..3db58fcfb0b2 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -247,9 +247,6 @@ config PPC_BARRIER_NOSPEC default y depends on PPC_BOOK3S_64 || PPC_FSL_BOOK3E -config GENERIC_CSUM - def_bool n - config EARLY_PRINTK bool default y diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h index a78a57e5058d..72a65d744a28 100644 --- a/arch/powerpc/include/asm/checksum.h +++ b/arch/powerpc/include/asm/checksum.h @@ -9,9 +9,6 @@ * 2 of the License, or (at your option) any later version. */ -#ifdef CONFIG_GENERIC_CSUM -#include -#else #include #include /* @@ -217,6 +214,5 @@ __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, __wsum sum); -#endif #endif /* __KERNEL__ */ #endif From 46ee7c3c5212b0f4f8713d60cfd595721efdf0d3 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 15 Feb 2019 11:48:11 +1100 Subject: [PATCH 090/200] powerpc/eeh: Use debugfs_create_u32 for eeh_max_freezes There's no need for the custom getter/setter functions so we should remove them in favour of using the generic one. While we're here, change the type of eeh_max_freezes to u32 and print the value in decimal rather than hex because printing it in hex makes no sense. Signed-off-by: Oliver O'Halloran Reviewed-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/eeh.h | 2 +- arch/powerpc/kernel/eeh.c | 21 +++------------------ 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 0b655810f32d..f3b3c3537792 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -219,7 +219,7 @@ struct eeh_ops { }; extern int eeh_subsystem_flags; -extern int eeh_max_freezes; +extern u32 eeh_max_freezes; extern struct eeh_ops *eeh_ops; extern raw_spinlock_t confirm_error_lock; diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 98d8755ac4c8..15e2734b4854 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -109,7 +109,7 @@ EXPORT_SYMBOL(eeh_subsystem_flags); * frozen count in last hour exceeds this limit, the PE will * be forced to be offline permanently.
*/ -int eeh_max_freezes = 5; +u32 eeh_max_freezes = 5; /* Platform dependent EEH operations */ struct eeh_ops *eeh_ops = NULL; @@ -1829,22 +1829,8 @@ static int eeh_enable_dbgfs_get(void *data, u64 *val) return 0; } -static int eeh_freeze_dbgfs_set(void *data, u64 val) -{ - eeh_max_freezes = val; - return 0; -} - -static int eeh_freeze_dbgfs_get(void *data, u64 *val) -{ - *val = eeh_max_freezes; - return 0; -} - DEFINE_DEBUGFS_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get, eeh_enable_dbgfs_set, "0x%llx\n"); -DEFINE_DEBUGFS_ATTRIBUTE(eeh_freeze_dbgfs_ops, eeh_freeze_dbgfs_get, - eeh_freeze_dbgfs_set, "0x%llx\n"); #endif static int __init eeh_init_proc(void) @@ -1855,9 +1841,8 @@ static int __init eeh_init_proc(void) debugfs_create_file_unsafe("eeh_enable", 0600, powerpc_debugfs_root, NULL, &eeh_enable_dbgfs_ops); - debugfs_create_file_unsafe("eeh_max_freezes", 0600, - powerpc_debugfs_root, NULL, - &eeh_freeze_dbgfs_ops); + debugfs_create_u32("eeh_max_freezes", 0600, + powerpc_debugfs_root, &eeh_max_freezes); #endif } From e67fbbec74220733971b88804be93528f5246434 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 15 Feb 2019 11:48:12 +1100 Subject: [PATCH 091/200] powerpc/eeh_cache: Add pr_debug() prints for insert/remove The EEH address cache is used to map a physical MMIO address back to a PCI device. It's useful to know when it's being manipulated, but currently this requires recompiling with #define DEBUG set. This is pointless since we have dynamic_debug nowadays, so remove the #ifdef guard and add a pr_debug() for the remove case too. Signed-off-by: Oliver O'Halloran Reviewed-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh_cache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index 201943d54a6e..b2c320e0fcef 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -157,10 +157,8 @@ eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo, piar->pcidev = dev; piar->flags = flags; -#ifdef DEBUG pr_debug("PIAR: insert range=[%pap:%pap] dev=%s\n", &alo, &ahi, pci_name(dev)); -#endif rb_link_node(&piar->rb_node, parent, p); rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root); @@ -240,6 +238,8 @@ restart: piar = rb_entry(n, struct pci_io_addr_range, rb_node); if (piar->pcidev == dev) { + pr_debug("PIAR: remove range=[%pap:%pap] dev=%s\n", + &piar->addr_lo, &piar->addr_hi, pci_name(dev)); rb_erase(n, &pci_io_addr_cache_root.rb_root); kfree(piar); goto restart; From 5ca85ae6318df34874999e3fd1760a88208e2a8e Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 15 Feb 2019 11:48:13 +1100 Subject: [PATCH 092/200] powerpc/eeh_cache: Add a way to dump the EEH address cache Adds a debugfs file that can be read to view the contents of the EEH address cache. This is pretty similar to the existing eeh_addr_cache_print() function, but that function is intended to debug issues inside of the kernel since it's #ifdef'ed out by default, and writes into the kernel log.
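Once applied, the cache can be inspected from userspace along these lines (hypothetical device and addresses; the output format follows the seq_printf() in the hunk below):

    # cat /sys/kernel/debug/powerpc/eeh_address_cache
    mem addr range [0x3fe080000000-0x3fe0807fffff]: 0000:01:00.0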
Signed-off-by: Oliver O'Halloran Reviewed-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/eeh.h | 3 +++ arch/powerpc/kernel/eeh.c | 1 + arch/powerpc/kernel/eeh_cache.c | 32 +++++++++++++++++++++++++++----- 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index f3b3c3537792..e42d643a20ac 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -460,6 +460,9 @@ static inline void eeh_readsl(const volatile void __iomem *addr, void * buf, eeh_check_failure(addr); } + +void eeh_cache_debugfs_init(void); + #endif /* CONFIG_PPC64 */ #endif /* __KERNEL__ */ #endif /* _POWERPC_EEH_H */ diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 15e2734b4854..8d36c50e906f 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1843,6 +1843,7 @@ static int __init eeh_init_proc(void) &eeh_enable_dbgfs_ops); debugfs_create_u32("eeh_max_freezes", 0600, powerpc_debugfs_root, &eeh_max_freezes); + eeh_cache_debugfs_init(); #endif } diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index b2c320e0fcef..5c5697cced41 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -298,9 +299,30 @@ void eeh_addr_cache_build(void) eeh_addr_cache_insert_dev(dev); eeh_sysfs_add_device(dev); } - -#ifdef DEBUG - /* Verify tree built up above, echo back the list of addrs. */ - eeh_addr_cache_print(&pci_io_addr_cache_root); -#endif +} + +static int eeh_addr_cache_show(struct seq_file *s, void *v) +{ + struct pci_io_addr_range *piar; + struct rb_node *n; + + spin_lock(&pci_io_addr_cache_root.piar_lock); + for (n = rb_first(&pci_io_addr_cache_root.rb_root); n; n = rb_next(n)) { + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + + seq_printf(s, "%s addr range [%pap-%pap]: %s\n", + (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", + &piar->addr_lo, &piar->addr_hi, pci_name(piar->pcidev)); + } + spin_unlock(&pci_io_addr_cache_root.piar_lock); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(eeh_addr_cache); + +void eeh_cache_debugfs_init(void) +{ + debugfs_create_file_unsafe("eeh_address_cache", 0400, + powerpc_debugfs_root, NULL, + &eeh_addr_cache_fops); } From c8f02f2108136a6218326fbcd4034b985849667c Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 15 Feb 2019 11:48:14 +1100 Subject: [PATCH 093/200] powerpc/eeh_cache: Bump log level of eeh_addr_cache_print() To use this function at all, #define DEBUG needs to be set in eeh_cache.c. Printing at pr_debug is probably not all that useful, since it adds the additional hurdle of requiring you to enable the debug print when dynamic_debug is in use, so this patch bumps it to pr_info. Signed-off-by: Oliver O'Halloran Reviewed-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index 5c5697cced41..9c68f0837385 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -114,7 +114,7 @@ static void eeh_addr_cache_print(struct pci_io_addr_cache *cache) while (n) { struct pci_io_addr_range *piar; piar = rb_entry(n, struct pci_io_addr_range, rb_node); - pr_debug("PCI: %s addr range %d [%pap-%pap]: %s\n", + pr_info("PCI: %s addr range %d [%pap-%pap]: %s\n", (piar->flags & IORESOURCE_IO) ?
"i/o" : "mem", cnt, &piar->addr_lo, &piar->addr_hi, pci_name(piar->pcidev)); cnt++; From 67060cb1ffa474c4fa1ae4db865ac1c7ed1fa899 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 15 Feb 2019 11:48:15 +1100 Subject: [PATCH 094/200] powerpc/pci: Add pci_find_controller_for_domain() Add a helper to find the pci_controller structure based on the domain number / phb id. Signed-off-by: Oliver O'Halloran Reviewed-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/pci-bridge.h | 2 ++ arch/powerpc/kernel/pci-common.c | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 98e8b46aff97..6c0039f3a3a6 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -273,6 +273,8 @@ extern int pcibios_map_io_space(struct pci_bus *bus); extern struct pci_controller *pci_find_hose_for_OF_device( struct device_node* node); +extern struct pci_controller *pci_find_controller_for_domain(int domain_nr); + /* Fill up host controller resources from the OF node */ extern void pci_process_bridge_OF_ranges(struct pci_controller *hose, struct device_node *dev, int primary); diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index cbdf13d86227..60f20c2e559a 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -351,6 +351,17 @@ struct pci_controller* pci_find_hose_for_OF_device(struct device_node* node) return NULL; } +struct pci_controller *pci_find_controller_for_domain(int domain_nr) +{ + struct pci_controller *hose; + + list_for_each_entry(hose, &hose_list, list_node) + if (hose->global_number == domain_nr) + return hose; + + return NULL; +} + /* * Reads the interrupt pin to determine if interrupt is use by card. * If the interrupt is used, then gets the interrupt line from the From 6b493f6079a430fd41f66933b68d1bb1ad37ca8c Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 15 Feb 2019 11:48:16 +1100 Subject: [PATCH 095/200] powerpc/eeh: Allow disabling recovery Currently when we detect an error we automatically invoke the EEH recovery handler. This can be annoying when debugging EEH problems, or when working on EEH itself so this patch adds a debugfs knob that will prevent a recovery event from being queued up when an issue is detected. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/eeh.h | 1 + arch/powerpc/kernel/eeh.c | 10 ++++++++++ arch/powerpc/kernel/eeh_event.c | 9 +++++++++ 3 files changed, 20 insertions(+) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index e42d643a20ac..94cfcf33030a 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -220,6 +220,7 @@ struct eeh_ops { extern int eeh_subsystem_flags; extern u32 eeh_max_freezes; +extern bool eeh_debugfs_no_recover; extern struct eeh_ops *eeh_ops; extern raw_spinlock_t confirm_error_lock; diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 8d36c50e906f..0996f22b2612 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -111,6 +111,13 @@ EXPORT_SYMBOL(eeh_subsystem_flags); */ u32 eeh_max_freezes = 5; +/* + * Controls whether a recovery event should be scheduled when an + * isolated device is discovered. This is only really useful for + * debugging problems with the EEH core. 
+ */ +bool eeh_debugfs_no_recover; /* Platform dependent EEH operations */ struct eeh_ops *eeh_ops = NULL; @@ -1843,6 +1850,9 @@ static int __init eeh_init_proc(void) &eeh_enable_dbgfs_ops); debugfs_create_u32("eeh_max_freezes", 0600, powerpc_debugfs_root, &eeh_max_freezes); + debugfs_create_bool("eeh_disable_recovery", 0600, + powerpc_debugfs_root, + &eeh_debugfs_no_recover); eeh_cache_debugfs_init(); #endif } diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index 227e57f980df..19837798bb1d 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -126,6 +126,15 @@ int eeh_send_failure_event(struct eeh_pe *pe) unsigned long flags; struct eeh_event *event; + /* + * If we've manually supressed recovery events via debugfs + * then just drop it on the floor. + */ + if (eeh_debugfs_no_recover) { + pr_err("EEH: Event dropped due to no_recover setting\n"); + return 0; + } + event = kzalloc(sizeof(*event), GFP_ATOMIC); if (!event) { pr_err("EEH: out of memory, event not handled\n"); From 954bd99435b8ba99e86665d6a2ec1baa1d128325 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 15 Feb 2019 11:48:17 +1100 Subject: [PATCH 096/200] powerpc/eeh: Add eeh_force_recover to debugfs This patch adds a debugfs interface to force scheduling a recovery event. This can be used to recover a specific PE or schedule a "special" recovery event that checks for errors at the PHB level. To force a recovery of a normal PE, use: echo '<#phb>:<#pe>' > /sys/kernel/debug/powerpc/eeh_force_recover To force a scan for broken PHBs: echo 'hwcheck' > /sys/kernel/debug/powerpc/eeh_force_recover Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/eeh_event.h | 1 + arch/powerpc/kernel/eeh.c | 59 ++++++++++++++++++++++++++++ arch/powerpc/kernel/eeh_event.c | 25 +++++++----- 3 files changed, 75 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index 9884e872686f..6d0412b846ac 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -33,6 +33,7 @@ struct eeh_event { int eeh_event_init(void); int eeh_send_failure_event(struct eeh_pe *pe); +int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); void eeh_handle_normal_event(struct eeh_pe *pe); void eeh_handle_special_event(void); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 0996f22b2612..289c0b37d845 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1838,6 +1838,62 @@ static int eeh_enable_dbgfs_get(void *data, u64 *val) DEFINE_DEBUGFS_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get, eeh_enable_dbgfs_set, "0x%llx\n"); + +static ssize_t eeh_force_recover_write(struct file *filp, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct pci_controller *hose; + uint32_t phbid, pe_no; + struct eeh_pe *pe; + char buf[20]; + int ret; + + ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count); + if (!ret) + return -EFAULT; + + /* + * When PE is NULL the event is a "special" event. Rather than + * recovering a specific PE it forces the EEH core to scan for failed + * PHBs and recovers each. This needs to be done before any device + * recoveries can occur.
+ */ + if (!strncmp(buf, "hwcheck", 7)) { + __eeh_send_failure_event(NULL); + return count; + } + + ret = sscanf(buf, "%x:%x", &phbid, &pe_no); + if (ret != 2) + return -EINVAL; + + hose = pci_find_controller_for_domain(phbid); + if (!hose) + return -ENODEV; + + /* Retrieve PE */ + pe = eeh_pe_get(hose, pe_no, 0); + if (!pe) + return -ENODEV; + + /* + * We don't do any state checking here since the detection + * process is async to the recovery process. The recovery + * thread *should* not break even if we schedule a recovery + * from an odd state (e.g. PE removed, or recovery of a + * non-isolated PE) + */ + __eeh_send_failure_event(pe); + + return ret < 0 ? ret : count; +} + +static const struct file_operations eeh_force_recover_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = eeh_force_recover_write, +}; #endif static int __init eeh_init_proc(void) @@ -1853,6 +1909,9 @@ static int __init eeh_init_proc(void) debugfs_create_bool("eeh_disable_recovery", 0600, powerpc_debugfs_root, &eeh_debugfs_no_recover); + debugfs_create_file_unsafe("eeh_force_recover", 0600, + powerpc_debugfs_root, NULL, + &eeh_force_recover_fops); eeh_cache_debugfs_init(); #endif } diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index 19837798bb1d..539aca055d70 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -121,20 +121,11 @@ int eeh_event_init(void) * the actual event will be delivered in a normal context * (from a workqueue). */ -int eeh_send_failure_event(struct eeh_pe *pe) +int __eeh_send_failure_event(struct eeh_pe *pe) { unsigned long flags; struct eeh_event *event; - /* - * If we've manually supressed recovery events via debugfs - * then just drop it on the floor. - */ - if (eeh_debugfs_no_recover) { - pr_err("EEH: Event dropped due to no_recover setting\n"); - return 0; - } - event = kzalloc(sizeof(*event), GFP_ATOMIC); if (!event) { pr_err("EEH: out of memory, event not handled\n"); @@ -153,6 +144,20 @@ int eeh_send_failure_event(struct eeh_pe *pe) return 0; } +int eeh_send_failure_event(struct eeh_pe *pe) +{ + /* + * If we've manually supressed recovery events via debugfs + * then just drop it on the floor. + */ + if (eeh_debugfs_no_recover) { + pr_err("EEH: Event dropped due to no_recover setting\n"); + return 0; + } + + return __eeh_send_failure_event(pe); +} + /** * eeh_remove_event - Remove EEH event from the queue * @pe: Event binding to the PE From aa7150ba378650d0e9d84b8e4d805946965a5926 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 7 Feb 2019 13:43:26 +1100 Subject: [PATCH 097/200] powerpc/44x: Force PCI on for CURRITUCK The recent rework of PCI kconfig symbols exposed an existing bug in the CURRITUCK kconfig logic. It selects PPC4xx_PCI_EXPRESS which depends on PCI, but PCI is user selectable and might be disabled, leading to a warning: WARNING: unmet direct dependencies detected for PPC4xx_PCI_EXPRESS Depends on [n]: PCI [=n] && 4xx [=y] Selected by [y]: - CURRITUCK [=y] && PPC_47x [=y] Prior to commit eb01d42a7778 ("PCI: consolidate PCI config entry in drivers/pci") PCI was enabled by default for currituck_defconfig so we didn't see the warning. The bad logic was still there, it just required someone disabling PCI in their .config to hit it. Fix it by forcing PCI on for CURRITUCK, which seems was always the expectation anyway. 
Fixes: eb01d42a7778 ("PCI: consolidate PCI config entry in drivers/pci")
Reported-by: Randy Dunlap
Reviewed-by: Christoph Hellwig
Signed-off-by: Michael Ellerman
---
 arch/powerpc/platforms/44x/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig
index 4a9a72d01c3c..35be81fd2dc2 100644
--- a/arch/powerpc/platforms/44x/Kconfig
+++ b/arch/powerpc/platforms/44x/Kconfig
@@ -180,6 +180,7 @@ config CURRITUCK
 	depends on PPC_47x
 	select SWIOTLB
 	select 476FPE
+	select FORCE_PCI
 	select PPC4xx_PCI_EXPRESS
 	help
 	  This option enables support for the IBM Currituck (476fpe) evaluation board

From 9f3ba362d84786af8e1ab36a32fb337882b8648c Mon Sep 17 00:00:00 2001
From: Tyrel Datwyler
Date: Sat, 8 Dec 2018 17:48:27 -0600
Subject: [PATCH 098/200] powerpc/pseries: export timebase register sample in
 lparcfg

The Processor Utilization of Resource Registers (PURR) provide an
estimate of resources used by a cpu thread. Section 7.6 in Book III of
the ISA outlines how to calculate the percentage of shared resources
for threads using the ratio of the PURR delta and Timebase Register
delta for a sampled period.

This calculation is currently done erroneously by the lparstat tool
from the powerpc-utils package. This patch exports the current timebase
value after we sample the PURRs and exposes it to userspace accounting
tools via /proc/ppc64/lparcfg.

Signed-off-by: Tyrel Datwyler
Signed-off-by: Michael Ellerman
---
 arch/powerpc/platforms/pseries/lparcfg.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
index 794487313cc8..e73c7e30efe6 100644
--- a/arch/powerpc/platforms/pseries/lparcfg.c
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -475,6 +475,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
 		splpar_dispatch_data(m);

 		seq_printf(m, "purr=%ld\n", get_purr());
+		seq_printf(m, "tbr=%ld\n", mftb());
 	} else {		/* non SPLPAR case */

 		seq_printf(m, "system_active_processors=%d\n",

From fe1ef6bcdb4fca33434256a802a3ed6aacf0bd2f Mon Sep 17 00:00:00 2001
From: Mark Cave-Ayland
Date: Fri, 8 Feb 2019 14:33:19 +0000
Subject: [PATCH 099/200] powerpc: Fix 32-bit KVM-PR lockup and host crash
 with MacOS guest

Commit 8792468da5e1 "powerpc: Add the ability to save FPU without
giving it up" unexpectedly removed the MSR_FE0 and MSR_FE1 bits from
the bitmask used to update the MSR of the previous thread in
__giveup_fpu() causing a KVM-PR MacOS guest to lock up and panic the
host kernel.

Leaving FE0/1 enabled means unrelated processes might receive FPEs
when they're not expecting them and crash. In particular if this
happens to init the host will then panic.

eg (transcribed):
  qemu-system-ppc[837]: unhandled signal 8 at 12cc9ce4 nip 12cc9ce4 lr 12cc9ca4 code 0
  systemd[1]: unhandled signal 8 at 202f02e0 nip 202f02e0 lr 001003d4 code 0
  Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b

Reinstate these bits to the MSR bitmask to enable MacOS guests to run
under 32-bit KVM-PR once again without issue.
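For background on why leaking these bits matters: FE0/FE1 select the
thread's floating-point exception mode, so inheriting another thread's
setting changes when (and whether) FP exceptions are delivered. A rough
summary of the encodings from the ISA (paraphrased here for reference):

	/*
	 * FE0 FE1  floating-point exception mode
	 *  0   0   exceptions disabled (ignored)
	 *  0   1   imprecise nonrecoverable
	 *  1   0   imprecise recoverable
	 *  1   1   precise
	 */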
Fixes: 8792468da5e1 ("powerpc: Add the ability to save FPU without giving it up") Cc: stable@vger.kernel.org # v4.6+ Signed-off-by: Mark Cave-Ayland Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index ce393df243aa..71bad4b6f80d 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -176,7 +176,7 @@ static void __giveup_fpu(struct task_struct *tsk) save_fpu(tsk); msr = tsk->thread.regs->msr; - msr &= ~MSR_FP; + msr &= ~(MSR_FP|MSR_FE0|MSR_FE1); #ifdef CONFIG_VSX if (cpu_has_feature(CPU_FTR_VSX)) msr &= ~MSR_VSX; From c746ca00f5eac6224eda02f39ebdc48fabfad3c5 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 14 Feb 2019 12:15:40 +0530 Subject: [PATCH 100/200] powerpc/book3s: Remove pgd/pud/pmd_set() interfaces When updating page tables, we need to make sure we fill the page table entry valid bits. We do this by or'ing in one of PGD/PUD/PMD_VAL_BITS. The page table 'set' interfaces allow updating the raw value of page table entries without setting the valid bits, so remove those interfaces to avoid incorrect usage in future. Signed-off-by: Aneesh Kumar K.V [mpe: Reword commit message based on mailing list discussion] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/pgalloc.h | 8 ++++---- arch/powerpc/include/asm/book3s/64/pgtable.h | 14 -------------- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index 9c1173283b96..138bc2ecc0c4 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -111,7 +111,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) { - pgd_set(pgd, __pgtable_ptr_val(pud) | PGD_VAL_BITS); + *pgd = __pgd(__pgtable_ptr_val(pud) | PGD_VAL_BITS); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) @@ -138,7 +138,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { - pud_set(pud, __pgtable_ptr_val(pmd) | PUD_VAL_BITS); + *pud = __pud(__pgtable_ptr_val(pmd) | PUD_VAL_BITS); } static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, @@ -176,13 +176,13 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { - pmd_set(pmd, __pgtable_ptr_val(pte) | PMD_VAL_BITS); + *pmd = __pmd(__pgtable_ptr_val(pte) | PMD_VAL_BITS); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { - pmd_set(pmd, __pgtable_ptr_val(pte_page) | PMD_VAL_BITS); + *pmd = __pmd(__pgtable_ptr_val(pte_page) | PMD_VAL_BITS); } static inline pgtable_t pmd_pgtable(pmd_t pmd) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 404e0f48f3f3..49c2c2888274 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -851,11 +851,6 @@ static inline bool pte_ci(pte_t pte) return false; } -static inline void pmd_set(pmd_t *pmdp, unsigned long val) -{ - *pmdp = __pmd(val); -} - static inline void pmd_clear(pmd_t *pmdp) { *pmdp = __pmd(0); @@ -887,11 +882,6 @@ static inline int pmd_bad(pmd_t pmd) return 
hash__pmd_bad(pmd); } -static inline void pud_set(pud_t *pudp, unsigned long val) -{ - *pudp = __pud(val); -} - static inline void pud_clear(pud_t *pudp) { *pudp = __pud(0); @@ -934,10 +924,6 @@ static inline bool pud_access_permitted(pud_t pud, bool write) } #define pgd_write(pgd) pte_write(pgd_pte(pgd)) -static inline void pgd_set(pgd_t *pgdp, unsigned long val) -{ - *pgdp = __pgd(val); -} static inline void pgd_clear(pgd_t *pgdp) { From 5c285dd76c7f022dbde3f617efd4f0b8b8ebaeb7 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Sat, 2 Sep 2017 02:47:26 -0400 Subject: [PATCH 101/200] powerpc/dts: Standardize DTS status assignments from "ok" to "okay" While the current kernel drivers/of/ code allows developers to be sloppy and use a DTS status value of "ok", the current DTSpec 0.1 makes it clear that the proper spelling is "okay", so fix the small number of PowerPC .dts files that do this. Signed-off-by: Robert P. J. Day Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/akebono.dts | 2 +- arch/powerpc/boot/dts/bluestone.dts | 2 +- arch/powerpc/boot/dts/currituck.dts | 2 +- arch/powerpc/boot/dts/iss4xx-mpic.dts | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/boot/dts/akebono.dts b/arch/powerpc/boot/dts/akebono.dts index 8a7a10139bc9..cd9d66041a3f 100644 --- a/arch/powerpc/boot/dts/akebono.dts +++ b/arch/powerpc/boot/dts/akebono.dts @@ -40,7 +40,7 @@ d-cache-size = <32768>; dcr-controller; dcr-access-method = "native"; - status = "ok"; + status = "okay"; }; cpu@1 { device_type = "cpu"; diff --git a/arch/powerpc/boot/dts/bluestone.dts b/arch/powerpc/boot/dts/bluestone.dts index b0b26d8d68a2..64eaf7e09d22 100644 --- a/arch/powerpc/boot/dts/bluestone.dts +++ b/arch/powerpc/boot/dts/bluestone.dts @@ -109,7 +109,7 @@ OCM: ocm@400040000 { compatible = "ibm,ocm"; - status = "ok"; + status = "okay"; cell-index = <1>; /* configured in U-Boot */ reg = <4 0x00040000 0x8000>; /* 32K */ diff --git a/arch/powerpc/boot/dts/currituck.dts b/arch/powerpc/boot/dts/currituck.dts index a04a4fcfde63..b6d87b9c2cef 100644 --- a/arch/powerpc/boot/dts/currituck.dts +++ b/arch/powerpc/boot/dts/currituck.dts @@ -39,7 +39,7 @@ d-cache-size = <32768>; dcr-controller; dcr-access-method = "native"; - status = "ok"; + status = "okay"; }; cpu@1 { device_type = "cpu"; diff --git a/arch/powerpc/boot/dts/iss4xx-mpic.dts b/arch/powerpc/boot/dts/iss4xx-mpic.dts index f7063198b2dc..c9f90f1a9c8e 100644 --- a/arch/powerpc/boot/dts/iss4xx-mpic.dts +++ b/arch/powerpc/boot/dts/iss4xx-mpic.dts @@ -43,7 +43,7 @@ d-cache-size = <32768>; dcr-controller; dcr-access-method = "native"; - status = "ok"; + status = "okay"; }; cpu@1 { device_type = "cpu"; From 81dac817786263bb44cd5a200a07eff346f78e31 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 15 Jan 2019 17:37:36 +1100 Subject: [PATCH 102/200] powerpc/64: Make sys_switch_endian() traceable We weren't using SYSCALL_DEFINE for sys_switch_endian(), which means it wasn't able to be traced by CONFIG_FTRACE_SYSCALLS. By using the macro we create the right metadata and the syscall is visible. eg: # cd /sys/kernel/debug/tracing # echo 1 | tee events/syscalls/sys_*_switch_endian/enable # ~/switch_endian_test # cat trace ... switch_endian_t-3604 [009] .... 315.175164: sys_switch_endian() switch_endian_t-3604 [009] .... 315.175167: sys_switch_endian -> 0x5555aaaa5555aaaa switch_endian_t-3604 [009] .... 315.175169: sys_switch_endian() switch_endian_t-3604 [009] .... 
315.175169: sys_switch_endian -> 0x5555aaaa5555aaaa Fixes: 529d235a0e19 ("powerpc: Add a proper syscall for switching endianness") Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/syscalls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index e6982ab21816..e52a8878c2fb 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -123,7 +123,7 @@ long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, (u64)len_high << 32 | len_low, advice); } -long sys_switch_endian(void) +SYSCALL_DEFINE0(switch_endian) { struct thread_info *ti; From bba436309d579f306419e0bffe8a28a5935cb17a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 8 Feb 2019 23:34:16 +1100 Subject: [PATCH 103/200] powerpc: Make PPC_64K_PAGES depend on only 44x or PPC_BOOK3S_64 In commit 7820856a4fcd ("powerpc/mm/book3e/64: Remove unsupported 64Kpage size from 64bit booke") we dropped the 64K page size support from the 64-bit nohash (Book3E) code. But we didn't update the dependencies of the PPC_64K_PAGES option, meaning a randconfig can still trigger this code and cause a build breakage, eg: arch/powerpc/include/asm/nohash/64/pgtable.h:14:2: error: #error "Page size not supported" arch/powerpc/include/asm/nohash/mmu-book3e.h:275:2: error: #error Unsupported page size So remove PPC_BOOK3E_64 from the dependencies. This also means we don't need to worry about PPC_FSL_BOOK3E, because that was just trying to prevent the PPC_BOOK3E_64=y && PPC_FSL_BOOK3E=y case. Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3db58fcfb0b2..3d5d63c9b797 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -686,7 +686,7 @@ config PPC_16K_PAGES config PPC_64K_PAGES bool "64k page size" - depends on !PPC_FSL_BOOK3E && (44x || PPC_BOOK3S_64 || PPC_BOOK3E_64) + depends on 44x || PPC_BOOK3S_64 select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64 config PPC_256K_PAGES From 1b5fc84aba170bdfe3533396ca9662ceea1609b7 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 26 Nov 2018 12:01:05 +1000 Subject: [PATCH 104/200] powerpc/smp: Fix NMI IPI timeout The NMI IPI timeout logic is broken, if __smp_send_nmi_ipi() times out on the first condition, delay_us will be zero which will send it into the second spin loop with no timeout so it will spin forever. 
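In outline, the bug looks like this (a minimal C sketch; the condition
helpers are placeholders, not the real kernel code):

	/*
	 * delay_us == 0 means "wait indefinitely", so once the first
	 * loop breaks out with delay_us decremented to 0, the second
	 * loop can never time out.
	 */
	while (!all_cpus_called_in()) {		/* placeholder condition */
		udelay(1);
		if (delay_us) {
			delay_us--;
			if (!delay_us)
				break;		/* leaves delay_us == 0 */
		}
	}
	while (!all_fns_completed()) {		/* placeholder condition */
		udelay(1);
		if (delay_us) {			/* always false now */
			delay_us--;
			if (!delay_us)
				break;
		}
	}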
Fixes: 5b73151fff63 ("powerpc: NMI IPI make NMI IPIs fully sychronous")
Cc: stable@vger.kernel.org # v4.19+
Signed-off-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/smp.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 3f15edf25a0d..137196a4248b 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -519,7 +519,7 @@ int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool
 		if (delay_us) {
 			delay_us--;
 			if (!delay_us)
-				break;
+				goto timeout;
 		}
 	}

@@ -530,10 +530,11 @@ int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool
 		if (delay_us) {
 			delay_us--;
 			if (!delay_us)
-				break;
+				goto timeout;
 		}
 	}

+timeout:
 	if (!cpumask_empty(&nmi_ipi_pending_mask)) {
 		/* Timeout waiting for CPUs to call smp_handle_nmi_ipi */
 		ret = 0;

From 88b9a3d1425a436e95c41f09986fdae2daee437a Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Mon, 26 Nov 2018 12:01:06 +1000
Subject: [PATCH 105/200] powerpc/smp: Fix NMI IPI xmon timeout

The xmon debugger IPI handler waits in the callback function while
xmon is still active. This means they don't complete the IPI, and the
initiator always times out waiting for them.

Things manage to work after the timeout because there is some fallback
logic to keep NMI IPI state sane in case of the timeout, but this is a
bit ugly.

This patch changes NMI IPI back to half-asynchronous (i.e., wait for
everyone to call in, do not wait for IPI function to complete), but the
complexity is avoided by going one step further and allowing new IPIs
to be issued before the IPI functions have all completed.

If synchronization against that is required, it is left up to the
caller, but current callers don't require that. In fact with the
timeout handling, callers must be able to cope with this already.

Fixes: 5b73151fff63 ("powerpc: NMI IPI make NMI IPIs fully sychronous")
Cc: stable@vger.kernel.org # v4.19+
Signed-off-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/smp.c | 93 ++++++++++++---------------------------
 1 file changed, 29 insertions(+), 64 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 137196a4248b..6e521a3f67ca 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -358,13 +358,12 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 * NMI IPIs may not be recoverable, so should not be used as ongoing part of
 * a running system. They can be used for crash, debug, halt/reboot, etc.
 *
- * NMI IPIs are globally single threaded. No more than one in progress at
- * any time.
- *
 * The IPI call waits with interrupts disabled until all targets enter the
- * NMI handler, then the call returns.
+ * NMI handler, then returns. Subsequent IPIs can be issued before targets
+ * have returned from their handlers, so there is no guarantee about
+ * concurrency or re-entrancy.
 *
- * No new NMI can be initiated until targets exit the handler.
+ * A new NMI can be issued before all targets exit the handler.
 *
 * The IPI call may time out without all targets entering the NMI handler.
* In that case, there is some logic to recover (and ignore subsequent @@ -375,7 +374,7 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask) static atomic_t __nmi_ipi_lock = ATOMIC_INIT(0); static struct cpumask nmi_ipi_pending_mask; -static int nmi_ipi_busy_count = 0; +static bool nmi_ipi_busy = false; static void (*nmi_ipi_function)(struct pt_regs *) = NULL; static void nmi_ipi_lock_start(unsigned long *flags) @@ -414,7 +413,7 @@ static void nmi_ipi_unlock_end(unsigned long *flags) */ int smp_handle_nmi_ipi(struct pt_regs *regs) { - void (*fn)(struct pt_regs *); + void (*fn)(struct pt_regs *) = NULL; unsigned long flags; int me = raw_smp_processor_id(); int ret = 0; @@ -425,29 +424,17 @@ int smp_handle_nmi_ipi(struct pt_regs *regs) * because the caller may have timed out. */ nmi_ipi_lock_start(&flags); - if (!nmi_ipi_busy_count) - goto out; - if (!cpumask_test_cpu(me, &nmi_ipi_pending_mask)) - goto out; - - fn = nmi_ipi_function; - if (!fn) - goto out; - - cpumask_clear_cpu(me, &nmi_ipi_pending_mask); - nmi_ipi_busy_count++; - nmi_ipi_unlock(); - - ret = 1; - - fn(regs); - - nmi_ipi_lock(); - if (nmi_ipi_busy_count > 1) /* Can race with caller time-out */ - nmi_ipi_busy_count--; -out: + if (cpumask_test_cpu(me, &nmi_ipi_pending_mask)) { + cpumask_clear_cpu(me, &nmi_ipi_pending_mask); + fn = READ_ONCE(nmi_ipi_function); + WARN_ON_ONCE(!fn); + ret = 1; + } nmi_ipi_unlock_end(&flags); + if (fn) + fn(regs); + return ret; } @@ -473,7 +460,7 @@ static void do_smp_send_nmi_ipi(int cpu, bool safe) * - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS. * - fn is the target callback function. * - delay_us > 0 is the delay before giving up waiting for targets to - * complete executing the handler, == 0 specifies indefinite delay. + * begin executing the handler, == 0 specifies indefinite delay. 
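+ * (With the udelay(1) polling loop below, delay_us is roughly a
+ * microsecond count, so e.g. delay_us == 1000000 gives targets about
+ * one second to enter the handler.)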
*/ int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool safe) { @@ -487,31 +474,33 @@ int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool if (unlikely(!smp_ops)) return 0; - /* Take the nmi_ipi_busy count/lock with interrupts hard disabled */ nmi_ipi_lock_start(&flags); - while (nmi_ipi_busy_count) { + while (nmi_ipi_busy) { nmi_ipi_unlock_end(&flags); - spin_until_cond(nmi_ipi_busy_count == 0); + spin_until_cond(!nmi_ipi_busy); nmi_ipi_lock_start(&flags); } - + nmi_ipi_busy = true; nmi_ipi_function = fn; + WARN_ON_ONCE(!cpumask_empty(&nmi_ipi_pending_mask)); + if (cpu < 0) { /* ALL_OTHERS */ cpumask_copy(&nmi_ipi_pending_mask, cpu_online_mask); cpumask_clear_cpu(me, &nmi_ipi_pending_mask); } else { - /* cpumask starts clear */ cpumask_set_cpu(cpu, &nmi_ipi_pending_mask); } - nmi_ipi_busy_count++; + nmi_ipi_unlock(); + /* Interrupts remain hard disabled */ + do_smp_send_nmi_ipi(cpu, safe); nmi_ipi_lock(); - /* nmi_ipi_busy_count is held here, so unlock/lock is okay */ + /* nmi_ipi_busy is set here, so unlock/lock is okay */ while (!cpumask_empty(&nmi_ipi_pending_mask)) { nmi_ipi_unlock(); udelay(1); @@ -519,34 +508,19 @@ int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool if (delay_us) { delay_us--; if (!delay_us) - goto timeout; + break; } } - while (nmi_ipi_busy_count > 1) { - nmi_ipi_unlock(); - udelay(1); - nmi_ipi_lock(); - if (delay_us) { - delay_us--; - if (!delay_us) - goto timeout; - } - } - -timeout: if (!cpumask_empty(&nmi_ipi_pending_mask)) { /* Timeout waiting for CPUs to call smp_handle_nmi_ipi */ ret = 0; cpumask_clear(&nmi_ipi_pending_mask); } - if (nmi_ipi_busy_count > 1) { - /* Timeout waiting for CPUs to execute fn */ - ret = 0; - nmi_ipi_busy_count = 1; - } - nmi_ipi_busy_count--; + nmi_ipi_function = NULL; + nmi_ipi_busy = false; + nmi_ipi_unlock_end(&flags); return ret; @@ -614,17 +588,8 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)) static void nmi_stop_this_cpu(struct pt_regs *regs) { /* - * This is a special case because it never returns, so the NMI IPI - * handling would never mark it as done, which makes any later - * smp_send_nmi_ipi() call spin forever. Mark it done now. - * * IRQs are already hard disabled by the smp_handle_nmi_ipi. */ - nmi_ipi_lock(); - if (nmi_ipi_busy_count > 1) - nmi_ipi_busy_count--; - nmi_ipi_unlock(); - spin_begin(); while (1) spin_cpu_relax(); From 6fe243fe5157076f3b8d88a02f064b41a4b7eec2 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 26 Nov 2018 12:01:07 +1000 Subject: [PATCH 106/200] powerpc/smp: Make __smp_send_nmi_ipi() static Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/smp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 6e521a3f67ca..5366d9e7bed4 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -462,7 +462,8 @@ static void do_smp_send_nmi_ipi(int cpu, bool safe) * - delay_us > 0 is the delay before giving up waiting for targets to * begin executing the handler, == 0 specifies indefinite delay. 
 */
-int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool safe)
+static int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *),
+				u64 delay_us, bool safe)
 {
 	unsigned long flags;
 	int me = raw_smp_processor_id();

From 8cfaf106918a8c13abb24c641556172afbb9545c Mon Sep 17 00:00:00 2001
From: Michael Ellerman
Date: Mon, 11 Feb 2019 11:20:01 +1100
Subject: [PATCH 107/200] powerpc/64s: Fix logic when handling unknown CPU
 features
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In cpufeatures_process_feature(), if a provided CPU feature is unknown
and enable_unknown is false, we erroneously print that the feature is
being enabled and return true, even though no feature has been enabled,
and may also set feature bits based on the last entry in the match
table.

Fix this so that we only set feature bits from the match table if we
have actually enabled a feature from that table, and when failing to
enable an unknown feature, always print the "not enabling" message and
return false.

Coincidentally, some older gccs emit a spurious "may be used
uninitialized" warning here for the match table pointer (used for
cpu_ftr_bit_mask) when building with -fsanitize-coverage=trace-pc. An
upcoming patch will enable support for kcov, which requires this
option. This patch avoids the warning.

Fixes: 5a61ef74f269 ("powerpc/64s: Support new device tree binding for discovering CPU features")
Reported-by: Segher Boessenkool
Signed-off-by: Michael Ellerman
[ajd: add commit message]
Signed-off-by: Andrew Donnellan
---
 arch/powerpc/kernel/dt_cpu_ftrs.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 8be3721d9302..e49bd5efcfe6 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -666,8 +666,10 @@ static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f)
 		m = &dt_cpu_feature_match_table[i];
 		if (!strcmp(f->name, m->name)) {
 			known = true;
-			if (m->enable(f))
+			if (m->enable(f)) {
+				cur_cpu_spec->cpu_features |= m->cpu_ftr_bit_mask;
 				break;
+			}

 			pr_info("not enabling: %s (disabled or unsupported by kernel)\n",
 				f->name);
@@ -675,17 +677,12 @@ static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f)
 		}
 	}

-	if (!known && enable_unknown) {
-		if (!feat_try_enable_unknown(f)) {
-			pr_info("not enabling: %s (unknown and unsupported by kernel)\n",
-				f->name);
-			return false;
-		}
+	if (!known && (!enable_unknown || !feat_try_enable_unknown(f))) {
+		pr_info("not enabling: %s (unknown and unsupported by kernel)\n",
+			f->name);
+		return false;
 	}

-	if (m->cpu_ftr_bit_mask)
-		cur_cpu_spec->cpu_features |= m->cpu_ftr_bit_mask;
-
 	if (known)
 		pr_debug("enabling: %s\n", f->name);
 	else

From 19f8a5b5be2898573a5e1dc1db93e8d40117606a Mon Sep 17 00:00:00 2001
From: Paul Mackerras
Date: Tue, 12 Feb 2019 11:58:29 +1100
Subject: [PATCH 108/200] powerpc/powernv: Don't reprogram SLW image on every
 KVM guest entry/exit

Commit 24be85a23d1f ("powerpc/powernv: Clear PECE1 in LPCR via stop-api
only on Hotplug", 2017-07-21) added two calls to opal_slw_set_reg()
inside pnv_cpu_offline(), with the aim of changing the LPCR value in
the SLW image to disable wakeups from the decrementer while a CPU is
offline. However, pnv_cpu_offline() gets called each time a secondary
CPU thread is woken up to participate in running a KVM guest, that is,
not just when a CPU is offlined.
Since opal_slw_set_reg() is a very slow operation (with observed execution times around 20 milliseconds), this means that an offline secondary CPU can often be busy doing the opal_slw_set_reg() call when the primary CPU wants to grab all the secondary threads so that it can run a KVM guest. This leads to messages like "KVM: couldn't grab CPU n" being printed and guest execution failing. There is no need to reprogram the SLW image on every KVM guest entry and exit. So that we do it only when a CPU is really transitioning between online and offline, this moves the calls to pnv_program_cpu_hotplug_lpcr() into pnv_smp_cpu_kill_self(). Fixes: 24be85a23d1f ("powerpc/powernv: Clear PECE1 in LPCR via stop-api only on Hotplug") Cc: stable@vger.kernel.org # v4.14+ Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/powernv.h | 2 ++ arch/powerpc/platforms/powernv/idle.c | 27 ++------------------------- arch/powerpc/platforms/powernv/smp.c | 25 +++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h index 362ea12a4501..05b552418519 100644 --- a/arch/powerpc/include/asm/powernv.h +++ b/arch/powerpc/include/asm/powernv.h @@ -23,6 +23,8 @@ extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, unsigned long *flags, unsigned long *status, int count); +void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val); + void pnv_tm_init(void); #else static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { } diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 35f699ebb662..e52f9b06dd9c 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -458,7 +458,8 @@ EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_release); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_HOTPLUG_CPU -static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) + +void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) { u64 pir = get_hard_smp_processor_id(cpu); @@ -481,20 +482,6 @@ unsigned long pnv_cpu_offline(unsigned int cpu) { unsigned long srr1; u32 idle_states = pnv_get_supported_cpuidle_states(); - u64 lpcr_val; - - /* - * We don't want to take decrementer interrupts while we are - * offline, so clear LPCR:PECE1. We keep PECE2 (and - * LPCR_PECE_HVEE on P9) enabled as to let IPIs in. - * - * If the CPU gets woken up by a special wakeup, ensure that - * the SLW engine sets LPCR with decrementer bit cleared, else - * the CPU will come back to the kernel due to a spurious - * wakeup. - */ - lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; - pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); __ppc64_runlatch_off(); @@ -526,16 +513,6 @@ unsigned long pnv_cpu_offline(unsigned int cpu) __ppc64_runlatch_on(); - /* - * Re-enable decrementer interrupts in LPCR. - * - * Further, we want stop states to be woken up by decrementer - * for non-hotplug cases. So program the LPCR via stop api as - * well. 
- */ - lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1; - pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); - return srr1; } #endif diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 0d354e19ef92..db09c7022635 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "powernv.h" @@ -153,6 +154,7 @@ static void pnv_smp_cpu_kill_self(void) { unsigned int cpu; unsigned long srr1, wmask; + u64 lpcr_val; /* Standard hot unplug procedure */ /* @@ -174,6 +176,19 @@ static void pnv_smp_cpu_kill_self(void) if (cpu_has_feature(CPU_FTR_ARCH_207S)) wmask = SRR1_WAKEMASK_P8; + /* + * We don't want to take decrementer interrupts while we are + * offline, so clear LPCR:PECE1. We keep PECE2 (and + * LPCR_PECE_HVEE on P9) enabled so as to let IPIs in. + * + * If the CPU gets woken up by a special wakeup, ensure that + * the SLW engine sets LPCR with decrementer bit cleared, else + * the CPU will come back to the kernel due to a spurious + * wakeup. + */ + lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; + pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); + while (!generic_check_cpu_restart(cpu)) { /* * Clear IPI flag, since we don't handle IPIs while @@ -246,6 +261,16 @@ static void pnv_smp_cpu_kill_self(void) } + /* + * Re-enable decrementer interrupts in LPCR. + * + * Further, we want stop states to be woken up by decrementer + * for non-hotplug cases. So program the LPCR via stop api as + * well. + */ + lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1; + pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); + DBG("CPU%d coming online...\n", cpu); } From beb4f4722cf60d9f0803054dec4eb5025f2cf594 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 22 Jan 2019 13:54:57 +0000 Subject: [PATCH 109/200] powerpc/selftest: fix type of mftb() in null_syscall All callers of mftb() expect 'unsigned long', and the function itself only returns lower part of the TB so it really is 'unsigned long' not 'unsigned long long' Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/benchmarks/null_syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/benchmarks/null_syscall.c b/tools/testing/selftests/powerpc/benchmarks/null_syscall.c index ecc14d68e101..908de689a902 100644 --- a/tools/testing/selftests/powerpc/benchmarks/null_syscall.c +++ b/tools/testing/selftests/powerpc/benchmarks/null_syscall.c @@ -25,7 +25,7 @@ unsigned long long clock_frequency; unsigned long long timebase_frequency; double timebase_multiplier; -static inline unsigned long long mftb(void) +static inline unsigned long mftb(void) { unsigned long low; From 32ceaa6e128556ec67151a16d7c2c04580170db8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 13 Dec 2018 08:08:11 +0000 Subject: [PATCH 110/200] powerpc/8xx: hide itlbie and dtlbie symbols When disassembling InstructionTLBError we get the following messy code: c000138c: 7d 84 63 78 mr r4,r12 c0001390: 75 25 58 00 andis. r5,r9,22528 c0001394: 75 2a 40 00 andis. r10,r9,16384 c0001398: 41 a2 00 08 beq c00013a0 c000139c: 7c 00 22 64 tlbie r4,r0 c00013a0 : c00013a0: 39 40 04 01 li r10,1025 c00013a4: 91 4b 00 b0 stw r10,176(r11) c00013a8: 39 40 10 32 li r10,4146 c00013ac: 48 00 cc 59 bl c000e004 For a cleaner code dump, this patch replaces itlbie and dtlbie symbols by local symbols. c000138c: 7d 84 63 78 mr r4,r12 c0001390: 75 25 58 00 andis. r5,r9,22528 c0001394: 75 2a 40 00 andis. 
r10,r9,16384 c0001398: 41 a2 00 08 beq c00013a0 c000139c: 7c 00 22 64 tlbie r4,r0 c00013a0: 39 40 04 01 li r10,1025 c00013a4: 91 4b 00 b0 stw r10,176(r11) c00013a8: 39 40 10 32 li r10,4146 c00013ac: 48 00 cc 59 bl c000e004 Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_8xx.S | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 20cc816b3508..67cbae30ebf2 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -551,11 +551,11 @@ InstructionTLBError: mr r4,r12 andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */ andis. r10,r9,SRR1_ISI_NOPT@h - beq+ 1f + beq+ .Litlbie tlbie r4 -itlbie: /* 0x400 is InstructionAccess exception, needed by bad_page_fault() */ -1: EXC_XFER_LITE(0x400, handle_page_fault) +.Litlbie: + EXC_XFER_LITE(0x400, handle_page_fault) /* This is the data TLB error on the MPC8xx. This could be due to * many reasons, including a dirty update to a pte. We bail out to @@ -577,10 +577,10 @@ DARFixed:/* Return from dcbx instruction bug workaround */ stw r5,_DSISR(r11) mfspr r4,SPRN_DAR andis. r10,r5,DSISR_NOHPTE@h - beq+ 1f + beq+ .Ldtlbie tlbie r4 -dtlbie: -1: li r10,RPN_PATTERN +.Ldtlbie: + li r10,RPN_PATTERN mtspr SPRN_DAR,r10 /* Tag DAR, to be used in DTLB Error */ /* 0x300 is DataAccess exception, needed by bad_page_fault() */ EXC_XFER_LITE(0x300, handle_page_fault) @@ -603,8 +603,8 @@ DataBreakpoint: mtspr SPRN_SPRG_SCRATCH1, r11 mfcr r10 mfspr r11, SPRN_SRR0 - cmplwi cr0, r11, (dtlbie - PAGE_OFFSET)@l - cmplwi cr7, r11, (itlbie - PAGE_OFFSET)@l + cmplwi cr0, r11, (.Ldtlbie - PAGE_OFFSET)@l + cmplwi cr7, r11, (.Litlbie - PAGE_OFFSET)@l beq- cr0, 11f beq- cr7, 11f EXCEPTION_PROLOG_1 From e995265252fad3cf10cec6821ff0870cd95c4f08 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 18 Dec 2018 06:53:41 +0000 Subject: [PATCH 111/200] powerpc/setup: display reason for not booting When no machine description matches, display it clearly before looping forever. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index fa606aa98f6d..9a6a0859c1ef 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -634,7 +634,7 @@ void probe_machine(void) } /* What can we do if we didn't find ? */ if (machine_id >= &__machine_desc_end) { - DBG("No suitable machine found !\n"); + pr_err("No suitable machine description found !\n"); for (;;); } From ab44840df1dcd1a10c514aa2938c47f5b526fa37 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 22 Jan 2019 13:52:04 +0000 Subject: [PATCH 112/200] powerpc/32: Remove unneccessary MSR[RI] clearing for 8xx MSR[RI] has already been cleared a few lines above. 
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/entry_32.S | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 0768dfd8a64e..c2b66fbbf7f0 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -997,9 +997,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
 	.globl exc_exit_restart
 exc_exit_restart:
 	lwz	r12,_NIP(r1)
-#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
-	mtspr	SPRN_NRI, r0
-#endif
 	mtspr	SPRN_SRR0,r12
 	mtspr	SPRN_SRR1,r9
 	REST_4GPRS(9, r1)

From 0bbea75c476b77fa7d7811d6be911cc7583e640f Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 22 Jan 2019 14:11:24 +0000
Subject: [PATCH 113/200] powerpc/traps: fix recoverability of machine check
 handling on book3s/32

Looks like book3s/32 doesn't set RI on machine check, so checking RI
before calling die() will always be fatal, although this is not an
issue in most cases.

Fixes: b96672dd840f ("powerpc: Machine check interrupt is a non-maskable interrupt")
Fixes: daf00ae71dad ("powerpc/traps: restore recoverability of machine_check interrupts")
Signed-off-by: Christophe Leroy
Cc: stable@vger.kernel.org
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/traps.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 040b60293613..b25bc8af7d38 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -756,15 +756,15 @@ void machine_check_exception(struct pt_regs *regs)
 	if (check_io_access(regs))
 		goto bail;

-	/* Must die if the interrupt is not recoverable */
-	if (!(regs->msr & MSR_RI))
-		nmi_panic(regs, "Unrecoverable Machine check");
-
 	if (!nested)
 		nmi_exit();

 	die("Machine check", regs, SIGBUS);

+	/* Must die if the interrupt is not recoverable */
+	if (!(regs->msr & MSR_RI))
+		nmi_panic(regs, "Unrecoverable Machine check");
+
 	return;

 bail:

From 36da5ff0bea2dc67298150ead8d8471575c54c7d Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 25 Jan 2019 12:03:55 +0000
Subject: [PATCH 114/200] powerpc/83xx: Also save/restore SPRG4-7 during
 suspend

The 83xx has 8 SPRG registers and uses at least SPRG4 for DTLB handling
LRU.
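The offset changes in the hunk below are just the save area growing by
four registers; everything after SS_SPRG moves up by 4 * 4 = 0x10 bytes
(a quick arithmetic check of the new values):

	/*
	 * SS_DBAT:         0x90  + 0x10 = 0xa0
	 * SS_IBAT:         0xd0  + 0x10 = 0xe0
	 * SS_TB:           0x110 + 0x10 = 0x120
	 * SS_CR:           0x118 + 0x10 = 0x128
	 * SS_GPREG:        0x11c + 0x10 = 0x12c
	 * STATE_SAVE_SIZE: 0x16c + 0x10 = 0x17c
	 */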
Fixes: 2319f1239592 ("powerpc/mm: e300c2/c3/c4 TLB errata workaround") Cc: stable@vger.kernel.org Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/83xx/suspend-asm.S | 34 ++++++++++++++++++----- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/83xx/suspend-asm.S b/arch/powerpc/platforms/83xx/suspend-asm.S index 3d1ecd211776..8137f77abad5 100644 --- a/arch/powerpc/platforms/83xx/suspend-asm.S +++ b/arch/powerpc/platforms/83xx/suspend-asm.S @@ -26,13 +26,13 @@ #define SS_MSR 0x74 #define SS_SDR1 0x78 #define SS_LR 0x7c -#define SS_SPRG 0x80 /* 4 SPRGs */ -#define SS_DBAT 0x90 /* 8 DBATs */ -#define SS_IBAT 0xd0 /* 8 IBATs */ -#define SS_TB 0x110 -#define SS_CR 0x118 -#define SS_GPREG 0x11c /* r12-r31 */ -#define STATE_SAVE_SIZE 0x16c +#define SS_SPRG 0x80 /* 8 SPRGs */ +#define SS_DBAT 0xa0 /* 8 DBATs */ +#define SS_IBAT 0xe0 /* 8 IBATs */ +#define SS_TB 0x120 +#define SS_CR 0x128 +#define SS_GPREG 0x12c /* r12-r31 */ +#define STATE_SAVE_SIZE 0x17c .section .data .align 5 @@ -103,6 +103,16 @@ _GLOBAL(mpc83xx_enter_deep_sleep) stw r7, SS_SPRG+12(r3) stw r8, SS_SDR1(r3) + mfspr r4, SPRN_SPRG4 + mfspr r5, SPRN_SPRG5 + mfspr r6, SPRN_SPRG6 + mfspr r7, SPRN_SPRG7 + + stw r4, SS_SPRG+16(r3) + stw r5, SS_SPRG+20(r3) + stw r6, SS_SPRG+24(r3) + stw r7, SS_SPRG+28(r3) + mfspr r4, SPRN_DBAT0U mfspr r5, SPRN_DBAT0L mfspr r6, SPRN_DBAT1U @@ -493,6 +503,16 @@ mpc83xx_deep_resume: mtspr SPRN_IBAT7U, r6 mtspr SPRN_IBAT7L, r7 + lwz r4, SS_SPRG+16(r3) + lwz r5, SS_SPRG+20(r3) + lwz r6, SS_SPRG+24(r3) + lwz r7, SS_SPRG+28(r3) + + mtspr SPRN_SPRG4, r4 + mtspr SPRN_SPRG5, r5 + mtspr SPRN_SPRG6, r6 + mtspr SPRN_SPRG7, r7 + lwz r4, SS_SPRG+0(r3) lwz r5, SS_SPRG+4(r3) lwz r6, SS_SPRG+8(r3) From 40058337f23f79212f92ed5ef066e90a032905b1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 10:37:53 +0000 Subject: [PATCH 115/200] powerpc: simplify BDI switch There is no reason to re-read each time the pointer at location 0xf0 as it is fixed and known. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mmu.h | 2 ++ arch/powerpc/kernel/head_32.S | 5 ++--- arch/powerpc/kernel/head_40x.S | 5 ++--- arch/powerpc/kernel/head_8xx.S | 1 + arch/powerpc/mm/8xx_mmu.c | 7 ++----- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 25607604a7a5..6d22a8e78fe2 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -356,6 +356,8 @@ extern void early_init_mmu_secondary(void); extern void setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size); static inline void mmu_early_init_devtree(void) { } + +extern void *abatron_pteptrs[2]; #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 05b08db3901d..c2f564690778 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -1027,9 +1027,8 @@ _ENTRY(switch_mmu_context) * The PGDIR is passed as second argument. 
 */
 	lwz	r4,MM_PGD(r4)
-	lis	r5, KERNELBASE@h
-	lwz	r5, 0xf0(r5)
-	stw	r4, 0x4(r5)
+	lis	r5, abatron_pteptrs@ha
+	stw	r4, abatron_pteptrs@l + 0x4(r5)
 #endif
 	li	r4,0
 	isync
diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S
index b19d78410511..11dd09d0ce1a 100644
--- a/arch/powerpc/kernel/head_40x.S
+++ b/arch/powerpc/kernel/head_40x.S
@@ -953,9 +953,8 @@ _GLOBAL(set_context)
 	/* Context switch the PTE pointer for the Abatron BDI2000.
 	 * The PGDIR is the second parameter.
 	 */
-	lis	r5, KERNELBASE@h
-	lwz	r5, 0xf0(r5)
-	stw	r4, 0x4(r5)
+	lis	r5, abatron_pteptrs@ha
+	stw	r4, abatron_pteptrs@l + 0x4(r5)
 #endif
 	sync
 	mtspr	SPRN_PID,r3
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 67cbae30ebf2..7e14796bea81 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -989,5 +989,6 @@ swapper_pg_dir:
 /* Room for two PTE table poiners, usually the kernel and current user
  * pointer to their respective root page table (pgdir).
  */
+	.globl	abatron_pteptrs
 abatron_pteptrs:
 	.space	8
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c
index bfa503cff351..f12ec85e965c 100644
--- a/arch/powerpc/mm/8xx_mmu.c
+++ b/arch/powerpc/mm/8xx_mmu.c
@@ -162,14 +162,11 @@ void set_context(unsigned long id, pgd_t *pgd)
 {
 	s16 offset = (s16)(__pa(swapper_pg_dir));

-#ifdef CONFIG_BDI_SWITCH
-	pgd_t	**ptr = *(pgd_t ***)(KERNELBASE + 0xf0);
-
 	/* Context switch the PTE pointer for the Abatron BDI2000.
 	 * The PGDIR is passed as second argument.
 	 */
-	*(ptr + 1) = pgd;
-#endif
+	if (IS_ENABLED(CONFIG_BDI_SWITCH))
+		abatron_pteptrs[1] = pgd;

 	/* Register M_TWB will contain base address of level 1 table minus the
 	 * lower part of the kernel PGDIR base address, so that all accesses to

From 0df977eafc792a5365a7f81d8d5920132e03afad Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Thu, 21 Feb 2019 10:37:54 +0000
Subject: [PATCH 116/200] powerpc/6xx: Don't use SPRN_SPRG2 for storing stack
 pointer while in RTAS

When calling RTAS, the stack pointer is stored in SPRN_SPRG2 in order
to be able to restore it in case of machine check in RTAS.

As machine check is not a performance critical path, this patch frees
SPRN_SPRG2 by using a field in thread struct instead.
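In C terms the new scheme looks roughly like this (a sketch only; the
real check is done in assembly in the machine check prolog, and the
helper name here is hypothetical):

	/* Saved by enter_rtas(), cleared again on the way out. */
	current->thread.rtas_sp = exception_frame_addr;	/* illustrative name */

	/* Machine check entry: a non-zero rtas_sp means we interrupted
	 * RTAS, so r1 cannot be trusted and the saved frame is used. */
	if (current->thread.rtas_sp)
		use_saved_frame(current->thread.rtas_sp);	/* hypothetical helper */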
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/processor.h | 3 +++ arch/powerpc/include/asm/reg.h | 1 - arch/powerpc/kernel/asm-offsets.c | 3 +++ arch/powerpc/kernel/entry_32.S | 5 +++-- arch/powerpc/kernel/head_32.S | 22 ++++++++++++---------- 5 files changed, 21 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index ee58526cb6c2..e8682122ea3d 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -250,6 +250,9 @@ struct thread_struct { #ifdef CONFIG_PPC32 void *pgdir; /* root of page-table tree */ unsigned long ksp_limit; /* if ksp <= ksp_limit stack overflow */ +#ifdef CONFIG_PPC_RTAS + unsigned long rtas_sp; /* stack pointer for when in RTAS */ +#endif #endif /* Debug Registers */ struct debug_reg debug; diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 1c98ef1f2d5b..371ef6e8248e 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1167,7 +1167,6 @@ #ifdef CONFIG_PPC_BOOK3S_32 #define SPRN_SPRG_SCRATCH0 SPRN_SPRG0 #define SPRN_SPRG_SCRATCH1 SPRN_SPRG1 -#define SPRN_SPRG_RTAS SPRN_SPRG2 #define SPRN_SPRG_603_LRU SPRN_SPRG4 #endif diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 9ffc72ded73a..d6f9bdb1eb2e 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -93,6 +93,9 @@ int main(void) OFFSET(THREAD_INFO, task_struct, stack); DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16)); OFFSET(KSP_LIMIT, thread_struct, ksp_limit); +#ifdef CONFIG_PPC_RTAS + OFFSET(RTAS_SP, thread_struct, rtas_sp); +#endif #endif /* CONFIG_PPC64 */ #ifdef CONFIG_LIVEPATCH diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index c2b66fbbf7f0..6c671ceb5a06 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -1332,7 +1332,7 @@ _GLOBAL(enter_rtas) MTMSRD(r0) /* don't get trashed */ li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR) mtlr r6 - mtspr SPRN_SPRG_RTAS,r7 + stw r7, THREAD + RTAS_SP(r2) mtspr SPRN_SRR0,r8 mtspr SPRN_SRR1,r9 RFI @@ -1341,7 +1341,8 @@ _GLOBAL(enter_rtas) lwz r9,8(r9) /* original msr value */ addi r1,r1,INT_FRAME_SIZE li r0,0 - mtspr SPRN_SPRG_RTAS,r0 + tophys(r7, r2) + stw r0, THREAD + RTAS_SP(r7) mtspr SPRN_SRR0,r8 mtspr SPRN_SRR1,r9 RFI /* return to caller */ diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index c2f564690778..04128899a0a5 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -352,9 +352,8 @@ i##n: \ * registers that might have bad values includes all the GPRs * and all the BATs. We indicate that we are in RTAS by putting * a non-zero value, the address of the exception frame to use, - * in SPRG2. The machine check handler checks SPRG2 and uses its - * value if it is non-zero. If we ever needed to free up SPRG2, - * we could use a field in the thread_info or thread_struct instead. + * in thread.rtas_sp. The machine check handler checks thread.rtas_sp + * and uses its value if it is non-zero. * (Other exception handlers assume that r1 is a valid kernel stack * pointer when we take an exception from supervisor mode.) * -- paulus. 
@@ -365,16 +364,15 @@ i##n: \ mtspr SPRN_SPRG_SCRATCH1,r11 mfcr r10 #ifdef CONFIG_PPC_CHRP - mfspr r11,SPRN_SPRG_RTAS - cmpwi 0,r11,0 - bne 7f + mfspr r11, SPRN_SPRG_THREAD + lwz r11, RTAS_SP(r11) + cmpwi cr1, r11, 0 + bne cr1, 7f #endif /* CONFIG_PPC_CHRP */ EXCEPTION_PROLOG_1 7: EXCEPTION_PROLOG_2 addi r3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_PPC_CHRP - mfspr r4,SPRN_SPRG_RTAS - cmpwi cr1,r4,0 bne cr1,1f #endif EXC_XFER_STD(0x200, machine_check_exception) @@ -865,8 +863,10 @@ __secondary_start: tophys(r4,r2) addi r4,r4,THREAD /* phys address of our thread_struct */ mtspr SPRN_SPRG_THREAD,r4 +#ifdef CONFIG_PPC_RTAS li r3,0 - mtspr SPRN_SPRG_RTAS,r3 /* 0 => not in RTAS */ + stw r3, RTAS_SP(r4) /* 0 => not in RTAS */ +#endif /* enable MMU and jump to start_secondary */ li r4,MSR_KERNEL @@ -950,8 +950,10 @@ start_here: tophys(r4,r2) addi r4,r4,THREAD /* init task's THREAD */ mtspr SPRN_SPRG_THREAD,r4 +#ifdef CONFIG_PPC_RTAS li r3,0 - mtspr SPRN_SPRG_RTAS,r3 /* 0 => not in RTAS */ + stw r3, RTAS_SP(r4) /* 0 => not in RTAS */ +#endif /* stack */ lis r1,init_thread_union@ha From 93c4a162b014d238a287f8264adb25c009c79e61 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 10:37:55 +0000 Subject: [PATCH 117/200] powerpc/6xx: Store PGDIR physical address in a SPRG Use SPRN_SPRG2 to store the current thread PGDIR and avoid reading thread_struct.pgdir at every TLB miss. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/kernel/cpu_setup_6xx.S | 4 ++++ arch/powerpc/kernel/head_32.S | 25 ++++++++++++------------- arch/powerpc/mm/hash_low_32.S | 3 +-- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 371ef6e8248e..1f79e1d8fb0b 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1167,6 +1167,7 @@ #ifdef CONFIG_PPC_BOOK3S_32 #define SPRN_SPRG_SCRATCH0 SPRN_SPRG0 #define SPRN_SPRG_SCRATCH1 SPRN_SPRG1 +#define SPRN_SPRG_PGDIR SPRN_SPRG2 #define SPRN_SPRG_603_LRU SPRN_SPRG4 #endif diff --git a/arch/powerpc/kernel/cpu_setup_6xx.S b/arch/powerpc/kernel/cpu_setup_6xx.S index 8c069e96c478..6f1c11e0691f 100644 --- a/arch/powerpc/kernel/cpu_setup_6xx.S +++ b/arch/powerpc/kernel/cpu_setup_6xx.S @@ -24,6 +24,10 @@ BEGIN_MMU_FTR_SECTION li r10,0 mtspr SPRN_SPRG_603_LRU,r10 /* init SW LRU tracking */ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) + lis r10, (swapper_pg_dir - PAGE_OFFSET)@h + ori r10, r10, (swapper_pg_dir - PAGE_OFFSET)@l + mtspr SPRN_SPRG_PGDIR, r10 + BEGIN_FTR_SECTION bl __init_fpu_registers END_FTR_SECTION_IFCLR(CPU_FTR_FPU_UNAVAILABLE) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 04128899a0a5..2b0a26f66115 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -500,16 +500,15 @@ InstructionTLBMiss: mfspr r3,SPRN_IMISS lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2,SPRN_SPRG_THREAD + mfspr r2, SPRN_SPRG_PGDIR li r1,_PAGE_USER|_PAGE_PRESENT|_PAGE_EXEC /* low addresses tested as user */ - lwz r2,PGDIR(r2) bge- 112f mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ lis r2,swapper_pg_dir@ha /* if kernel address, use */ addi r2,r2,swapper_pg_dir@l /* kernel page table */ -112: tophys(r2,r2) - rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ + tophys(r2,r2) +112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get 
pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- InstructionAddressInvalid /* return if no mapping */ @@ -574,16 +573,15 @@ DataLoadTLBMiss: mfspr r3,SPRN_DMISS lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2,SPRN_SPRG_THREAD + mfspr r2, SPRN_SPRG_PGDIR li r1,_PAGE_USER|_PAGE_PRESENT /* low addresses tested as user */ - lwz r2,PGDIR(r2) bge- 112f mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ lis r2,swapper_pg_dir@ha /* if kernel address, use */ addi r2,r2,swapper_pg_dir@l /* kernel page table */ -112: tophys(r2,r2) - rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ + tophys(r2,r2) +112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- DataAddressInvalid /* return if no mapping */ @@ -658,16 +656,15 @@ DataStoreTLBMiss: mfspr r3,SPRN_DMISS lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2,SPRN_SPRG_THREAD + mfspr r2, SPRN_SPRG_PGDIR li r1,_PAGE_RW|_PAGE_USER|_PAGE_PRESENT /* access flags */ - lwz r2,PGDIR(r2) bge- 112f mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ lis r2,swapper_pg_dir@ha /* if kernel address, use */ addi r2,r2,swapper_pg_dir@l /* kernel page table */ -112: tophys(r2,r2) - rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ + tophys(r2,r2) +112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- DataAddressInvalid /* return if no mapping */ @@ -1024,14 +1021,16 @@ _ENTRY(switch_mmu_context) li r0,NUM_USER_SEGMENTS mtctr r0 + lwz r4, MM_PGD(r4) #ifdef CONFIG_BDI_SWITCH /* Context switch the PTE pointer for the Abatron BDI2000. * The PGDIR is passed as second argument. */ - lwz r4,MM_PGD(r4) lis r5, abatron_pteptrs@ha stw r4, abatron_pteptrs@l + 0x4(r5) #endif + tophys(r4, r4) + mtspr SPRN_SPRG_PGDIR, r4 li r4,0 isync 3: diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index 1e2df3e9f9ea..82e7dd0c0220 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -70,9 +70,8 @@ _GLOBAL(hash_page) /* Get PTE (linux-style) and check access */ lis r0,KERNELBASE@h /* check if kernel address */ cmplw 0,r4,r0 - mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ - lwz r5,PGDIR(r8) /* virt page-table root */ + mfspr r5, SPRN_SPRG_PGDIR /* virt page-table root */ blt+ 112f /* assume user more likely */ lis r5,swapper_pg_dir@ha /* if kernel address, use */ addi r5,r5,swapper_pg_dir@l /* kernel page table */ From 2c12393f577396a51b7e0537bd3eb29dcc26dc1b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 10:37:56 +0000 Subject: [PATCH 118/200] powerpc/603: use physical address directly in TLB miss handlers. Since commit c62ce9ef97ba ("powerpc: remove remaining bits from CONFIG_APUS"), tophys() has become a pure constant operation. PAGE_OFFSET is known at compile time so the physical address can be builtin directly. 
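The before/after instruction sequences keep the same shape; only the
constant changes, because PAGE_OFFSET is known at build time
(summarized from the hunks below):

	/*
	 * Before:  lis   r2, swapper_pg_dir@ha
	 *          addi  r2, r2, swapper_pg_dir@l
	 *          tophys(r2, r2)		; runtime r2 -= PAGE_OFFSET
	 *
	 * After:   lis   r2, (swapper_pg_dir - PAGE_OFFSET)@ha
	 *          addi  r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l
	 *					; subtraction folded at build time
	 */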
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.S | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 2b0a26f66115..4c2cc42399aa 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -505,9 +505,8 @@ InstructionTLBMiss: bge- 112f mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ - lis r2,swapper_pg_dir@ha /* if kernel address, use */ - addi r2,r2,swapper_pg_dir@l /* kernel page table */ - tophys(r2,r2) + lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ 112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ @@ -578,9 +577,8 @@ DataLoadTLBMiss: bge- 112f mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ - lis r2,swapper_pg_dir@ha /* if kernel address, use */ - addi r2,r2,swapper_pg_dir@l /* kernel page table */ - tophys(r2,r2) + lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ 112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ @@ -661,9 +659,8 @@ DataStoreTLBMiss: bge- 112f mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ - lis r2,swapper_pg_dir@ha /* if kernel address, use */ - addi r2,r2,swapper_pg_dir@l /* kernel page table */ - tophys(r2,r2) + lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ 112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ From 6790dae886f9a95018c52165171b905a8cc47588 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 10:37:57 +0000 Subject: [PATCH 119/200] powerpc/hash32: use physical address directly in hash handlers. Since commit c62ce9ef97ba ("powerpc: remove remaining bits from CONFIG_APUS"), tophys() has become a pure constant operation. PAGE_OFFSET is known at compile time so the physical address can be builtin directly. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hash_low_32.S | 62 +++++++++++++++-------------------- arch/powerpc/mm/ppc_mmu_32.c | 6 ++-- 2 files changed, 31 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index 82e7dd0c0220..d94fef524ef5 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -47,14 +47,13 @@ mmu_hash_lock: * Returns to the caller if the access is illegal or there is no * mapping for the address. Otherwise it places an appropriate PTE * in the hash table and returns from the exception. - * Uses r0, r3 - r8, r10, ctr, lr. + * Uses r0, r3 - r6, r8, r10, ctr, lr. 
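+ * (r7 is freed because the -KERNELBASE offset it used to carry is
+ * now folded into the symbol references at build time.)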
*/ .text _GLOBAL(hash_page) - tophys(r7,0) /* gets -KERNELBASE into r7 */ #ifdef CONFIG_SMP - addis r8,r7,mmu_hash_lock@h - ori r8,r8,mmu_hash_lock@l + lis r8, (mmu_hash_lock - PAGE_OFFSET)@h + ori r8, r8, (mmu_hash_lock - PAGE_OFFSET)@l lis r0,0x0fff b 10f 11: lwz r6,0(r8) @@ -76,7 +75,7 @@ _GLOBAL(hash_page) lis r5,swapper_pg_dir@ha /* if kernel address, use */ addi r5,r5,swapper_pg_dir@l /* kernel page table */ rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ -112: add r5,r5,r7 /* convert to phys addr */ +112: tophys(r5, r5) #ifndef CONFIG_PTE_64BIT rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ lwz r8,0(r5) /* get pmd entry */ @@ -143,25 +142,24 @@ retry: #ifdef CONFIG_SMP eieio - addis r8,r7,mmu_hash_lock@ha + lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha li r0,0 - stw r0,mmu_hash_lock@l(r8) + stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) #endif /* Return from the exception */ lwz r5,_CTR(r11) mtctr r5 lwz r0,GPR0(r11) - lwz r7,GPR7(r11) lwz r8,GPR8(r11) b fast_exception_return #ifdef CONFIG_SMP hash_page_out: eieio - addis r8,r7,mmu_hash_lock@ha + lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha li r0,0 - stw r0,mmu_hash_lock@l(r8) + stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) blr #endif /* CONFIG_SMP */ @@ -207,11 +205,9 @@ _GLOBAL(add_hash_page) SYNC_601 isync - tophys(r7,0) - #ifdef CONFIG_SMP - addis r6,r7,mmu_hash_lock@ha - addi r6,r6,mmu_hash_lock@l + lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l 10: lwarx r0,0,r6 /* take the mmu_hash_lock */ cmpi 0,r0,0 bne- 11f @@ -256,8 +252,8 @@ _GLOBAL(add_hash_page) 9: #ifdef CONFIG_SMP - addis r6,r7,mmu_hash_lock@ha - addi r6,r6,mmu_hash_lock@l + lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l eieio li r0,0 stw r0,0(r6) /* clear mmu_hash_lock */ @@ -277,10 +273,8 @@ _GLOBAL(add_hash_page) * It is designed to be called with the MMU either on or off. * r3 contains the VSID, r4 contains the virtual address, * r5 contains the linux PTE, r6 contains the old value of the - * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the - * offset to be added to addresses (0 if the MMU is on, - * -KERNELBASE if it is off). r10 contains the upper half of - * the PTE if CONFIG_PTE_64BIT. + * linux PTE (before setting _PAGE_HASHPTE). r10 contains the + * upper half of the PTE if CONFIG_PTE_64BIT. * On SMP, the caller should have the mmu_hash_lock held. * We assume that the caller has (or will) set the _PAGE_HASHPTE * bit in the linux PTE in memory. 
The value passed in r6 should @@ -341,7 +335,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) patch_site 1f, patch__hash_page_A1 patch_site 2f, patch__hash_page_A2 /* Get the address of the primary PTE group in the hash table (r3) */ -0: addis r0,r7,Hash_base@h /* base address of hash table */ +0: lis r0, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ 1: rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ 2: rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ xor r3,r3,r0 /* make primary hash */ @@ -355,10 +349,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) beq+ 10f /* no PTE: go look for an empty slot */ tlbie r4 - addis r4,r7,htab_hash_searches@ha - lwz r6,htab_hash_searches@l(r4) + lis r4, (htab_hash_searches - PAGE_OFFSET)@ha + lwz r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) addi r6,r6,1 /* count how many searches we do */ - stw r6,htab_hash_searches@l(r4) + stw r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ mtctr r0 @@ -390,10 +384,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) beq+ found_empty /* update counter of times that the primary PTEG is full */ - addis r4,r7,primary_pteg_full@ha - lwz r6,primary_pteg_full@l(r4) + lis r4, (primary_pteg_full - PAGE_OFFSET)@ha + lwz r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) addi r6,r6,1 - stw r6,primary_pteg_full@l(r4) + stw r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) patch_site 0f, patch__hash_page_C /* Search the secondary PTEG for an empty slot */ @@ -427,8 +421,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) * lockup here but that shouldn't happen */ -1: addis r4,r7,next_slot@ha /* get next evict slot */ - lwz r6,next_slot@l(r4) +1: lis r4, (next_slot - PAGE_OFFSET)@ha /* get next evict slot */ + lwz r6, (next_slot - PAGE_OFFSET)@l(r4) addi r6,r6,HPTE_SIZE /* search for candidate */ andi. r6,r6,7*HPTE_SIZE stw r6,next_slot@l(r4) @@ -500,8 +494,6 @@ htab_hash_searches: * We assume that there is a hash table in use (Hash != 0). 
*/ _GLOBAL(flush_hash_pages) - tophys(r7,0) - /* * We disable interrupts here, even on UP, because we want * the _PAGE_HASHPTE bit to be a reliable indication of @@ -546,10 +538,10 @@ _GLOBAL(flush_hash_pages) SET_V(r11) /* set V (valid) bit */ #ifdef CONFIG_SMP - addis r9,r7,mmu_hash_lock@ha - addi r9,r9,mmu_hash_lock@l + lis r9, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r9, r9, (mmu_hash_lock - PAGE_OFFSET)@l CURRENT_THREAD_INFO(r8, r1) - add r8,r8,r7 + tophys(r8, r8) lwz r8,TI_CPU(r8) oris r8,r8,9 10: lwarx r0,0,r9 @@ -583,7 +575,7 @@ _GLOBAL(flush_hash_pages) patch_site 1f, patch__flush_hash_A1 patch_site 2f, patch__flush_hash_A2 /* Get the address of the primary PTE group in the hash table (r3) */ -0: addis r8,r7,Hash_base@h /* base address of hash table */ +0: lis r8, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ 1: rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ 2: rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ xor r8,r0,r8 /* make primary hash */ diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index 3f4193201ee7..fb747bb0b3e4 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -231,7 +231,8 @@ void __init MMU_init_hw(void) if (lg_n_hpteg > 16) mb2 = 16 - LG_HPTEG_SIZE; - modify_instruction_site(&patch__hash_page_A0, 0xffff, (unsigned int)Hash >> 16); + modify_instruction_site(&patch__hash_page_A0, 0xffff, + ((unsigned int)Hash - PAGE_OFFSET) >> 16); modify_instruction_site(&patch__hash_page_A1, 0x7c0, mb << 6); modify_instruction_site(&patch__hash_page_A2, 0x7c0, mb2 << 6); modify_instruction_site(&patch__hash_page_B, 0xffff, hmask); @@ -240,7 +241,8 @@ void __init MMU_init_hw(void) /* * Patch up the instructions in hashtable.S:flush_hash_page */ - modify_instruction_site(&patch__flush_hash_A0, 0xffff, (unsigned int)Hash >> 16); + modify_instruction_site(&patch__flush_hash_A0, 0xffff, + ((unsigned int)Hash - PAGE_OFFSET) >> 16); modify_instruction_site(&patch__flush_hash_A1, 0x7c0, mb << 6); modify_instruction_site(&patch__flush_hash_A2, 0x7c0, mb2 << 6); modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask); From a8a121995b2e4f227fddc534c6bd5f1c02cbe2ee Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 10:37:58 +0000 Subject: [PATCH 120/200] powerpc/603: Don't handle kernel page TLB misses when not needed ITLB misses on kernel pages only occur with CONFIG_MODULES and CONFIG_DEBUG_PAGEALLOC.
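As a sketch of the reasoning (the configuration details below are my assumption, not spelled out in the patch):

	/*
	 * When can an ITLB miss hit a kernel address on a 603?
	 *  - CONFIG_MODULES: module text lives in vmalloc space, which is
	 *    not covered by the block mappings of lowmem;
	 *  - CONFIG_DEBUG_PAGEALLOC: block mappings are disabled, so even
	 *    kernel text is mapped page by page.
	 * In any other configuration, kernel text never reaches
	 * InstructionTLBMiss, so the kernel address check can be
	 * compiled out.
	 */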
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 4c2cc42399aa..e4338d785a94 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -498,15 +498,19 @@ InstructionTLBMiss: */ /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_IMISS +#if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC) lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 +#endif mfspr r2, SPRN_SPRG_PGDIR li r1,_PAGE_USER|_PAGE_PRESENT|_PAGE_EXEC /* low addresses tested as user */ +#if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC) bge- 112f mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ +#endif 112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ From 54a05a30c888d1037022661031bbd9195aee90f8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 10:37:59 +0000 Subject: [PATCH 121/200] powerpc/603: Don't handle _PAGE_RW and _PAGE_DIRTY on ITLB misses _PAGE_RW and _PAGE_DIRTY do not matter for ITLB misses. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.S | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index e4338d785a94..6db54425f1d9 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -526,13 +526,9 @@ InstructionTLBMiss: */ stw r0,0(r2) /* update PTE (accessed bit) */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwinm r1,r0,32-10,31,31 /* _PAGE_RW -> PP lsb */ - rlwinm r2,r0,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ - and r1,r1,r2 /* writable if _RW and _DIRTY */ rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ - rlwimi r0,r0,32-1,31,31 /* _PAGE_USER -> PP lsb */ - ori r1,r1,0xe04 /* clear out reserved bits */ - andc r1,r0,r1 /* PP = user? (rw&dirty? 2: 3): 0 */ + ori r1, r1, 0xe05 /* clear out reserved bits */ + andc r1, r0, r1 /* PP = user? 2 : 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) From f8b58c64eaef03946dbb1b100ba0255f72031c17 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 10:38:00 +0000 Subject: [PATCH 122/200] powerpc/603: let's handle PAGE_DIRTY directly PAGE_DIRTY corresponds to the C bit. When writing to a page whose C bit is not set, a DataStoreTLBMiss is generated. No need to check it in DataLoadTLBMiss.
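The resulting PP encoding on the load path, as a sketch (the meaning of the PP values is my annotation, following the usual 32-bit hash MMU convention):

	/*
	 * PP = user ? (rw ? 2 : 3) : 0
	 *   kernel page              -> PP = 00 (supervisor RW, no user access)
	 *   user page with _PAGE_RW  -> PP = 10 (read/write)
	 *   user page without it     -> PP = 11 (read-only)
	 * A store to a page whose C bit is still clear faults into
	 * DataStoreTLBMiss, which is where _PAGE_DIRTY comes into play.
	 */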
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.S | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 6db54425f1d9..b071f328b4b0 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -595,12 +595,10 @@ DataLoadTLBMiss: stw r0,0(r2) /* update PTE (accessed bit) */ /* Convert linux-style PTE to low word of PPC-style PTE */ rlwinm r1,r0,32-10,31,31 /* _PAGE_RW -> PP lsb */ - rlwinm r2,r0,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ - and r1,r1,r2 /* writable if _RW and _DIRTY */ rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ rlwimi r0,r0,32-1,31,31 /* _PAGE_USER -> PP lsb */ ori r1,r1,0xe04 /* clear out reserved bits */ - andc r1,r0,r1 /* PP = user? (rw&dirty? 2: 3): 0 */ + andc r1,r0,r1 /* PP = user? rw? 2: 3: 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -669,7 +667,7 @@ DataStoreTLBMiss: lwz r0,0(r2) /* get linux-style pte */ andc. r1,r1,r0 /* check access & ~permission */ bne- DataAddressInvalid /* return if access not permitted */ - ori r0,r0,_PAGE_ACCESSED|_PAGE_DIRTY + ori r0,r0,_PAGE_ACCESSED /* * NOTE! We are assuming this is not an SMP system, otherwise * we would need to update the pte atomically with lwarx/stwcx. From 451b3ec082c2fef31455261fb18d2676afb50c61 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 10:38:01 +0000 Subject: [PATCH 123/200] powerpc/603: Don't worry about _PAGE_USER in TLB miss handlers PP bits take user access into account, so no need to check _PAGE_USER here. A DSI or ISI will be generated if needed. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.S | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index b071f328b4b0..6b5cb7551a72 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -503,11 +503,9 @@ InstructionTLBMiss: cmplw 0,r1,r3 #endif mfspr r2, SPRN_SPRG_PGDIR - li r1,_PAGE_USER|_PAGE_PRESENT|_PAGE_EXEC /* low addresses tested as user */ + li r1,_PAGE_PRESENT | _PAGE_EXEC #if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC) bge- 112f - mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ - rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ #endif @@ -573,10 +571,8 @@ DataLoadTLBMiss: lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 mfspr r2, SPRN_SPRG_PGDIR - li r1,_PAGE_USER|_PAGE_PRESENT /* low addresses tested as user */ + li r1, _PAGE_PRESENT bge- 112f - mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ - rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ 112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ @@ -653,10 +649,8 @@ DataStoreTLBMiss: lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 mfspr r2, SPRN_SPRG_PGDIR - li r1,_PAGE_RW|_PAGE_USER|_PAGE_PRESENT /* access flags */ + li r1, _PAGE_RW | _PAGE_PRESENT /* access flags */ bge- 112f - mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ - rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ lis 
r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ 112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ From 84de6ab0e904d058cc221af564fa44636a6c2c59 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 10:38:02 +0000 Subject: [PATCH 124/200] powerpc/603: don't handle PAGE_ACCESSED in TLB miss handlers. PAGE_ACCESSED is only needed for CONFIG_SWAP. When CONFIG_SWAP is not set, just ignore it. If CONFIG_SWAP is set and PAGE_ACCESSED is not, let's take a minor fault. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.S | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 6b5cb7551a72..fdb587c96a80 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -503,7 +503,11 @@ InstructionTLBMiss: cmplw 0,r1,r3 #endif mfspr r2, SPRN_SPRG_PGDIR +#ifdef CONFIG_SWAP + li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC +#else li r1,_PAGE_PRESENT | _PAGE_EXEC +#endif #if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC) bge- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ @@ -517,12 +521,6 @@ InstructionTLBMiss: lwz r0,0(r2) /* get linux-style pte */ andc. r1,r1,r0 /* check access & ~permission */ bne- InstructionAddressInvalid /* return if access not permitted */ - ori r0,r0,_PAGE_ACCESSED /* set _PAGE_ACCESSED in pte */ - /* - * NOTE! We are assuming this is not an SMP system, otherwise - * we would need to update the pte atomically with lwarx/stwcx. - */ - stw r0,0(r2) /* update PTE (accessed bit) */ /* Convert linux-style PTE to low word of PPC-style PTE */ rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ ori r1, r1, 0xe05 /* clear out reserved bits */ @@ -571,7 +569,11 @@ DataLoadTLBMiss: lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 mfspr r2, SPRN_SPRG_PGDIR +#ifdef CONFIG_SWAP + li r1, _PAGE_PRESENT | _PAGE_ACCESSED +#else li r1, _PAGE_PRESENT +#endif bge- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ @@ -583,12 +585,10 @@ DataLoadTLBMiss: lwz r0,0(r2) /* get linux-style pte */ andc. r1,r1,r0 /* check access & ~permission */ bne- DataAddressInvalid /* return if access not permitted */ - ori r0,r0,_PAGE_ACCESSED /* set _PAGE_ACCESSED in pte */ /* * NOTE! We are assuming this is not an SMP system, otherwise * we would need to update the pte atomically with lwarx/stwcx. */ - stw r0,0(r2) /* update PTE (accessed bit) */ /* Convert linux-style PTE to low word of PPC-style PTE */ rlwinm r1,r0,32-10,31,31 /* _PAGE_RW -> PP lsb */ rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ @@ -649,7 +649,11 @@ DataStoreTLBMiss: lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 mfspr r2, SPRN_SPRG_PGDIR - li r1, _PAGE_RW | _PAGE_PRESENT /* access flags */ +#ifdef CONFIG_SWAP + li r1, _PAGE_RW | _PAGE_PRESENT | _PAGE_ACCESSED +#else + li r1, _PAGE_RW | _PAGE_PRESENT +#endif bge- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ @@ -661,12 +665,10 @@ DataStoreTLBMiss: lwz r0,0(r2) /* get linux-style pte */ andc. r1,r1,r0 /* check access & ~permission */ bne- DataAddressInvalid /* return if access not permitted */ - ori r0,r0,_PAGE_ACCESSED /* * NOTE! 
We are assuming this is not an SMP system, otherwise * we would need to update the pte atomically with lwarx/stwcx. */ - stw r0,0(r2) /* update PTE (accessed/dirty bits) */ /* Convert linux-style PTE to low word of PPC-style PTE */ rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ li r1,0xe05 /* clear out reserved bits & PP lsb */ andc r1,r0,r1 /* PP = user? 2: 0 */ From 78ca1108b10927b3d068c8da91352b0f4cd01fc5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 25 Jan 2019 12:34:20 +0000 Subject: [PATCH 125/200] powerpc/book3s32: Reorder _PAGE_XXX flags to simplify TLB handling For pages without _PAGE_USER, PP field is 00. For pages with _PAGE_USER, PP field is 10 for RW and 11 for RO. This patch sets _PAGE_USER to 0x002 and _PAGE_RW to 0x001 in order to simplify TLB handling by reducing the number of shifts. The location of _PAGE_PRESENT and _PAGE_HASHPTE doesn't matter as they are only SW related flags. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/hash.h | 8 ++++---- arch/powerpc/kernel/head_32.S | 5 +---- arch/powerpc/mm/hash_low_32.S | 6 ++---- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/hash.h b/arch/powerpc/include/asm/book3s/32/hash.h index 2a0a467d2985..a5907ea4fb40 100644 --- a/arch/powerpc/include/asm/book3s/32/hash.h +++ b/arch/powerpc/include/asm/book3s/32/hash.h @@ -17,9 +17,9 @@ * updating the accessed and modified bits in the page table tree. */ -#define _PAGE_PRESENT 0x001 /* software: pte contains a translation */ -#define _PAGE_HASHPTE 0x002 /* hash_page has made an HPTE for this pte */ -#define _PAGE_USER 0x004 /* usermode access allowed */ +#define _PAGE_RW 0x001 /* PP = x1: user write access allowed */ +#define _PAGE_USER 0x002 /* PP = 1x: usermode access allowed */ +#define _PAGE_HASHPTE 0x004 /* software: hash_page has made an HPTE for this pte */ #define _PAGE_GUARDED 0x008 /* G: prohibit speculative access */ #define _PAGE_COHERENT 0x010 /* M: enforce memory coherence (SMP systems) */ #define _PAGE_NO_CACHE 0x020 /* I: cache inhibit */ @@ -27,7 +27,7 @@ #define _PAGE_DIRTY 0x080 /* C: page changed */ #define _PAGE_ACCESSED 0x100 /* R: page referenced */ #define _PAGE_EXEC 0x200 /* software: exec allowed */ -#define _PAGE_RW 0x400 /* software: user write access allowed */ +#define _PAGE_PRESENT 0x400 /* software: pte contains a translation */ #define _PAGE_SPECIAL 0x800 /* software: Special page */ #ifdef CONFIG_PTE_64BIT diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index fdb587c96a80..e7a5b312a7db 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -522,7 +522,6 @@ InstructionTLBMiss: andc. r1,r1,r0 /* check access & ~permission */ bne- InstructionAddressInvalid /* return if access not permitted */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ ori r1, r1, 0xe05 /* clear out reserved bits */ andc r1, r0, r1 /* PP = user? 2 : 0 */ BEGIN_FTR_SECTION @@ -590,8 +589,7 @@ DataLoadTLBMiss: * we would need to update the pte atomically with lwarx/stwcx. */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwinm r1,r0,32-10,31,31 /* _PAGE_RW -> PP lsb */ - rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ + rlwinm r1, r0, 0, 31, 31 /* _PAGE_RW -> PP lsb */ rlwimi r0,r0,32-1,31,31 /* _PAGE_USER -> PP lsb */ ori r1,r1,0xe04 /* clear out reserved bits */ andc r1,r0,r1 /* PP = user? rw?
2: 3: 0 */ @@ -670,7 +668,6 @@ DataStoreTLBMiss: * we would need to update the pte atomically with lwarx/stwcx. */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ li r1,0xe05 /* clear out reserved bits & PP lsb */ andc r1,r0,r1 /* PP = user? 2: 0 */ BEGIN_FTR_SECTION diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index d94fef524ef5..f4294edeca9d 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -310,11 +310,9 @@ Hash_msk = (((1 << Hash_bits) - 1) * 64) _GLOBAL(create_hpte) /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ - rlwinm r8,r5,32-10,31,31 /* _PAGE_RW -> PP lsb */ rlwinm r0,r5,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ - and r8,r8,r0 /* writable if _RW & _DIRTY */ - rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ - rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ + and r8, r5, r0 /* writable if _RW & _DIRTY */ + rlwimi r5, r5, 32 - 1, 31, 31 /* _PAGE_USER -> PP lsb */ ori r8,r8,0xe04 /* clear out reserved bits */ andc r8,r5,r8 /* PP = user? (rw&dirty? 2: 3): 0 */ BEGIN_FTR_SECTION From 6b9166f0786e6f24da9fb198bcf70ce131c88ee1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 31 Jan 2019 10:10:31 +0000 Subject: [PATCH 126/200] powerpc/32: Fix CONFIG_VIRT_CPU_ACCOUNTING_NATIVE for 40x/booke 40x/booke have another path to reach 3f from transfer_to_handler; make sure it also calls ACCOUNT_CPU_USER_ENTRY() when CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is selected. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 6c671ceb5a06..0c6977376233 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -166,6 +166,13 @@ transfer_to_handler: internal debug mode bit to do this. */ lwz r12,THREAD_DBCR0(r12) andis. r12,r12,DBCR0_IDM@h +#endif +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + CURRENT_THREAD_INFO(r9, r1) + tophys(r9, r9) + ACCOUNT_CPU_USER_ENTRY(r9, r11, r12) +#endif +#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) beq+ 3f /* From user and task is ptraced - load up global dbcr0 */ li r12,-1 /* clear all pending debug events */ @@ -185,11 +192,6 @@ transfer_to_handler: addi r12,r12,-1 stw r12,4(r11) #endif -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - CURRENT_THREAD_INFO(r9, r1) - tophys(r9, r9) - ACCOUNT_CPU_USER_ENTRY(r9, r11, r12) -#endif b 3f From fa7b9a805c797b729022029aaa3a2b7c35fff4c6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 8 Feb 2019 15:02:55 +0000 Subject: [PATCH 127/200] tools/selftest/vm: allow choosing mem size and page size in map_hugetlb map_hugetlb maps 256Mbytes of memory with the default hugepage size. This patch allows the user to pass the size and page shift as arguments in order to use a different size and page size.
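Hypothetical invocations (the binary name and the values are examples, not part of the patch):

	./map_hugetlb		# 256MB with the default hugepage size
	./map_hugetlb 8		# 8MB with the default hugepage size
	./map_hugetlb 8 19	# 8MB backed by 512kB hugepages (shift 19)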
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- tools/testing/selftests/vm/map_hugetlb.c | 29 ++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/map_hugetlb.c b/tools/testing/selftests/vm/map_hugetlb.c index 9b777fa95f09..5a2d7b8efc40 100644 --- a/tools/testing/selftests/vm/map_hugetlb.c +++ b/tools/testing/selftests/vm/map_hugetlb.c @@ -23,6 +23,14 @@ #define MAP_HUGETLB 0x40000 /* arch specific */ #endif +#ifndef MAP_HUGE_SHIFT +#define MAP_HUGE_SHIFT 26 +#endif + +#ifndef MAP_HUGE_MASK +#define MAP_HUGE_MASK 0x3f +#endif + /* Only ia64 requires this */ #ifdef __ia64__ #define ADDR (void *)(0x8000000000000000UL) @@ -58,12 +66,29 @@ static int read_bytes(char *addr) return 0; } -int main(void) +int main(int argc, char **argv) { void *addr; int ret; + size_t length = LENGTH; + int flags = FLAGS; + int shift = 0; - addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, -1, 0); + if (argc > 1) + length = atol(argv[1]) << 20; + if (argc > 2) { + shift = atoi(argv[2]); + if (shift) + flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + } + + if (shift) + printf("%u kB hugepages\n", 1 << (shift - 10)); + else + printf("Default size hugepages\n"); + printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20); + + addr = mmap(ADDR, length, PROTECTION, flags, -1, 0); if (addr == MAP_FAILED) { perror("mmap"); exit(1); From cabe8138b23c92e851b92c6a20ffb13598ff34d4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 18 Feb 2019 12:25:20 +0000 Subject: [PATCH 128/200] powerpc: dump as a single line areas mapping a single physical page. When using KASAN, there are parts of the shadow area where all pages are mapped to the kasan_early_shadow_page. It is pointless to dump one line for each of those pages (in the example below there are 7168 entries pointing to the same physical page). ~# cat /sys/kernel/debug/kernel_page_tables ... ---[ kasan shadow mem start ]--- 0xf7c00000-0xf8bfffff 0x06fac000 16M rw present dirty accessed 0xf8c00000-0xf8c03fff 0x00cd0000 16K r present dirty accessed 0xf8c04000-0xf8c07fff 0x00cd0000 16K r present dirty accessed 0xf8c08000-0xf8c0bfff 0x00cd0000 16K r present dirty accessed 0xf8c0c000-0xf8c0ffff 0x00cd0000 16K r present dirty accessed 0xf8c10000-0xf8c13fff 0x00cd0000 16K r present dirty accessed ... 7168 identical lines 0xffbfc000-0xffbfffff 0x00cd0000 16K r present dirty accessed ---[ kasan shadow mem end ]--- ... This patch modifies the linux page table dump to dump as a single line areas where all addresses point to the same physical page. That physical address is put inside [] to show that all virt pages point to the same phys page. ~# cat /sys/kernel/debug/kernel_page_tables ... ---[ kasan shadow mem start ]--- 0xf7c00000-0xf8bfffff 0x06fac000 16M rw present dirty accessed 0xf8c00000-0xffbfffff [0x00cd0000] 16K r present dirty accessed ---[ kasan shadow mem end ]--- ...
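The folding rule, restated as a sketch in C (the identifiers are those of the diff below; the restatement is mine):

	/* a range keeps growing while the physical pages stay contiguous,
	 * or while every page in it has mapped one single physical page */
	bool contiguous = (pa == st->last_pa + PAGE_SIZE);
	bool same_page  = (pa == st->start_pa && st->start_pa == st->last_pa);
	/* note_page() only breaks the range when neither condition holds
	 * (or when flags, level or markers change) */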
Signed-off-by: Christophe Leroy Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/dump_linuxpagetables.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c index 6aa41669ac1a..b0da447197d4 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.c +++ b/arch/powerpc/mm/dump_linuxpagetables.c @@ -143,14 +143,19 @@ static void dump_addr(struct pg_state *st, unsigned long addr) unsigned long delta; #ifdef CONFIG_PPC64 - seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1); - seq_printf(st->seq, "0x%016lx ", st->start_pa); +#define REG "0x%016lx" #else - seq_printf(st->seq, "0x%08lx-0x%08lx ", st->start_address, addr - 1); - seq_printf(st->seq, "0x%08lx ", st->start_pa); +#define REG "0x%08lx" #endif - delta = (addr - st->start_address) >> 10; + seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1); + if (st->start_pa == st->last_pa && st->start_address + PAGE_SIZE != addr) { + seq_printf(st->seq, "[" REG "]", st->start_pa); + delta = PAGE_SIZE >> 10; + } else { + seq_printf(st->seq, " " REG " ", st->start_pa); + delta = (addr - st->start_address) >> 10; + } /* Work out what appropriate unit to use */ while (!(delta & 1023) && unit[1]) { delta >>= 10; @@ -184,7 +189,8 @@ static void note_page(struct pg_state *st, unsigned long addr, */ } else if (flag != st->current_flags || level != st->level || addr >= st->marker[1].start_address || - pa != st->last_pa + PAGE_SIZE) { + (pa != st->last_pa + PAGE_SIZE && + (pa != st->start_pa || st->start_pa != st->last_pa))) { /* Check the PTE flags */ if (st->current_flags) { From c3c7470c75566a077c8dc71dcf8f1948b8ddfab4 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 22 Feb 2019 13:22:08 +1100 Subject: [PATCH 129/200] powerpc/kvm: Save and restore host AMR/IAMR/UAMOR When the hash MMU is active the AMR, IAMR and UAMOR are used for pkeys. The AMR is directly writable by user space, and the UAMOR masks those writes, meaning both registers are effectively user register state. The IAMR is used to create an execute only key. Also we must maintain the value of at least the AMR when running in process context, so that any memory accesses done by the kernel on behalf of the process are correctly controlled by the AMR. Although we are correctly switching all registers when going into a guest, on returning to the host we just write 0 into all regs, except on Power9 where we restore the IAMR correctly. This could be observed by a user process if it writes the AMR, then runs a guest and we then return immediately to it without rescheduling. Because we have written 0 to the AMR that would have the effect of granting read/write permission to pages that the process was trying to protect. In addition, when using the Radix MMU, the AMR can prevent inadvertent kernel access to userspace data, writing 0 to the AMR disables that protection. So save and restore AMR, IAMR and UAMOR. 
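A minimal userspace sketch of how this could be observed (hypothetical reproducer, not part of the patch; assumes protection key support):

	#include <sys/mman.h>

	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);

	pkey_mprotect(p, 4096, PROT_READ | PROT_WRITE, pkey);
	/* ... enter and leave a guest on this thread (KVM_RUN) ... */
	*p = 1;		/* expected to fault, but succeeds once the
			   host AMR has been reset to 0 */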
Fixes: cf43d3b26452 ("powerpc: Enable pkey subsystem") Cc: stable@vger.kernel.org # v4.16+ Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman Acked-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 26 ++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index f24f6a2f8eb5..25043b50cb30 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -58,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #define STACK_SLOT_DAWR (SFS-56) #define STACK_SLOT_DAWRX (SFS-64) #define STACK_SLOT_HFSCR (SFS-72) +#define STACK_SLOT_AMR (SFS-80) +#define STACK_SLOT_UAMOR (SFS-88) /* the following is used by the P9 short path */ #define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */ @@ -726,11 +728,9 @@ BEGIN_FTR_SECTION mfspr r5, SPRN_TIDR mfspr r6, SPRN_PSSCR mfspr r7, SPRN_PID - mfspr r8, SPRN_IAMR std r5, STACK_SLOT_TID(r1) std r6, STACK_SLOT_PSSCR(r1) std r7, STACK_SLOT_PID(r1) - std r8, STACK_SLOT_IAMR(r1) mfspr r5, SPRN_HFSCR std r5, STACK_SLOT_HFSCR(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) @@ -738,11 +738,18 @@ BEGIN_FTR_SECTION mfspr r5, SPRN_CIABR mfspr r6, SPRN_DAWR mfspr r7, SPRN_DAWRX + mfspr r8, SPRN_IAMR std r5, STACK_SLOT_CIABR(r1) std r6, STACK_SLOT_DAWR(r1) std r7, STACK_SLOT_DAWRX(r1) + std r8, STACK_SLOT_IAMR(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mfspr r5, SPRN_AMR + std r5, STACK_SLOT_AMR(r1) + mfspr r6, SPRN_UAMOR + std r6, STACK_SLOT_UAMOR(r1) + BEGIN_FTR_SECTION /* Set partition DABR */ /* Do this before re-enabling PMU to avoid P7 DABR corruption bug */ @@ -1631,22 +1638,25 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) mtspr SPRN_PSPB, r0 mtspr SPRN_WORT, r0 BEGIN_FTR_SECTION - mtspr SPRN_IAMR, r0 mtspr SPRN_TCSCR, r0 /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */ li r0, 1 sldi r0, r0, 31 mtspr SPRN_MMCRS, r0 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) -8: - /* Save and reset AMR and UAMOR before turning on the MMU */ + /* Save and restore AMR, IAMR and UAMOR before turning on the MMU */ + ld r8, STACK_SLOT_IAMR(r1) + mtspr SPRN_IAMR, r8 + +8: /* Power7 jumps back in here */ mfspr r5,SPRN_AMR mfspr r6,SPRN_UAMOR std r5,VCPU_AMR(r9) std r6,VCPU_UAMOR(r9) - li r6,0 - mtspr SPRN_AMR,r6 + ld r5,STACK_SLOT_AMR(r1) + ld r6,STACK_SLOT_UAMOR(r1) + mtspr SPRN_AMR, r5 mtspr SPRN_UAMOR, r6 /* Switch DSCR back to host value */ @@ -1746,11 +1756,9 @@ BEGIN_FTR_SECTION ld r5, STACK_SLOT_TID(r1) ld r6, STACK_SLOT_PSSCR(r1) ld r7, STACK_SLOT_PID(r1) - ld r8, STACK_SLOT_IAMR(r1) mtspr SPRN_TIDR, r5 mtspr SPRN_PSSCR, r6 mtspr SPRN_PID, r7 - mtspr SPRN_IAMR, r8 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) #ifdef CONFIG_PPC_RADIX_MMU From e66c3209c7fd17209ccc4cbbee8b1b1bd5c438dd Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 18 Feb 2019 12:28:36 +0000 Subject: [PATCH 130/200] powerpc: Move page table dump files into a dedicated subdirectory This patch moves the files related to page table dump into a dedicated subdirectory. The purpose is to clean up arch/powerpc/mm a bit by regrouping multiple files handling a dedicated function.
Signed-off-by: Christophe Leroy [mpe: Shorten the file names while we're at it] Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig.debug | 4 ---- arch/powerpc/mm/Makefile | 10 +--------- .../mm/{dump_linuxpagetables-8xx.c => ptdump/8xx.c} | 2 +- arch/powerpc/mm/ptdump/Makefile | 9 +++++++++ arch/powerpc/mm/{dump_bats.c => ptdump/bats.c} | 0 .../book3s64.c} | 2 +- .../{dump_hashpagetable.c => ptdump/hashpagetable.c} | 0 .../mm/{dump_linuxpagetables.c => ptdump/ptdump.c} | 2 +- .../mm/{dump_linuxpagetables.h => ptdump/ptdump.h} | 0 arch/powerpc/mm/{dump_sr.c => ptdump/segment_regs.c} | 0 .../shared.c} | 2 +- 11 files changed, 14 insertions(+), 17 deletions(-) rename arch/powerpc/mm/{dump_linuxpagetables-8xx.c => ptdump/8xx.c} (97%) create mode 100644 arch/powerpc/mm/ptdump/Makefile rename arch/powerpc/mm/{dump_bats.c => ptdump/bats.c} (100%) rename arch/powerpc/mm/{dump_linuxpagetables-book3s64.c => ptdump/book3s64.c} (98%) rename arch/powerpc/mm/{dump_hashpagetable.c => ptdump/hashpagetable.c} (100%) rename arch/powerpc/mm/{dump_linuxpagetables.c => ptdump/ptdump.c} (99%) rename arch/powerpc/mm/{dump_linuxpagetables.h => ptdump/ptdump.h} (100%) rename arch/powerpc/mm/{dump_sr.c => ptdump/segment_regs.c} (100%) rename arch/powerpc/mm/{dump_linuxpagetables-generic.c => ptdump/shared.c} (97%) diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index f4961fbcb48d..4e00cb0a5464 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -361,10 +361,6 @@ config PPC_PTDUMP If you are unsure, say N. -config PPC_HTDUMP - def_bool y - depends on PPC_PTDUMP && PPC_BOOK3S_64 - config PPC_FAST_ENDIAN_SWITCH bool "Deprecated fast endian-switch syscall" depends on DEBUG_KERNEL && PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index f965fc33a8b7..ee1efa3b3382 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -45,13 +45,5 @@ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o -obj-$(CONFIG_PPC_PTDUMP) += dump_linuxpagetables.o -ifdef CONFIG_PPC_PTDUMP -obj-$(CONFIG_4xx) += dump_linuxpagetables-generic.o -obj-$(CONFIG_PPC_8xx) += dump_linuxpagetables-8xx.o -obj-$(CONFIG_PPC_BOOK3E_MMU) += dump_linuxpagetables-generic.o -obj-$(CONFIG_PPC_BOOK3S_32) += dump_linuxpagetables-generic.o dump_bats.o dump_sr.o -obj-$(CONFIG_PPC_BOOK3S_64) += dump_linuxpagetables-book3s64.o -endif -obj-$(CONFIG_PPC_HTDUMP) += dump_hashpagetable.o +obj-$(CONFIG_PPC_PTDUMP) += ptdump/ obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o diff --git a/arch/powerpc/mm/dump_linuxpagetables-8xx.c b/arch/powerpc/mm/ptdump/8xx.c similarity index 97% rename from arch/powerpc/mm/dump_linuxpagetables-8xx.c rename to arch/powerpc/mm/ptdump/8xx.c index ab9e3f24db2f..9e2d8e847d6e 100644 --- a/arch/powerpc/mm/dump_linuxpagetables-8xx.c +++ b/arch/powerpc/mm/ptdump/8xx.c @@ -7,7 +7,7 @@ #include #include -#include "dump_linuxpagetables.h" +#include "ptdump.h" static const struct flag_info flag_array[] = { { diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile new file mode 100644 index 000000000000..712762be3cb1 --- /dev/null +++ b/arch/powerpc/mm/ptdump/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += ptdump.o + +obj-$(CONFIG_4xx) += shared.o +obj-$(CONFIG_PPC_8xx) += 8xx.o +obj-$(CONFIG_PPC_BOOK3E_MMU) += shared.o +obj-$(CONFIG_PPC_BOOK3S_32) += shared.o bats.o segment_regs.o 
+obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o hashpagetable.o diff --git a/arch/powerpc/mm/dump_bats.c b/arch/powerpc/mm/ptdump/bats.c similarity index 100% rename from arch/powerpc/mm/dump_bats.c rename to arch/powerpc/mm/ptdump/bats.c diff --git a/arch/powerpc/mm/dump_linuxpagetables-book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c similarity index 98% rename from arch/powerpc/mm/dump_linuxpagetables-book3s64.c rename to arch/powerpc/mm/ptdump/book3s64.c index ed6fcf78256e..0dfca72cb9bd 100644 --- a/arch/powerpc/mm/dump_linuxpagetables-book3s64.c +++ b/arch/powerpc/mm/ptdump/book3s64.c @@ -7,7 +7,7 @@ #include #include -#include "dump_linuxpagetables.h" +#include "ptdump.h" static const struct flag_info flag_array[] = { { diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c similarity index 100% rename from arch/powerpc/mm/dump_hashpagetable.c rename to arch/powerpc/mm/ptdump/hashpagetable.c diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/ptdump/ptdump.c similarity index 99% rename from arch/powerpc/mm/dump_linuxpagetables.c rename to arch/powerpc/mm/ptdump/ptdump.c index b0da447197d4..37138428ab55 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -28,7 +28,7 @@ #include #include -#include "dump_linuxpagetables.h" +#include "ptdump.h" #ifdef CONFIG_PPC32 #define KERN_VIRT_START 0 diff --git a/arch/powerpc/mm/dump_linuxpagetables.h b/arch/powerpc/mm/ptdump/ptdump.h similarity index 100% rename from arch/powerpc/mm/dump_linuxpagetables.h rename to arch/powerpc/mm/ptdump/ptdump.h diff --git a/arch/powerpc/mm/dump_sr.c b/arch/powerpc/mm/ptdump/segment_regs.c similarity index 100% rename from arch/powerpc/mm/dump_sr.c rename to arch/powerpc/mm/ptdump/segment_regs.c diff --git a/arch/powerpc/mm/dump_linuxpagetables-generic.c b/arch/powerpc/mm/ptdump/shared.c similarity index 97% rename from arch/powerpc/mm/dump_linuxpagetables-generic.c rename to arch/powerpc/mm/ptdump/shared.c index 3fe98a0974c6..f7ed2f187cb0 100644 --- a/arch/powerpc/mm/dump_linuxpagetables-generic.c +++ b/arch/powerpc/mm/ptdump/shared.c @@ -7,7 +7,7 @@ #include #include -#include "dump_linuxpagetables.h" +#include "ptdump.h" static const struct flag_info flag_array[] = { { From f68e7927212fa0dbe44c00c144b643c87ab0cf43 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sat, 23 Feb 2019 20:30:50 +1100 Subject: [PATCH 131/200] Revert "powerpc/book3s32: Reorder _PAGE_XXX flags to simplify TLB handling" This reverts commit 78ca1108b10927b3d068c8da91352b0f4cd01fc5. It is causing boot failures with qemu mac99 in at least some configurations. --- arch/powerpc/include/asm/book3s/32/hash.h | 8 ++++---- arch/powerpc/kernel/head_32.S | 5 ++++- arch/powerpc/mm/hash_low_32.S | 6 ++++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/hash.h b/arch/powerpc/include/asm/book3s/32/hash.h index a5907ea4fb40..2a0a467d2985 100644 --- a/arch/powerpc/include/asm/book3s/32/hash.h +++ b/arch/powerpc/include/asm/book3s/32/hash.h @@ -17,9 +17,9 @@ * updating the accessed and modified bits in the page table tree. 
*/ -#define _PAGE_RW 0x001 /* PP = x1: user write access allowed */ -#define _PAGE_USER 0x002 /* PP = 1x: usermode access allowed */ -#define _PAGE_HASHPTE 0x004 /* software: hash_page has made an HPTE for this pte */ +#define _PAGE_PRESENT 0x001 /* software: pte contains a translation */ +#define _PAGE_HASHPTE 0x002 /* hash_page has made an HPTE for this pte */ +#define _PAGE_USER 0x004 /* usermode access allowed */ #define _PAGE_GUARDED 0x008 /* G: prohibit speculative access */ #define _PAGE_COHERENT 0x010 /* M: enforce memory coherence (SMP systems) */ #define _PAGE_NO_CACHE 0x020 /* I: cache inhibit */ @@ -27,7 +27,7 @@ #define _PAGE_DIRTY 0x080 /* C: page changed */ #define _PAGE_ACCESSED 0x100 /* R: page referenced */ #define _PAGE_EXEC 0x200 /* software: exec allowed */ -#define _PAGE_PRESENT 0x400 /* software: pte contains a translation */ +#define _PAGE_RW 0x400 /* software: user write access allowed */ #define _PAGE_SPECIAL 0x800 /* software: Special page */ #ifdef CONFIG_PTE_64BIT diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index e7a5b312a7db..fdb587c96a80 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -522,6 +522,7 @@ InstructionTLBMiss: andc. r1,r1,r0 /* check access & ~permission */ bne- InstructionAddressInvalid /* return if access not permitted */ /* Convert linux-style PTE to low word of PPC-style PTE */ + rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ ori r1, r1, 0xe05 /* clear out reserved bits */ andc r1, r0, r1 /* PP = user? 2 : 0 */ BEGIN_FTR_SECTION @@ -589,7 +590,8 @@ DataLoadTLBMiss: * we would need to update the pte atomically with lwarx/stwcx. */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwinm r1, r0, 0, 31, 31 /* _PAGE_RW -> PP lsb */ + rlwinm r1,r0,32-10,31,31 /* _PAGE_RW -> PP lsb */ + rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ rlwimi r0,r0,32-1,31,31 /* _PAGE_USER -> PP lsb */ ori r1,r1,0xe04 /* clear out reserved bits */ andc r1,r0,r1 /* PP = user? rw? 2: 3: 0 */ @@ -668,6 +670,7 @@ DataStoreTLBMiss: * we would need to update the pte atomically with lwarx/stwcx. */ /* Convert linux-style PTE to low word of PPC-style PTE */ + rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ li r1,0xe05 /* clear out reserved bits & PP lsb */ andc r1,r0,r1 /* PP = user? 2: 0 */ BEGIN_FTR_SECTION diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index f4294edeca9d..d94fef524ef5 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -310,9 +310,11 @@ Hash_msk = (((1 << Hash_bits) - 1) * 64) _GLOBAL(create_hpte) /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ + rlwinm r8,r5,32-10,31,31 /* _PAGE_RW -> PP lsb */ rlwinm r0,r5,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ - and r8, r5, r0 /* writable if _RW & _DIRTY */ - rlwimi r5, r5, 32 - 1, 31, 31 /* _PAGE_USER -> PP lsb */ + and r8,r8,r0 /* writable if _RW & _DIRTY */ + rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ + rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ ori r8,r8,0xe04 /* clear out reserved bits */ andc r8,r5,r8 /* PP = user? (rw&dirty? 2: 3): 0 */ BEGIN_FTR_SECTION From 84022ac17327c5383917f46c162fd943cf79583f Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Wed, 20 Feb 2019 12:26:58 +0530 Subject: [PATCH 132/200] powerpc: sstep: Add tests for compute type instructions This enhances the current selftest framework for validating the in-kernel instruction emulation infrastructure by adding support for compute type instructions i.e. 
integer ALU-based instructions. Originally, this framework was limited to only testing load and store instructions. While most of the GPRs can be validated, support for SPRs is limited to LR, CR and XER for now. When writing the test cases, one must ensure that the Stack Pointer (GPR1) or the Thread Pointer (GPR13) are not touched by any means as these are vital non-volatile registers. Signed-off-by: Sandipan Das [mpe: Use patch_site for the code patching] Signed-off-by: Michael Ellerman --- arch/powerpc/lib/Makefile | 3 +- arch/powerpc/lib/test_emulate_step.c | 167 +++++++++++++++++- .../lib/test_emulate_step_exec_instr.S | 150 ++++++++++++++++ 3 files changed, 315 insertions(+), 5 deletions(-) create mode 100644 arch/powerpc/lib/test_emulate_step_exec_instr.S diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 3bf9fc6fd36c..79396e184bca 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -30,7 +30,8 @@ obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ obj64-$(CONFIG_SMP) += locks.o obj64-$(CONFIG_ALTIVEC) += vmx-helper.o -obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o +obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o \ + test_emulate_step_exec_instr.o obj-y += checksum_$(BITS).o checksum_wrappers.o \ string_$(BITS).o memcmp_$(BITS).o diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c index 6c47daa61614..e9f762ac825f 100644 --- a/arch/powerpc/lib/test_emulate_step.c +++ b/arch/powerpc/lib/test_emulate_step.c @@ -1,5 +1,5 @@ /* - * Simple sanity test for emulate_step load/store instructions. + * Simple sanity tests for instruction emulation infrastructure. * * Copyright IBM Corp. 2016 * @@ -14,6 +14,7 @@ #include #include #include +#include #define IMM_L(i) ((uintptr_t)(i) & 0xffff) @@ -49,6 +50,11 @@ #define TEST_LXVD2X(s, a, b) (PPC_INST_LXVD2X | VSX_XX1((s), R##a, R##b)) #define TEST_STXVD2X(s, a, b) (PPC_INST_STXVD2X | VSX_XX1((s), R##a, R##b)) +#define MAX_SUBTESTS 16 + +#define IGNORE_GPR(n) (0x1UL << (n)) +#define IGNORE_XER (0x1UL << 32) +#define IGNORE_CCR (0x1UL << 33) static void __init init_pt_regs(struct pt_regs *regs) { @@ -72,9 +78,15 @@ static void __init init_pt_regs(struct pt_regs *regs) msr_cached = true; } -static void __init show_result(char *ins, char *result) +static void __init show_result(char *mnemonic, char *result) { - pr_info("%-14s : %s\n", ins, result); + pr_info("%-14s : %s\n", mnemonic, result); +} + +static void __init show_result_with_descr(char *mnemonic, char *descr, + char *result) +{ + pr_info("%-14s : %-50s %s\n", mnemonic, descr, result); } static void __init test_ld(void) @@ -426,7 +438,7 @@ static void __init test_lxvd2x_stxvd2x(void) } #endif /* CONFIG_VSX */ -static int __init test_emulate_step(void) +static void __init run_tests_load_store(void) { test_ld(); test_lwz(); @@ -437,6 +449,153 @@ static int __init test_emulate_step(void) test_lfdx_stfdx(); test_lvx_stvx(); test_lxvd2x_stxvd2x(); +} + +struct compute_test { + char *mnemonic; + struct { + char *descr; + unsigned long flags; + unsigned int instr; + struct pt_regs regs; + } subtests[MAX_SUBTESTS + 1]; +}; + +static struct compute_test compute_tests[] = { + { + .mnemonic = "nop", + .subtests = { + { + .descr = "R0 = LONG_MAX", + .instr = PPC_INST_NOP, + .regs = { + .gpr[0] = LONG_MAX, + } + } + } + } +}; + +static int __init emulate_compute_instr(struct pt_regs *regs, + unsigned int instr) +{ + struct instruction_op op; + + if (!regs || !instr) + return -EINVAL; + + if 
(analyse_instr(&op, regs, instr) != 1 || + GETTYPE(op.type) != COMPUTE) { + pr_info("emulation failed, instruction = 0x%08x\n", instr); + return -EFAULT; + } + + emulate_update_regs(regs, &op); + return 0; +} + +static int __init execute_compute_instr(struct pt_regs *regs, + unsigned int instr) +{ + extern int exec_instr(struct pt_regs *regs); + extern s32 patch__exec_instr; + + if (!regs || !instr) + return -EINVAL; + + /* Patch the NOP with the actual instruction */ + patch_instruction_site(&patch__exec_instr, instr); + if (exec_instr(regs)) { + pr_info("execution failed, instruction = 0x%08x\n", instr); + return -EFAULT; + } + + return 0; +} + +#define gpr_mismatch(gprn, exp, got) \ + pr_info("GPR%u mismatch, exp = 0x%016lx, got = 0x%016lx\n", \ + gprn, exp, got) + +#define reg_mismatch(name, exp, got) \ + pr_info("%s mismatch, exp = 0x%016lx, got = 0x%016lx\n", \ + name, exp, got) + +static void __init run_tests_compute(void) +{ + unsigned long flags; + struct compute_test *test; + struct pt_regs *regs, exp, got; + unsigned int i, j, k, instr; + bool ignore_gpr, ignore_xer, ignore_ccr, passed; + + for (i = 0; i < ARRAY_SIZE(compute_tests); i++) { + test = &compute_tests[i]; + + for (j = 0; j < MAX_SUBTESTS && test->subtests[j].descr; j++) { + instr = test->subtests[j].instr; + flags = test->subtests[j].flags; + regs = &test->subtests[j].regs; + ignore_xer = flags & IGNORE_XER; + ignore_ccr = flags & IGNORE_CCR; + passed = true; + + memcpy(&exp, regs, sizeof(struct pt_regs)); + memcpy(&got, regs, sizeof(struct pt_regs)); + + /* + * Set a compatible MSR value explicitly to ensure + * that XER and CR bits are updated appropriately + */ + exp.msr = MSR_KERNEL; + got.msr = MSR_KERNEL; + + if (emulate_compute_instr(&got, instr) || + execute_compute_instr(&exp, instr)) { + passed = false; + goto print; + } + + /* Verify GPR values */ + for (k = 0; k < 32; k++) { + ignore_gpr = flags & IGNORE_GPR(k); + if (!ignore_gpr && exp.gpr[k] != got.gpr[k]) { + passed = false; + gpr_mismatch(k, exp.gpr[k], got.gpr[k]); + } + } + + /* Verify LR value */ + if (exp.link != got.link) { + passed = false; + reg_mismatch("LR", exp.link, got.link); + } + + /* Verify XER value */ + if (!ignore_xer && exp.xer != got.xer) { + passed = false; + reg_mismatch("XER", exp.xer, got.xer); + } + + /* Verify CR value */ + if (!ignore_ccr && exp.ccr != got.ccr) { + passed = false; + reg_mismatch("CR", exp.ccr, got.ccr); + } + +print: + show_result_with_descr(test->mnemonic, + test->subtests[j].descr, + passed ? "PASS" : "FAIL"); + } + } +} + +static int __init test_emulate_step(void) +{ + printk(KERN_INFO "Running instruction emulation self-tests ...\n"); + run_tests_load_store(); + run_tests_compute(); return 0; } diff --git a/arch/powerpc/lib/test_emulate_step_exec_instr.S b/arch/powerpc/lib/test_emulate_step_exec_instr.S new file mode 100644 index 000000000000..1580f34f4f4f --- /dev/null +++ b/arch/powerpc/lib/test_emulate_step_exec_instr.S @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Non-emulated single-stepping support (currently limited to basic integer + * computations) used to validate the instruction emulation infrastructure. 
+ * + * Copyright (C) 2019 IBM Corporation + */ + +#include +#include +#include +#include + +/* int exec_instr(struct pt_regs *regs) */ +_GLOBAL(exec_instr) + + /* + * Stack frame layout (INT_FRAME_SIZE bytes) + * In-memory pt_regs (SP + STACK_FRAME_OVERHEAD) + * Scratch space (SP + 8) + * Back chain (SP + 0) + */ + + /* + * Allocate a new stack frame with enough space to hold the register + * states in an in-memory pt_regs and also create the back chain to + * the caller's stack frame. + */ + stdu r1, -INT_FRAME_SIZE(r1) + + /* + * Save non-volatile GPRs on stack. This includes TOC pointer (GPR2) + * and local variables (GPR14 to GPR31). The register for the pt_regs + * parameter (GPR3) is saved additionally to ensure that the resulting + * register state can still be saved even if GPR3 gets overwritten + * when loading the initial register state for the test instruction. + * The stack pointer (GPR1) and the thread pointer (GPR13) are not + * saved as these should not be modified anyway. + */ + SAVE_2GPRS(2, r1) + SAVE_NVGPRS(r1) + + /* + * Save LR on stack to ensure that the return address is available + * even if it gets overwritten by the test instruction. + */ + mflr r0 + std r0, _LINK(r1) + + /* + * Save CR on stack. For simplicity, the entire register is saved + * even though only fields 2 to 4 are non-volatile. + */ + mfcr r0 + std r0, _CCR(r1) + + /* + * Load register state for the test instruction without touching the + * critical non-volatile registers. The register state is passed as a + * pointer to a pt_regs instance. + */ + subi r31, r3, GPR0 + + /* Load LR from pt_regs */ + ld r0, _LINK(r31) + mtlr r0 + + /* Load CR from pt_regs */ + ld r0, _CCR(r31) + mtcr r0 + + /* Load XER from pt_regs */ + ld r0, _XER(r31) + mtxer r0 + + /* Load GPRs from pt_regs */ + REST_GPR(0, r31) + REST_10GPRS(2, r31) + REST_GPR(12, r31) + REST_NVGPRS(r31) + + /* Placeholder for the test instruction */ +1: nop + patch_site 1b patch__exec_instr + + /* + * Since GPR3 is overwritten, temporarily restore it back to its + * original state, i.e. the pointer to pt_regs, to ensure that the + * resulting register state can be saved. Before doing this, a copy + * of it is created in the scratch space which is used later on to + * save it to pt_regs. + */ + std r3, 8(r1) + REST_GPR(3, r1) + + /* Save resulting GPR state to pt_regs */ + subi r3, r3, GPR0 + SAVE_GPR(0, r3) + SAVE_GPR(2, r3) + SAVE_8GPRS(4, r3) + SAVE_GPR(12, r3) + SAVE_NVGPRS(r3) + + /* Save resulting LR to pt_regs */ + mflr r0 + std r0, _LINK(r3) + + /* Save resulting CR to pt_regs */ + mfcr r0 + std r0, _CCR(r3) + + /* Save resulting XER to pt_regs */ + mfxer r0 + std r0, _XER(r3) + + /* Restore resulting GPR3 from scratch space and save it to pt_regs */ + ld r0, 8(r1) + std r0, GPR3(r3) + + /* Set return value to denote execution success */ + li r3, 0 + + /* Continue */ + b 3f + + /* Set return value to denote execution failure */ +2: li r3, -EFAULT + + /* Restore the non-volatile GPRs from stack */ +3: REST_GPR(2, r1) + REST_NVGPRS(r1) + + /* Restore LR from stack to be able to return */ + ld r0, _LINK(r1) + mtlr r0 + + /* Restore CR from stack */ + ld r0, _CCR(r1) + mtcr r0 + + /* Tear down stack frame */ + addi r1, r1, INT_FRAME_SIZE + + /* Return */ + blr + + /* Setup exception table */ + EX_TABLE(1b, 2b) + +_ASM_NOKPROBE_SYMBOL(exec_instr) From 44dea1784b2fdea3aec20c14306aebbeb4294f64 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Wed, 20 Feb 2019 12:26:59 +0530 Subject: [PATCH 133/200] powerpc: sstep: Add tests for add[.] 
instruction This adds test cases for the add[.] instruction. Signed-off-by: Sandipan Das Signed-off-by: Michael Ellerman --- arch/powerpc/lib/test_emulate_step.c | 176 +++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c index e9f762ac825f..ee6d5ac3b615 100644 --- a/arch/powerpc/lib/test_emulate_step.c +++ b/arch/powerpc/lib/test_emulate_step.c @@ -49,6 +49,10 @@ ___PPC_RA(a) | ___PPC_RB(b)) #define TEST_LXVD2X(s, a, b) (PPC_INST_LXVD2X | VSX_XX1((s), R##a, R##b)) #define TEST_STXVD2X(s, a, b) (PPC_INST_STXVD2X | VSX_XX1((s), R##a, R##b)) +#define TEST_ADD(t, a, b) (PPC_INST_ADD | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define TEST_ADD_DOT(t, a, b) (PPC_INST_ADD | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b) | 0x1) #define MAX_SUBTESTS 16 @@ -473,6 +477,178 @@ static struct compute_test compute_tests[] = { } } } + }, + { + .mnemonic = "add", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX, RB = LONG_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = LONG_MAX, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = ULONG_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = 0x1", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MIN", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = INT_MAX, RB = INT_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = INT_MAX, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = UINT_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = 0x1", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = 0x1, + } + } + } + }, + { + .mnemonic = "add.", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .flags = IGNORE_CCR, + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX, RB = LONG_MAX", + .flags = IGNORE_CCR, + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MAX, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = ULONG_MAX", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = 0x1", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MIN", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MAX", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = 
INT_MAX, + } + }, + { + .descr = "RA = INT_MAX, RB = INT_MAX", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MAX, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = UINT_MAX", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = 0x1", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = 0x1, + } + } + } } }; From 78a8da0600940d679bb727cea7e153685e211723 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Wed, 20 Feb 2019 12:27:00 +0530 Subject: [PATCH 134/200] powerpc: sstep: Add tests for addc[.] instruction This adds test cases for the addc[.] instruction. Signed-off-by: Sandipan Das Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/ppc-opcode.h | 1 + arch/powerpc/lib/test_emulate_step.c | 192 ++++++++++++++++++++++++++ 2 files changed, 193 insertions(+) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 19a8834e0398..87b73aa56b53 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -326,6 +326,7 @@ #define PPC_INST_ADDI 0x38000000 #define PPC_INST_ADDIS 0x3c000000 #define PPC_INST_ADD 0x7c000214 +#define PPC_INST_ADDC 0x7c000014 #define PPC_INST_SUB 0x7c000050 #define PPC_INST_BLR 0x4e800020 #define PPC_INST_BLRL 0x4e800021 diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c index ee6d5ac3b615..9992c1ea7a1d 100644 --- a/arch/powerpc/lib/test_emulate_step.c +++ b/arch/powerpc/lib/test_emulate_step.c @@ -53,6 +53,10 @@ ___PPC_RA(a) | ___PPC_RB(b)) #define TEST_ADD_DOT(t, a, b) (PPC_INST_ADD | ___PPC_RT(t) | \ ___PPC_RA(a) | ___PPC_RB(b) | 0x1) +#define TEST_ADDC(t, a, b) (PPC_INST_ADDC | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define TEST_ADDC_DOT(t, a, b) (PPC_INST_ADDC | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b) | 0x1) #define MAX_SUBTESTS 16 @@ -649,6 +653,194 @@ static struct compute_test compute_tests[] = { } } } + }, + { + .mnemonic = "addc", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX, RB = LONG_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = LONG_MAX, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = ULONG_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = 0x1", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MIN", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = INT_MAX, RB = INT_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = INT_MAX, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = UINT_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = 0x1", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = 0x1, + } + 
}, + { + .descr = "RA = LONG_MIN | INT_MIN, RB = LONG_MIN | INT_MIN", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN | (uint)INT_MIN, + .gpr[22] = LONG_MIN | (uint)INT_MIN, + } + } + } + }, + { + .mnemonic = "addc.", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .flags = IGNORE_CCR, + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX, RB = LONG_MAX", + .flags = IGNORE_CCR, + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MAX, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = ULONG_MAX", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = 0x1", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MIN", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MAX", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = INT_MAX, RB = INT_MAX", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MAX, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = UINT_MAX", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = 0x1", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = LONG_MIN | INT_MIN, RB = LONG_MIN | INT_MIN", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN | (uint)INT_MIN, + .gpr[22] = LONG_MIN | (uint)INT_MIN, + } + } + } } }; From 665bed2386e5dc29844ad78c7ef1464664b103ec Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 13 Feb 2019 16:06:19 +0000 Subject: [PATCH 135/200] powerpc/8xx: replace most #ifdef by IS_ENABLED() in 8xx_mmu.c This patch replaces most #ifdef mess by IS_ENABLED() in 8xx_mmu.c This has the advantage of allowing syntax verification at compile time regardless of selected options. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/8xx_mmu.c | 42 +++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index f12ec85e965c..174452f7b5db 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -66,26 +66,22 @@ unsigned long p_block_mapped(phys_addr_t pa) void __init MMU_init_hw(void) { /* PIN up to the 3 first 8Mb after IMMR in DTLB table */ -#ifdef CONFIG_PIN_TLB_DATA - unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; - unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY; -#ifdef CONFIG_PIN_TLB_IMMR - int i = 29; -#else - int i = 28; -#endif - unsigned long addr = 0; - unsigned long mem = total_lowmem; + if (IS_ENABLED(CONFIG_PIN_TLB_DATA)) { + unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; + unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY; + int i = IS_ENABLED(CONFIG_PIN_TLB_IMMR) ? 
29 : 28; + unsigned long addr = 0; + unsigned long mem = total_lowmem; - for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) { - mtspr(SPRN_MD_CTR, ctr | (i << 8)); - mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID); - mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID); - mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT); - addr += LARGE_PAGE_SIZE_8M; - mem -= LARGE_PAGE_SIZE_8M; + for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) { + mtspr(SPRN_MD_CTR, ctr | (i << 8)); + mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID); + mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID); + mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT); + addr += LARGE_PAGE_SIZE_8M; + mem -= LARGE_PAGE_SIZE_8M; + } } -#endif } static void __init mmu_mapin_immr(void) @@ -110,12 +106,10 @@ unsigned long __init mmu_mapin_ram(unsigned long top) if (__map_without_ltlbs) { mapped = 0; mmu_mapin_immr(); -#ifndef CONFIG_PIN_TLB_IMMR - patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP); -#endif -#ifndef CONFIG_PIN_TLB_TEXT - mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0); -#endif + if (!IS_ENABLED(CONFIG_PIN_TLB_IMMR)) + patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP); + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0); } else { mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); }
From e4470bd6a41477333f13ef05d78d9d86a40ccf25 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 13 Feb 2019 16:06:21 +0000 Subject: [PATCH 136/200] powerpc/8xx: Map 32Mb of RAM at init. At the time being, initial MMU setup allows 24 Mbytes of DATA and 8 Mbytes of code. Some debug setups like CONFIG_KASAN generate huge kernels with text size over the 8M limit and data over the 24 Mbytes limit. Here is an 8xx kernel compiled with CONFIG_KASAN_INLINE for one of my boards: [root@po16846vm linux-powerpc]# size -x vmlinux text data bss dec hex filename 0x111019c 0x41b0d4 0x490de0 26984528 19bc050 vmlinux This patch maps up to 32 Mbytes of code based on the _einittext symbol and allows 32 Mbytes of memory instead of 24.
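A minimal illustrative sketch (not kernel code) of what the pinned-ITLB loop in the diff below computes: at most four 8M ITLB entries are programmed, and the loop stops once the mapped range passes _einittext. The offsets used in main() are made-up examples.

	#include <stdio.h>

	#define SZ_8M	0x00800000UL

	/* number of pinned 8M ITLB entries needed to cover the init text */
	static unsigned int pinned_itlb_entries(unsigned long einittext_offset)
	{
		unsigned long mapped = 0;
		unsigned int n = 0;

		while (n < 4 && mapped < einittext_offset) {	/* li r9, 4 ; mtctr r9 */
			mapped += SZ_8M;			/* addis r9, r9, 0x80 */
			n++;
		}
		return n;
	}

	int main(void)
	{
		printf("%u\n", pinned_itlb_entries(0x00600000));	/* 6M of text -> 1 entry */
		printf("%u\n", pinned_itlb_entries(0x01900000));	/* 25M of text -> 4 entries */
		return 0;
	}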
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_8xx.S | 51 +++++++++++++++++++++------------- arch/powerpc/mm/8xx_mmu.c | 7 +++-- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 7e14796bea81..52c92913e39b 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -337,8 +337,8 @@ InstructionTLBMiss: rlwinm r10, r10, 16, 0xfff8 cmpli cr0, r10, PAGE_OFFSET@h #ifndef CONFIG_PIN_TLB_TEXT - /* It is assumed that kernel code fits into the first 8M page */ -0: cmpli cr7, r10, (PAGE_OFFSET + 0x0800000)@h + /* It is assumed that kernel code fits into the first 32M */ +0: cmpli cr7, r10, (PAGE_OFFSET + 0x2000000)@h patch_site 0b, patch__itlbmiss_linmem_top #endif #endif @@ -434,7 +434,7 @@ DataStoreTLBMiss: #ifndef CONFIG_PIN_TLB_IMMR cmpli cr6, r10, VIRT_IMMR_BASE@h #endif -0: cmpli cr7, r10, (PAGE_OFFSET + 0x1800000)@h +0: cmpli cr7, r10, (PAGE_OFFSET + 0x2000000)@h patch_site 0b, patch__dtlbmiss_linmem_top mfspr r10, SPRN_M_TWB /* Get level 1 table */ @@ -886,28 +886,11 @@ initial_mmu: mtspr SPRN_MD_CTR, r10 /* remove PINNED DTLB entries */ tlbia /* Invalidate all TLB entries */ -#ifdef CONFIG_PIN_TLB_TEXT - lis r8, MI_RSV4I@h - ori r8, r8, 0x1c00 - - mtspr SPRN_MI_CTR, r8 /* Set instruction MMU control */ -#endif - #ifdef CONFIG_PIN_TLB_DATA oris r10, r10, MD_RSV4I@h mtspr SPRN_MD_CTR, r10 /* Set data TLB control */ #endif - /* Now map the lower 8 Meg into the ITLB. */ - lis r8, KERNELBASE@h /* Create vaddr for TLB */ - ori r8, r8, MI_EVALID /* Mark it valid */ - mtspr SPRN_MI_EPN, r8 - li r8, MI_PS8MEG /* Set 8M byte page */ - ori r8, r8, MI_SVALID /* Make it valid */ - mtspr SPRN_MI_TWC, r8 - li r8, MI_BOOTINIT /* Create RPN for address 0 */ - mtspr SPRN_MI_RPN, r8 /* Store TLB entry */ - lis r8, MI_APG_INIT@h /* Set protection modes */ ori r8, r8, MI_APG_INIT@l mtspr SPRN_MI_AP, r8 @@ -937,6 +920,34 @@ initial_mmu: mtspr SPRN_MD_RPN, r8 #endif + /* Now map the lower RAM (up to 32 Mbytes) into the ITLB. */ +#ifdef CONFIG_PIN_TLB_TEXT + lis r8, MI_RSV4I@h + ori r8, r8, 0x1c00 +#endif + li r9, 4 /* up to 4 pages of 8M */ + mtctr r9 + lis r9, KERNELBASE@h /* Create vaddr for TLB */ + li r10, MI_PS8MEG | MI_SVALID /* Set 8M byte page */ + li r11, MI_BOOTINIT /* Create RPN for address 0 */ + lis r12, _einittext@h + ori r12, r12, _einittext@l +1: +#ifdef CONFIG_PIN_TLB_TEXT + mtspr SPRN_MI_CTR, r8 /* Set instruction MMU control */ + addi r8, r8, 0x100 +#endif + + ori r0, r9, MI_EVALID /* Mark it valid */ + mtspr SPRN_MI_EPN, r0 + mtspr SPRN_MI_TWC, r10 + mtspr SPRN_MI_RPN, r11 /* Store TLB entry */ + addis r9, r9, 0x80 + addis r11, r11, 0x80 + + cmpl cr0, r9, r12 + bdnzf gt, 1b + /* Since the cache is enabled according to the information we * just loaded into the TLB, invalidate and enable the caches here. * We should probably check/set other modes....later. 
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index 174452f7b5db..e95196fdc92b 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -112,6 +112,9 @@ unsigned long __init mmu_mapin_ram(unsigned long top) mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0); } else { mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, + _ALIGN(__pa(_einittext), 8 << 20)); } mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped); @@ -140,8 +143,8 @@ void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, */ BUG_ON(first_memblock_base != 0); - /* 8xx can only access 24MB at the moment */ - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01800000)); + /* 8xx can only access 32MB at the moment */ + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x02000000)); } /*
From 6d183ca8baec983dc4208ca45ece3c36763df912 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 19:08:37 +0000 Subject: [PATCH 137/200] powerpc/wii: properly disable use of BATs when requested. The 'nobats' kernel parameter or some options like CONFIG_DEBUG_PAGEALLOC deny the use of BATs for mapping memory. This patch makes sure that the specific wii RAM mapping function takes it into account as well. Fixes: de32400dd26e ("wii: use both mem1 and mem2 as ram") Cc: stable@vger.kernel.org Reviewed-by: Jonathan Neuschafer Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/embedded6xx/wii.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/platforms/embedded6xx/wii.c b/arch/powerpc/platforms/embedded6xx/wii.c index ecf703ee3a76..ac4ee88efc80 100644 --- a/arch/powerpc/platforms/embedded6xx/wii.c +++ b/arch/powerpc/platforms/embedded6xx/wii.c @@ -83,6 +83,10 @@ unsigned long __init wii_mmu_mapin_mem2(unsigned long top) /* MEM2 64MB@0x10000000 */ delta = wii_hole_start + wii_hole_size; size = top - delta; + + if (__map_without_bats) + return delta; + for (bl = 128<<10; bl < max_size; bl <<= 1) { if (bl * 2 > size) break;
From 14e609d693ef678a211a8dcd0e13463a2581ed85 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 19:08:38 +0000 Subject: [PATCH 138/200] powerpc/mm/32: add base address to mmu_mapin_ram() At the time being, mmu_mapin_ram() always maps RAM from the beginning. But some platforms like the WII have to map a second block of RAM. This patch adds to mmu_mapin_ram() the base address of the block. At the moment, only base address 0 is supported.
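As an illustration of the interface change, every existing caller keeps its behaviour by passing base 0, as the hunks below show:

	s = mmu_mapin_ram(top);		/* before */
	s = mmu_mapin_ram(0, top);	/* after */

A follow-up patch can then pass a non-zero base for the WII's second RAM block.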
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/40x_mmu.c | 2 +- arch/powerpc/mm/44x_mmu.c | 2 +- arch/powerpc/mm/8xx_mmu.c | 2 +- arch/powerpc/mm/fsl_booke_mmu.c | 2 +- arch/powerpc/mm/mmu_decl.h | 2 +- arch/powerpc/mm/pgtable_32.c | 6 +++--- arch/powerpc/mm/ppc_mmu_32.c | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c index 61ac468c87c6..b9cf6f8764b0 100644 --- a/arch/powerpc/mm/40x_mmu.c +++ b/arch/powerpc/mm/40x_mmu.c @@ -93,7 +93,7 @@ void __init MMU_init_hw(void) #define LARGE_PAGE_SIZE_16M (1<<24) #define LARGE_PAGE_SIZE_4M (1<<22) -unsigned long __init mmu_mapin_ram(unsigned long top) +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long v, s, mapped; phys_addr_t p; diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c index ea2b9af08a48..aad127acdbaa 100644 --- a/arch/powerpc/mm/44x_mmu.c +++ b/arch/powerpc/mm/44x_mmu.c @@ -170,7 +170,7 @@ void __init MMU_init_hw(void) flush_instruction_cache(); } -unsigned long __init mmu_mapin_ram(unsigned long top) +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long addr; unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index e95196fdc92b..ce11cbaa25d8 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -99,7 +99,7 @@ static void __init mmu_patch_cmp_limit(s32 *site, unsigned long mapped) modify_instruction_site(site, 0xffff, (unsigned long)__va(mapped) >> 16); } -unsigned long __init mmu_mapin_ram(unsigned long top) +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long mapped; diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index 080d49b26c3a..210cbc1faf63 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -221,7 +221,7 @@ unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun) #error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" #endif -unsigned long __init mmu_mapin_ram(unsigned long top) +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; } diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index c4a717da65eb..61730023dde3 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -130,7 +130,7 @@ extern void wii_memory_fixups(void); */ #ifdef CONFIG_PPC32 extern void MMU_init_hw(void); -extern unsigned long mmu_mapin_ram(unsigned long top); +unsigned long mmu_mapin_ram(unsigned long base, unsigned long top); #endif #ifdef CONFIG_PPC_FSL_BOOK3E diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index ded71126ce4c..b4858818523f 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -258,15 +258,15 @@ void __init mapin_ram(void) #ifndef CONFIG_WII top = total_lowmem; - s = mmu_mapin_ram(top); + s = mmu_mapin_ram(0, top); __mapin_ram_chunk(s, top); #else if (!wii_hole_size) { - s = mmu_mapin_ram(total_lowmem); + s = mmu_mapin_ram(0, total_lowmem); __mapin_ram_chunk(s, total_lowmem); } else { top = wii_hole_start; - s = mmu_mapin_ram(top); + s = mmu_mapin_ram(0, top); __mapin_ram_chunk(s, top); top = memblock_end_of_DRAM(); diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index fb747bb0b3e4..2d4d4bf9a63d 100644 --- 
a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -73,7 +73,7 @@ unsigned long p_block_mapped(phys_addr_t pa) return 0; } -unsigned long __init mmu_mapin_ram(unsigned long top) +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long tot, bl, done; unsigned long max_size = (256<<20);
From e4d6654ebe6e05c94db0ce2d30769e3822f178bd Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 19:08:39 +0000 Subject: [PATCH 139/200] powerpc/mm/32s: rework mmu_mapin_ram() This patch reworks mmu_mapin_ram() to be more generic and map as many blocks as possible. It now supports blocks not starting at address 0. It scans the DBAT array to find free ones instead of forcing the use of BAT2 and BAT3. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/ppc_mmu_32.c | 63 +++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index 2d4d4bf9a63d..621392e6d466 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -73,39 +73,58 @@ unsigned long p_block_mapped(phys_addr_t pa) return 0; } +static int find_free_bat(void) +{ + int b; + + if (cpu_has_feature(CPU_FTR_601)) { + for (b = 0; b < 4; b++) { + struct ppc_bat *bat = BATS[b]; + + if (!(bat[0].batl & 0x40)) + return b; + } + } else { + int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + + for (b = 0; b < n; b++) { + struct ppc_bat *bat = BATS[b]; + + if (!(bat[1].batu & 3)) + return b; + } + } + return -1; +} + +static unsigned int block_size(unsigned long base, unsigned long top) +{ + unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 8 : 256) << 20; + unsigned int base_shift = (fls(base) - 1) & 31; + unsigned int block_shift = (fls(top - base) - 1) & 31; + + return min3(max_size, 1U << base_shift, 1U << block_shift); +} + unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { - unsigned long tot, bl, done; - unsigned long max_size = (256<<20); + int idx; if (__map_without_bats) { printk(KERN_DEBUG "RAM mapped without BATs\n"); - return 0; + return base; } - /* Set up BAT2 and if necessary BAT3 to cover RAM. */ + while ((idx = find_free_bat()) != -1 && base != top) { + unsigned int size = block_size(base, top); - /* Make sure we don't map a block larger than the - smallest alignment of the physical address. */ - tot = top; - for (bl = 128<<10; bl < max_size; bl <<= 1) { - if (bl * 2 > tot) + if (size < 128 << 10) break; + setbat(idx, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X); + base += size; } - setbat(2, PAGE_OFFSET, 0, bl, PAGE_KERNEL_X); - done = (unsigned long)bat_addrs[2].limit - PAGE_OFFSET + 1; - if ((done < tot) && !bat_addrs[3].limit) { - /* use BAT3 to cover a bit more */ - tot -= done; - for (bl = 128<<10; bl < max_size; bl <<= 1) - if (bl * 2 > tot) - break; - setbat(3, PAGE_OFFSET+done, done, bl, PAGE_KERNEL_X); - done = (unsigned long)bat_addrs[3].limit - PAGE_OFFSET + 1; - } - - return done; + return base; } /*
From 9e849f231c3c72d4c3c1b07c9cd19ae789da0420 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 19:08:40 +0000 Subject: [PATCH 140/200] powerpc/mm/32s: use generic mmu_mapin_ram() for all blocks. Now that mmu_mapin_ram() is able to handle other blocks than the one starting at 0, the WII can use it for all its blocks.
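As an illustrative worked example of the block_size() logic introduced above, with made-up addresses: for base = 0x01000000 and top = 0x03000000, the first iteration picks min3(256M, 16M from the base alignment, 32M from fls(top - base)) = 16M; base then advances to 0x02000000 and the second iteration picks min3(256M, 32M, 16M) = 16M, so the 32M range ends up covered by two 16M BATs.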
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable_32.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index b4858818523f..c4b0eb51f6d8 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -254,26 +254,17 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) void __init mapin_ram(void) { - unsigned long s, top; + struct memblock_region *reg; -#ifndef CONFIG_WII - top = total_lowmem; - s = mmu_mapin_ram(0, top); - __mapin_ram_chunk(s, top); -#else - if (!wii_hole_size) { - s = mmu_mapin_ram(0, total_lowmem); - __mapin_ram_chunk(s, total_lowmem); - } else { - top = wii_hole_start; - s = mmu_mapin_ram(0, top); - __mapin_ram_chunk(s, top); + for_each_memblock(memory, reg) { + phys_addr_t base = reg->base; + phys_addr_t top = min(base + reg->size, total_lowmem); - top = memblock_end_of_DRAM(); - s = wii_mmu_mapin_mem2(top); - __mapin_ram_chunk(s, top); + if (base >= top) + continue; + base = mmu_mapin_ram(base, top); + __mapin_ram_chunk(base, top); } -#endif } /* Scan the real Linux page tables and return a PTE pointer for
From d2f15e0979ee779649dec730cf17511b6f79e5be Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 19:08:41 +0000 Subject: [PATCH 141/200] powerpc/32: always populate page tables for Abatron BDI. When CONFIG_BDI_SWITCH is set, the page tables have to be populated although large TLBs are used, because the BDI switch knows nothing about those large TLBs which are handled directly in TLB miss logic. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable_32.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index c4b0eb51f6d8..a000768a5cc9 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -263,7 +263,10 @@ void __init mapin_ram(void) if (base >= top) continue; base = mmu_mapin_ram(base, top); - __mapin_ram_chunk(base, top); + if (IS_ENABLED(CONFIG_BDI_SWITCH)) + __mapin_ram_chunk(reg->base, top); + else + __mapin_ram_chunk(base, top); } }
From 160985f3025be79c860706aedfaf3f8945a7591d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 19:08:42 +0000 Subject: [PATCH 142/200] powerpc/wii: remove wii_mmu_mapin_mem2() wii_mmu_mapin_mem2() is not used anymore, remove it.
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/embedded6xx/wii.c | 28 ------------------------ 1 file changed, 28 deletions(-) diff --git a/arch/powerpc/platforms/embedded6xx/wii.c b/arch/powerpc/platforms/embedded6xx/wii.c index ac4ee88efc80..235fe81aa2b1 100644 --- a/arch/powerpc/platforms/embedded6xx/wii.c +++ b/arch/powerpc/platforms/embedded6xx/wii.c @@ -54,10 +54,6 @@ static void __iomem *hw_ctrl; static void __iomem *hw_gpio; -unsigned long wii_hole_start; -unsigned long wii_hole_size; - - static int __init page_aligned(unsigned long x) { return !(x & (PAGE_SIZE-1)); @@ -69,30 +65,6 @@ void __init wii_memory_fixups(void) BUG_ON(memblock.memory.cnt != 2); BUG_ON(!page_aligned(p[0].base) || !page_aligned(p[1].base)); - - /* determine hole */ - wii_hole_start = ALIGN(p[0].base + p[0].size, PAGE_SIZE); - wii_hole_size = p[1].base - wii_hole_start; -} - -unsigned long __init wii_mmu_mapin_mem2(unsigned long top) -{ - unsigned long delta, size, bl; - unsigned long max_size = (256<<20); - - /* MEM2 64MB@0x10000000 */ - delta = wii_hole_start + wii_hole_size; - size = top - delta; - - if (__map_without_bats) - return delta; - - for (bl = 128<<10; bl < max_size; bl <<= 1) { - if (bl * 2 > size) - break; - } - setbat(4, PAGE_OFFSET+delta, delta, bl, PAGE_KERNEL_X); - return delta + bl; } static void __noreturn wii_spin(void)
From df25f863903bab5fa0b8fd3a429eb8f455852986 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 19:08:43 +0000 Subject: [PATCH 143/200] powerpc/mm/32s: use _PAGE_EXEC in setbat() Do not set IBAT when setbat() is called without _PAGE_EXEC. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/ppc_mmu_32.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index 621392e6d466..9225da8bae4c 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -131,6 +131,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) * Set up one of the I/D BAT (block address translation) register pairs. * The parameters are not checked; in particular size must be a power * of 2 between 128k and 256M. + * On 603+, only set IBAT when _PAGE_EXEC is set */ void __init setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, pgprot_t prot) @@ -157,11 +158,12 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, bat[1].batu |= 1; /* Vp = 1 */ if (flags & _PAGE_GUARDED) { /* G bit must be zero in IBATs */ - bat[0].batu = bat[0].batl = 0; - } else { - /* make IBAT same as DBAT */ - bat[0] = bat[1]; + flags &= ~_PAGE_EXEC; } + if (flags & _PAGE_EXEC) + bat[0] = bat[1]; + else + bat[0].batu = bat[0].batl = 0; } else { /* 601 cpu */ if (bl > BL_8M)
From 02d5d13b4544dff5fc0c0aa0179085ed52b72ecd Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 19:08:44 +0000 Subject: [PATCH 144/200] powerpc/32: add helper to write into segment registers This patch adds a helper that wraps the 'mtsrin' instruction to write into segment registers.
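As an illustrative usage sketch (the constant and call pattern below come from a later patch in this series that sets No-Execute on kernel segments):

	u32 ea = 0xc0000000;			/* example effective address */
	mtsrin(mfsrin(ea) | 0x10000000, ea);	/* 0x10000000 is the segment N (No-Execute) bit */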
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/reg.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 1f79e1d8fb0b..c25880e6a16a 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1425,6 +1425,11 @@ static inline void msr_check_and_clear(unsigned long bits) #define mfsrin(v) ({unsigned int rval; \ asm volatile("mfsrin %0,%1" : "=r" (rval) : "r" (v)); \ rval;}) + +static inline void mtsrin(u32 val, u32 idx) +{ + asm volatile("mtsrin %0, %1" : : "r" (val), "r" (idx)); +} #endif #define proc_trap() asm volatile("trap")
From 28ea38b9cba68eec55cf550acd6b36b6f507cd17 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 19:08:45 +0000 Subject: [PATCH 145/200] powerpc/mmu: add is_strict_kernel_rwx() helper Add a helper to know whether STRICT_KERNEL_RWX is enabled. This is based on the rodata_enabled flag which is defined only when CONFIG_STRICT_KERNEL_RWX is selected. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mmu.h | 11 +++++++++++ arch/powerpc/mm/init_32.c | 4 +--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 6d22a8e78fe2..d34ad1657d7b 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -289,6 +289,17 @@ static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) } #endif /* CONFIG_PPC_MEM_KEYS */ +#ifdef CONFIG_STRICT_KERNEL_RWX +static inline bool strict_kernel_rwx_enabled(void) +{ + return rodata_enabled; +} +#else +static inline bool strict_kernel_rwx_enabled(void) +{ + return false; +} +#endif #endif /* !__ASSEMBLY__ */ /* The kernel use the constants below to index in the page sizes array. diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 3e59e5d64b01..ee5a430b9a18 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -108,12 +108,10 @@ static void __init MMU_setup(void) __map_without_bats = 1; __map_without_ltlbs = 1; } -#ifdef CONFIG_STRICT_KERNEL_RWX - if (rodata_enabled) { + if (strict_kernel_rwx_enabled()) { __map_without_bats = 1; __map_without_ltlbs = 1; } -#endif } /*
From 555f4fdb93e70d39e664fcc52cda23c5b62a46cc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 19:08:46 +0000 Subject: [PATCH 146/200] powerpc/kconfig: define PAGE_SHIFT inside Kconfig This patch defines CONFIG_PPC_PAGE_SHIFT in order to be able to use the PAGE_SHIFT value inside Kconfig. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 7 +++++++ arch/powerpc/include/asm/page.h | 13 ++----------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3d5d63c9b797..0f933797c376 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -704,6 +704,13 @@ config PPC_256K_PAGES endchoice +config PPC_PAGE_SHIFT + int + default 18 if PPC_256K_PAGES + default 16 if PPC_64K_PAGES + default 14 if PPC_16K_PAGES + default 12 + config THREAD_SHIFT int "Thread shift" if EXPERT range 13 15 diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index aa4497175bd3..ed870468ef6f 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -20,20 +20,11 @@ /* * On regular PPC32 page size is 4K (but we support 4K/16K/64K/256K pages - * on PPC44x).
For PPC64 we support either 4K or 64K software + * on PPC44x and 4K/16K on 8xx). For PPC64 we support either 4K or 64K software * page size. When using 64K pages however, whether we are really supporting * 64K pages in HW or not is irrelevant to those definitions. */ -#if defined(CONFIG_PPC_256K_PAGES) -#define PAGE_SHIFT 18 -#elif defined(CONFIG_PPC_64K_PAGES) -#define PAGE_SHIFT 16 -#elif defined(CONFIG_PPC_16K_PAGES) -#define PAGE_SHIFT 14 -#else -#define PAGE_SHIFT 12 -#endif - +#define PAGE_SHIFT CONFIG_PPC_PAGE_SHIFT #define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) #ifndef __ASSEMBLY__
From 166d97d961588d2e52037e96da18d2ead455cec1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 19:08:47 +0000 Subject: [PATCH 147/200] powerpc/kconfig: define CONFIG_DATA_SHIFT and CONFIG_ETEXT_SHIFT CONFIG_STRICT_KERNEL_RWX requires a special alignment for DATA for some subarches. Today it is just defined as an #ifdef in vmlinux.lds.S. In order to get more flexibility, this patch moves the definition of this alignment into Kconfig. On some subarches, CONFIG_STRICT_KERNEL_RWX will require a special alignment of _etext. This patch also adds a configuration item for it in Kconfig. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 9 +++++++++ arch/powerpc/kernel/vmlinux.lds.S | 9 +++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 0f933797c376..14223363a33d 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -721,6 +721,15 @@ config THREAD_SHIFT Used to define the stack size. The default is almost always what you want. Only change this if you know what you are doing. +config ETEXT_SHIFT + int + default PPC_PAGE_SHIFT + +config DATA_SHIFT + int + default 24 if STRICT_KERNEL_RWX && PPC64 + default PPC_PAGE_SHIFT + config FORCE_MAX_ZONEORDER int "Maximum zone order" range 8 9 if PPC64 && PPC_64K_PAGES diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index c3efb972c8c1..060a1acd7c6d 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -12,11 +12,8 @@ #include #include -#if defined(CONFIG_STRICT_KERNEL_RWX) && !defined(CONFIG_PPC32) -#define STRICT_ALIGN_SIZE (1 << 24) -#else -#define STRICT_ALIGN_SIZE PAGE_SIZE -#endif +#define STRICT_ALIGN_SIZE (1 << CONFIG_DATA_SHIFT) +#define ETEXT_ALIGN_SIZE (1 << CONFIG_ETEXT_SHIFT) ENTRY(_stext) @@ -131,7 +128,7 @@ SECTIONS } :kernel - . = ALIGN(PAGE_SIZE); + . = ALIGN(ETEXT_ALIGN_SIZE); _etext = .; PROVIDE32 (etext = .);
From 5e04ae85fbed8eef209a40a63f8ef507fe623064 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 19:08:48 +0000 Subject: [PATCH 148/200] powerpc/mm/32s: add setibat() clearibat() and update_bats() setibat() and clearibat() allow manipulating IBATs independently of DBATs. update_bats() allows updating the BATs after init. This is done with the MMU off.
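As a hedged sketch of the intended calling sequence (taken from the following patch in this series): program the shadow IBAT array, clear the unused slots, then flush everything to the hardware in one go:

	setibat(0, PAGE_OFFSET, 0, 128 << 10, PAGE_KERNEL_TEXT);	/* one 128k text block */
	clearibat(1);	/* unused slot */
	update_bats();	/* reload hardware BATs with the MMU off */

update_bats() has to run with the MMU off because it rewrites BAT entries that may be translating the kernel's own code and data while it executes.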
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/mmu-hash.h | 2 ++ arch/powerpc/kernel/head_32.S | 35 +++++++++++++++++++ arch/powerpc/mm/ppc_mmu_32.c | 32 +++++++++++++++++ 3 files changed, 69 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 0c261ba2c826..5cb588395fdc 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -92,6 +92,8 @@ typedef struct { unsigned long vdso_base; } mm_context_t; +void update_bats(void); + /* patch sites */ extern s32 patch__hash_page_A0, patch__hash_page_A1, patch__hash_page_A2; extern s32 patch__hash_page_B, patch__hash_page_C; diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index fdb587c96a80..613900bb8c39 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -1096,6 +1096,41 @@ BEGIN_MMU_FTR_SECTION END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) blr +_ENTRY(update_bats) + lis r4, 1f@h + ori r4, r4, 1f@l + tophys(r4, r4) + mfmsr r6 + mflr r7 + li r3, MSR_KERNEL & ~(MSR_IR | MSR_DR) + rlwinm r0, r6, 0, ~MSR_RI + rlwinm r0, r0, 0, ~MSR_EE + mtmsr r0 + mtspr SPRN_SRR0, r4 + mtspr SPRN_SRR1, r3 + SYNC + RFI +1: bl clear_bats + lis r3, BATS@ha + addi r3, r3, BATS@l + tophys(r3, r3) + LOAD_BAT(0, r3, r4, r5) + LOAD_BAT(1, r3, r4, r5) + LOAD_BAT(2, r3, r4, r5) + LOAD_BAT(3, r3, r4, r5) +BEGIN_MMU_FTR_SECTION + LOAD_BAT(4, r3, r4, r5) + LOAD_BAT(5, r3, r4, r5) + LOAD_BAT(6, r3, r4, r5) + LOAD_BAT(7, r3, r4, r5) +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) + li r3, MSR_KERNEL & ~(MSR_IR | MSR_DR | MSR_RI) + mtmsr r3 + mtspr SPRN_SRR0, r7 + mtspr SPRN_SRR1, r6 + SYNC + RFI + flush_tlbs: lis r10, 0x40 1: addic. r10, r10, -0x1000 diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index 9225da8bae4c..7b011280d076 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -106,6 +106,38 @@ static unsigned int block_size(unsigned long base, unsigned long top) return min3(max_size, 1U << base_shift, 1U << block_shift); } +/* + * Set up one of the IBAT (block address translation) register pairs. + * The parameters are not checked; in particular size must be a power + * of 2 between 128k and 256M. + * Only for 603+ ... + */ +static void setibat(int index, unsigned long virt, phys_addr_t phys, + unsigned int size, pgprot_t prot) +{ + unsigned int bl = (size >> 17) - 1; + int wimgxpp; + struct ppc_bat *bat = BATS[index]; + unsigned long flags = pgprot_val(prot); + + if (!cpu_has_feature(CPU_FTR_NEED_COHERENT)) + flags &= ~_PAGE_COHERENT; + + wimgxpp = (flags & _PAGE_COHERENT) | (_PAGE_EXEC ? BPP_RX : BPP_XX); + bat[0].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ + bat[0].batl = BAT_PHYS_ADDR(phys) | wimgxpp; + if (flags & _PAGE_USER) + bat[0].batu |= 1; /* Vp = 1 */ +} + +static void clearibat(int index) +{ + struct ppc_bat *bat = BATS[index]; + + bat[0].batu = 0; + bat[0].batl = 0; +} + unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { int idx; From 63b2bc619565ef7078e7b12fafb82f51867f002b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 19:08:49 +0000 Subject: [PATCH 149/200] powerpc/mm/32s: Use BATs for STRICT_KERNEL_RWX Today, STRICT_KERNEL_RWX is based on the use of regular pages to map kernel pages. On Book3s 32, it has three consequences: - Using pages instead of BAT for mapping kernel linear memory severely impacts performance. 
- Exec protection is not effective because no-execute cannot be set at page level (except on 603 which doesn't have hash tables) - Write protection is not effective because PP bits do not provide RO mode for kernel-only pages (except on 603 which handles it in software via PAGE_DIRTY) On the 603+, we have: - Independent IBAT and DBAT allowing limitation of exec parts. - NX bit can be set in segment registers to forbid execution on memory mapped by pages. - RO mode on DBATs even for kernel-only blocks. On the 601, there is nothing much we can do other than warn the user about it, because: - BATs are common to instructions and data. - BATs do not provide RO mode for kernel-only blocks. - segment registers don't have the NX bit. In order to use IBATs for exec protection, this patch: - Aligns _etext to BAT block sizes (128kb) - Sets NX bit in kernel segment registers (except on the vmalloc area when CONFIG_MODULES is selected) - Maps kernel text with IBATs. In order to use DBATs for write protection, this patch: - Aligns RW DATA to BAT block sizes (4M) - Maps kernel RO area with write prohibited DBATs - Maps remaining memory with remaining DBATs Here is what we get with this patch on a 832x when activating STRICT_KERNEL_RWX: Symbols: c0000000 T _stext c0680000 R __start_rodata c0680000 R _etext c0800000 T __init_begin c0800000 T _sinittext ~# cat /sys/kernel/debug/block_address_translation ---[ Instruction Block Address Translation ]--- 0: 0xc0000000-0xc03fffff 0x00000000 Kernel EXEC coherent 1: 0xc0400000-0xc05fffff 0x00400000 Kernel EXEC coherent 2: 0xc0600000-0xc067ffff 0x00600000 Kernel EXEC coherent 3: - 4: - 5: - 6: - 7: - ---[ Data Block Address Translation ]--- 0: 0xc0000000-0xc07fffff 0x00000000 Kernel RO coherent 1: 0xc0800000-0xc0ffffff 0x00800000 Kernel RW coherent 2: 0xc1000000-0xc1ffffff 0x01000000 Kernel RW coherent 3: 0xc2000000-0xc3ffffff 0x02000000 Kernel RW coherent 4: 0xc4000000-0xc7ffffff 0x04000000 Kernel RW coherent 5: 0xc8000000-0xcfffffff 0x08000000 Kernel RW coherent 6: 0xd0000000-0xdfffffff 0x10000000 Kernel RW coherent 7: - ~# cat /sys/kernel/debug/segment_registers ---[ User Segments ]--- 0x00000000-0x0fffffff Kern key 1 User key 1 VSID 0xa085d0 0x10000000-0x1fffffff Kern key 1 User key 1 VSID 0xa086e1 0x20000000-0x2fffffff Kern key 1 User key 1 VSID 0xa087f2 0x30000000-0x3fffffff Kern key 1 User key 1 VSID 0xa08903 0x40000000-0x4fffffff Kern key 1 User key 1 VSID 0xa08a14 0x50000000-0x5fffffff Kern key 1 User key 1 VSID 0xa08b25 0x60000000-0x6fffffff Kern key 1 User key 1 VSID 0xa08c36 0x70000000-0x7fffffff Kern key 1 User key 1 VSID 0xa08d47 0x80000000-0x8fffffff Kern key 1 User key 1 VSID 0xa08e58 0x90000000-0x9fffffff Kern key 1 User key 1 VSID 0xa08f69 0xa0000000-0xafffffff Kern key 1 User key 1 VSID 0xa0907a 0xb0000000-0xbfffffff Kern key 1 User key 1 VSID 0xa0918b ---[ Kernel Segments ]--- 0xc0000000-0xcfffffff Kern key 0 User key 1 No Exec VSID 0x000ccc 0xd0000000-0xdfffffff Kern key 0 User key 1 No Exec VSID 0x000ddd 0xe0000000-0xefffffff Kern key 0 User key 1 No Exec VSID 0x000eee 0xf0000000-0xffffffff Kern key 0 User key 1 No Exec VSID 0x000fff Aligning _etext to 128kb allows mapping up to 32Mb of text with 8 IBATs: 16Mb + 8Mb + 4Mb + 2Mb + 1Mb + 512kb + 256kb + 128kb (+ 128kb) = 32Mb (A 9th IBAT is unneeded as 32Mb would need only a single 32Mb block) Aligning data to 4M allows mapping up to 512Mb of data with 8 DBATs: 16Mb + 8Mb + 4Mb + 4Mb + 32Mb + 64Mb + 128Mb + 256Mb = 512Mb Because some processors only have 4 BATs and because some targets need DBATs for mapping
other areas, the following patch will allow to modify _etext and data alignment. Signed-off-by: Christophe Leroy Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 2 + arch/powerpc/include/asm/book3s/32/pgtable.h | 11 +++ arch/powerpc/mm/init_32.c | 4 +- arch/powerpc/mm/mmu_decl.h | 8 ++ arch/powerpc/mm/pgtable_32.c | 10 ++- arch/powerpc/mm/ppc_mmu_32.c | 87 ++++++++++++++++++-- 6 files changed, 112 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 14223363a33d..2544dae471e1 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -723,11 +723,13 @@ config THREAD_SHIFT config ETEXT_SHIFT int + default 17 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 default PPC_PAGE_SHIFT config DATA_SHIFT int default 24 if STRICT_KERNEL_RWX && PPC64 + default 22 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 default PPC_PAGE_SHIFT config FORCE_MAX_ZONEORDER diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 49d76adb9bc5..aa8406b8f7ba 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -174,7 +174,18 @@ static inline bool pte_user(pte_t pte) * of RAM. -- Cort */ #define VMALLOC_OFFSET (0x1000000) /* 16M */ + +/* + * With CONFIG_STRICT_KERNEL_RWX, kernel segments are set NX. But when modules + * are used, NX cannot be set on VMALLOC space. So vmalloc VM space and linear + * memory shall not share segments. + */ +#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_MODULES) +#define VMALLOC_START ((_ALIGN((long)high_memory, 256L << 20) + VMALLOC_OFFSET) & \ + ~(VMALLOC_OFFSET - 1)) +#else #define VMALLOC_START ((((long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))) +#endif #define VMALLOC_END ioremap_bot #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index ee5a430b9a18..bc28995a37ea 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -108,10 +108,8 @@ static void __init MMU_setup(void) __map_without_bats = 1; __map_without_ltlbs = 1; } - if (strict_kernel_rwx_enabled()) { - __map_without_bats = 1; + if (strict_kernel_rwx_enabled()) __map_without_ltlbs = 1; - } } /* diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 61730023dde3..98fc94affc29 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -165,3 +165,11 @@ unsigned long p_block_mapped(phys_addr_t pa); static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; } static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; } #endif + +#if defined(CONFIG_PPC_BOOK3S_32) +void mmu_mark_initmem_nx(void); +void mmu_mark_rodata_ro(void); +#else +static inline void mmu_mark_initmem_nx(void) { } +static inline void mmu_mark_rodata_ro(void) { } +#endif diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index a000768a5cc9..6e56a6240bfa 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -353,7 +353,10 @@ void mark_initmem_nx(void) unsigned long numpages = PFN_UP((unsigned long)_einittext) - PFN_DOWN((unsigned long)_sinittext); - change_page_attr(page, numpages, PAGE_KERNEL); + if (v_block_mapped((unsigned long)_stext) + 1) + mmu_mark_initmem_nx(); + else + change_page_attr(page, numpages, PAGE_KERNEL); } #ifdef CONFIG_STRICT_KERNEL_RWX @@ -362,6 +365,11 @@ void mark_rodata_ro(void) struct page *page; unsigned long numpages; + if (v_block_mapped((unsigned long)_sinittext)) { + 
mmu_mark_rodata_ro(); + return; + } + page = virt_to_page(_stext); numpages = PFN_UP((unsigned long)_etext) - PFN_DOWN((unsigned long)_stext); diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index 7b011280d076..2d5b0d50fb31 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "mmu_decl.h" @@ -138,15 +139,10 @@ static void clearibat(int index) bat[0].batl = 0; } -unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long top) { int idx; - if (__map_without_bats) { - printk(KERN_DEBUG "RAM mapped without BATs\n"); - return base; - } - while ((idx = find_free_bat()) != -1 && base != top) { unsigned int size = block_size(base, top); @@ -159,6 +155,85 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) return base; } +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + int done; + unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; + + if (__map_without_bats) { + pr_debug("RAM mapped without BATs\n"); + return base; + } + + if (!strict_kernel_rwx_enabled() || base >= border || top <= border) + return __mmu_mapin_ram(base, top); + + done = __mmu_mapin_ram(base, border); + if (done != border - base) + return done; + + return done + __mmu_mapin_ram(border, top); +} + +void mmu_mark_initmem_nx(void) +{ + int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + int i; + unsigned long base = (unsigned long)_stext - PAGE_OFFSET; + unsigned long top = (unsigned long)_etext - PAGE_OFFSET; + unsigned long size; + + if (cpu_has_feature(CPU_FTR_601)) + return; + + for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) { + size = block_size(base, top); + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); + base += size; + } + if (base < top) { + size = block_size(base, top); + size = max(size, 128UL << 10); + if ((top - base) > size) { + if (strict_kernel_rwx_enabled()) + pr_warn("Kernel _etext not properly aligned\n"); + size <<= 1; + } + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); + base += size; + } + for (; i < nb; i++) + clearibat(i); + + update_bats(); + + for (i = TASK_SIZE >> 28; i < 16; i++) { + /* Do not set NX on VM space for modules */ + if (IS_ENABLED(CONFIG_MODULES) && + (VMALLOC_START & 0xf0000000) == i << 28) + break; + mtsrin(mfsrin(i << 28) | 0x10000000, i << 28); + } +} + +void mmu_mark_rodata_ro(void) +{ + int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + int i; + + if (cpu_has_feature(CPU_FTR_601)) + return; + + for (i = 0; i < nb; i++) { + struct ppc_bat *bat = BATS[i]; + + if (bat_addrs[i].start < (unsigned long)__init_begin) + bat[1].batl = (bat[1].batl & ~BPP_RW) | BPP_RX; + } + + update_bats(); +} + /* * Set up one of the I/D BAT (block address translation) register pairs. * The parameters are not checked; in particular size must be a power
From 0f4a9041c7a77240fa1ff927775620b574151f34 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 19:08:50 +0000 Subject: [PATCH 150/200] powerpc/kconfig: make _etext and data areas alignment configurable on Book3s 32 Depending on the number of available BATs for mapping the different kernel areas, it may be necessary to increase the alignment of _etext and/or of the data areas. This patch allows the user to do it via Kconfig.
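For illustration, a hypothetical .config fragment for a Book3S 32 board using the options added below; the shift values are made-up examples within the allowed 17..28 range:

	CONFIG_ADVANCED_OPTIONS=y
	CONFIG_STRICT_KERNEL_RWX=y
	CONFIG_ETEXT_SHIFT_BOOL=y
	CONFIG_ETEXT_SHIFT=19
	CONFIG_DATA_SHIFT_BOOL=y
	CONFIG_DATA_SHIFT=24

A larger shift wastes more memory between sections but consumes fewer BATs; a smaller one does the opposite.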
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2544dae471e1..81df0dbc8a9a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -721,16 +721,44 @@ config THREAD_SHIFT Used to define the stack size. The default is almost always what you want. Only change this if you know what you are doing. +config ETEXT_SHIFT_BOOL + bool "Set custom etext alignment" if STRICT_KERNEL_RWX && PPC_BOOK3S_32 + depends on ADVANCED_OPTIONS + help + This option allows you to set the kernel end of text alignment. When + RAM is mapped by blocks, the alignment needs to fit the size and + number of possible blocks. The default should be OK for most configs. + + Say N here unless you know what you are doing. + config ETEXT_SHIFT - int + int "_etext shift" if ETEXT_SHIFT_BOOL + range 17 28 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 default 17 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 default PPC_PAGE_SHIFT + help + On Book3S 32 (603+), IBATs are used to map kernel text. + Smaller is the alignment, greater is the number of necessary IBATs. + +config DATA_SHIFT_BOOL + bool "Set custom data alignment" if STRICT_KERNEL_RWX && PPC_BOOK3S_32 + depends on ADVANCED_OPTIONS + help + This option allows you to set the kernel data alignment. When + RAM is mapped by blocks, the alignment needs to fit the size and + number of possible blocks. The default should be OK for most configs. + + Say N here unless you know what you are doing. config DATA_SHIFT - int + int "Data shift" if DATA_SHIFT_BOOL default 24 if STRICT_KERNEL_RWX && PPC64 + range 17 28 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 default 22 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 default PPC_PAGE_SHIFT + help + On Book3S 32 (603+), DBATs are used to map kernel text and rodata RO. + Smaller is the alignment, greater is the number of necessary DBATs. config FORCE_MAX_ZONEORDER int "Maximum zone order" From d5f17ee96447736a84bc44ffc4b0dddb1b519222 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 19:08:51 +0000 Subject: [PATCH 151/200] powerpc/8xx: don't disable large TLBs with CONFIG_STRICT_KERNEL_RWX This patch implements handling of STRICT_KERNEL_RWX with large TLBs directly in the TLB miss handlers. To do so, etext and sinittext are aligned on 512kB boundaries and the miss handlers use 512kB pages instead of 8Mb pages for addresses close to the boundaries. It sets RO PP flags for addresses under sinittext. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 2 + arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 3 +- arch/powerpc/kernel/head_8xx.S | 54 +++++++++++++++----- arch/powerpc/mm/8xx_mmu.c | 31 ++++++++++- arch/powerpc/mm/init_32.c | 2 +- arch/powerpc/mm/mmu_decl.h | 2 +- 6 files changed, 78 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 81df0dbc8a9a..43fa82e409bf 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -735,6 +735,7 @@ config ETEXT_SHIFT int "_etext shift" if ETEXT_SHIFT_BOOL range 17 28 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 default 17 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 + default 19 if STRICT_KERNEL_RWX && PPC_8xx default PPC_PAGE_SHIFT help On Book3S 32 (603+), IBATs are used to map kernel text. 
@@ -755,6 +756,7 @@ config DATA_SHIFT default 24 if STRICT_KERNEL_RWX && PPC64 range 17 28 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 default 22 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 + default 19 if STRICT_KERNEL_RWX && PPC_8xx default PPC_PAGE_SHIFT help On Book3S 32 (603+), DBATs are used to map kernel text and rodata RO. diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index b0f764c827c0..0a1a3fc54e54 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -231,9 +231,10 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) } /* patch sites */ -extern s32 patch__itlbmiss_linmem_top; +extern s32 patch__itlbmiss_linmem_top, patch__itlbmiss_linmem_top8; extern s32 patch__dtlbmiss_linmem_top, patch__dtlbmiss_immr_jmp; extern s32 patch__fixupdar_linmem_top; +extern s32 patch__dtlbmiss_romem_top, patch__dtlbmiss_romem_top8; extern s32 patch__itlbmiss_exit_1, patch__itlbmiss_exit_2; extern s32 patch__dtlbmiss_exit_1, patch__dtlbmiss_exit_2, patch__dtlbmiss_exit_3; diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 52c92913e39b..3f2d1afba2d1 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -292,6 +292,17 @@ SystemCall: */ EXCEPTION(0x1000, SoftEmu, program_check_exception, EXC_XFER_STD) +/* Called from DataStoreTLBMiss when perf TLB misses events are activated */ +#ifdef CONFIG_PERF_EVENTS + patch_site 0f, patch__dtlbmiss_perf +0: lwz r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) + addi r10, r10, 1 + stw r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) + mfspr r10, SPRN_SPRG_SCRATCH0 + mfspr r11, SPRN_SPRG_SCRATCH1 + rfi +#endif + . = 0x1100 /* * For the MPC8xx, this is a software tablewalk to load the instruction @@ -405,10 +416,20 @@ InstructionTLBMiss: #ifndef CONFIG_PIN_TLB_TEXT ITLBMissLinear: mtcr r11 +#ifdef CONFIG_STRICT_KERNEL_RWX + patch_site 0f, patch__itlbmiss_linmem_top8 + + mfspr r10, SPRN_SRR0 +0: subis r11, r10, (PAGE_OFFSET - 0x80000000)@ha + rlwinm r11, r11, 4, MI_PS8MEG ^ MI_PS512K + ori r11, r11, MI_PS512K | MI_SVALID + rlwinm r10, r10, 0, 0x0ff80000 /* 8xx supports max 256Mb RAM */ +#else /* Set 8M byte page and mark it valid */ li r11, MI_PS8MEG | MI_SVALID - mtspr SPRN_MI_TWC, r11 rlwinm r10, r10, 20, 0x0f800000 /* 8xx supports max 256Mb RAM */ +#endif + mtspr SPRN_MI_TWC, r11 ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ _PAGE_PRESENT mtspr SPRN_MI_RPN, r10 /* Update TLB entry */ @@ -494,16 +515,6 @@ DataStoreTLBMiss: rfi patch_site 0b, patch__dtlbmiss_exit_1 -#ifdef CONFIG_PERF_EVENTS - patch_site 0f, patch__dtlbmiss_perf -0: lwz r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) - addi r10, r10, 1 - stw r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) - mfspr r10, SPRN_SPRG_SCRATCH0 - mfspr r11, SPRN_SPRG_SCRATCH1 - rfi -#endif - DTLBMissIMMR: mtcr r11 /* Set 512k byte guarded page and mark it valid */ @@ -525,10 +536,29 @@ DTLBMissIMMR: DTLBMissLinear: mtcr r11 + rlwinm r10, r10, 20, 0x0f800000 /* 8xx supports max 256Mb RAM */ +#ifdef CONFIG_STRICT_KERNEL_RWX + patch_site 0f, patch__dtlbmiss_romem_top8 + +0: subis r11, r10, (PAGE_OFFSET - 0x80000000)@ha + rlwinm r11, r11, 0, 0xff800000 + neg r10, r11 + or r11, r11, r10 + rlwinm r11, r11, 4, MI_PS8MEG ^ MI_PS512K + ori r11, r11, MI_PS512K | MI_SVALID + mfspr r10, SPRN_MD_EPN + rlwinm r10, r10, 0, 0x0ff80000 /* 8xx supports max 256Mb RAM */ +#else /* Set 8M byte page and mark it valid */ li r11, MD_PS8MEG | MD_SVALID +#endif mtspr 
SPRN_MD_TWC, r11 - rlwinm r10, r10, 20, 0x0f800000 /* 8xx supports max 256Mb RAM */ +#ifdef CONFIG_STRICT_KERNEL_RWX + patch_site 0f, patch__dtlbmiss_romem_top + +0: subis r11, r10, 0 + rlwimi r10, r11, 11, _PAGE_RO +#endif ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ _PAGE_PRESENT mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index ce11cbaa25d8..fe1f6443d57f 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -94,11 +94,20 @@ static void __init mmu_mapin_immr(void) map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG); } -static void __init mmu_patch_cmp_limit(s32 *site, unsigned long mapped) +static void mmu_patch_cmp_limit(s32 *site, unsigned long mapped) { modify_instruction_site(site, 0xffff, (unsigned long)__va(mapped) >> 16); } +static void mmu_patch_addis(s32 *site, long simm) +{ + unsigned int instr = *(unsigned int *)patch_site_addr(site); + + instr &= 0xffff0000; + instr |= ((unsigned long)simm) >> 16; + patch_instruction_site(site, instr); +} + unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long mapped; @@ -135,6 +144,26 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) return mapped; } +void mmu_mark_initmem_nx(void) +{ + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23) + mmu_patch_addis(&patch__itlbmiss_linmem_top8, + -((long)_etext & ~(LARGE_PAGE_SIZE_8M - 1))); + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, __pa(_etext)); +} + +#ifdef CONFIG_STRICT_KERNEL_RWX +void mmu_mark_rodata_ro(void) +{ + if (CONFIG_DATA_SHIFT < 23) + mmu_patch_addis(&patch__dtlbmiss_romem_top8, + -__pa(((unsigned long)_sinittext) & + ~(LARGE_PAGE_SIZE_8M - 1))); + mmu_patch_addis(&patch__dtlbmiss_romem_top, -__pa(_sinittext)); +} +#endif + void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index bc28995a37ea..41a3513cadc9 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -108,7 +108,7 @@ static void __init MMU_setup(void) __map_without_bats = 1; __map_without_ltlbs = 1; } - if (strict_kernel_rwx_enabled()) + if (strict_kernel_rwx_enabled() && !IS_ENABLED(CONFIG_PPC_8xx)) __map_without_ltlbs = 1; } diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 98fc94affc29..74ff61dabcb1 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -166,7 +166,7 @@ static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; } static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; } #endif -#if defined(CONFIG_PPC_BOOK3S_32) +#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) void mmu_mark_initmem_nx(void); void mmu_mark_rodata_ro(void); #else
From 8f54a6f7406ee3b7b528c3ff569f6a51295b2608 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Feb 2019 19:08:52 +0000 Subject: [PATCH 152/200] powerpc/kconfig: make _etext and data areas alignment configurable on 8xx On 8xx, large pages (512kb or 8M) are used to map kernel linear memory. Aligning to 8M reduces TLB misses as only 8M pages are used in that case. We make 8M the default for data. This patch allows the user to do it via Kconfig.
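For reference, the shift is a power-of-two alignment: 2^19 = 512k and 2^23 = 8M, so the new 8xx defaults in the diff below (CONFIG_ETEXT_SHIFT=19, CONFIG_DATA_SHIFT=23) give a 512k-aligned _etext and 8M-aligned data.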
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 18 +++++++++++++++--- arch/powerpc/kernel/head_8xx.S | 4 ++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 43fa82e409bf..ad8e22827de0 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -722,7 +722,8 @@ config THREAD_SHIFT want. Only change this if you know what you are doing. config ETEXT_SHIFT_BOOL - bool "Set custom etext alignment" if STRICT_KERNEL_RWX && PPC_BOOK3S_32 + bool "Set custom etext alignment" if STRICT_KERNEL_RWX && \ + (PPC_BOOK3S_32 || PPC_8xx) depends on ADVANCED_OPTIONS help This option allows you to set the kernel end of text alignment. When @@ -734,6 +735,7 @@ config ETEXT_SHIFT_BOOL config ETEXT_SHIFT int "_etext shift" if ETEXT_SHIFT_BOOL range 17 28 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 + range 19 23 if STRICT_KERNEL_RWX && PPC_8xx default 17 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 default 19 if STRICT_KERNEL_RWX && PPC_8xx default PPC_PAGE_SHIFT @@ -741,8 +743,13 @@ config ETEXT_SHIFT On Book3S 32 (603+), IBATs are used to map kernel text. Smaller is the alignment, greater is the number of necessary IBATs. + On 8xx, large pages (512kb or 8M) are used to map kernel linear + memory. Aligning to 8M reduces TLB misses as only 8M pages are used + in that case. + config DATA_SHIFT_BOOL - bool "Set custom data alignment" if STRICT_KERNEL_RWX && PPC_BOOK3S_32 + bool "Set custom data alignment" if STRICT_KERNEL_RWX && \ + (PPC_BOOK3S_32 || PPC_8xx) depends on ADVANCED_OPTIONS help This option allows you to set the kernel data alignment. When @@ -755,13 +762,18 @@ config DATA_SHIFT int "Data shift" if DATA_SHIFT_BOOL default 24 if STRICT_KERNEL_RWX && PPC64 range 17 28 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 + range 19 23 if STRICT_KERNEL_RWX && PPC_8xx default 22 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 - default 19 if STRICT_KERNEL_RWX && PPC_8xx + default 23 if STRICT_KERNEL_RWX && PPC_8xx default PPC_PAGE_SHIFT help On Book3S 32 (603+), DBATs are used to map kernel text and rodata RO. Smaller is the alignment, greater is the number of necessary DBATs. + On 8xx, large pages (512kb or 8M) are used to map kernel linear + memory. Aligning to 8M reduces TLB misses as only 8M pages are used + in that case. + config FORCE_MAX_ZONEORDER int "Maximum zone order" range 8 9 if PPC64 && PPC_64K_PAGES diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 3f2d1afba2d1..50303d25cbc1 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -416,7 +416,7 @@ InstructionTLBMiss: #ifndef CONFIG_PIN_TLB_TEXT ITLBMissLinear: mtcr r11 -#ifdef CONFIG_STRICT_KERNEL_RWX +#if defined(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23 patch_site 0f, patch__itlbmiss_linmem_top8 mfspr r10, SPRN_SRR0 @@ -537,7 +537,7 @@ DTLBMissIMMR: DTLBMissLinear: mtcr r11 rlwinm r10, r10, 20, 0x0f800000 /* 8xx supports max 256Mb RAM */ -#ifdef CONFIG_STRICT_KERNEL_RWX +#if defined(CONFIG_STRICT_KERNEL_RWX) && CONFIG_DATA_SHIFT < 23 patch_site 0f, patch__dtlbmiss_romem_top8 0: subis r11, r10, (PAGE_OFFSET - 0x80000000)@ha From fb0b0a73b223fc113e961b1d921322844e9c30d9 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Fri, 22 Feb 2019 11:40:46 +1100 Subject: [PATCH 153/200] powerpc: Enable kcov kcov provides kernel coverage data that's useful for fuzzing tools like syzkaller. Wire up kcov support on powerpc. 
Disable kcov instrumentation on the same files where we currently disable gcov and UBSan instrumentation, plus some additional exclusions which appear necessary to boot on book3e machines. Signed-off-by: Andrew Donnellan Acked-by: Dmitry Vyukov Tested-by: Daniel Axtens # e6500 Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/Makefile | 12 +++++++++++- arch/powerpc/kernel/trace/Makefile | 3 ++- arch/powerpc/kernel/vdso32/Makefile | 1 + arch/powerpc/kernel/vdso64/Makefile | 1 + arch/powerpc/mm/Makefile | 5 +++++ arch/powerpc/xmon/Makefile | 1 + 7 files changed, 22 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index ad8e22827de0..5b7945a7bd41 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -130,6 +130,7 @@ config PPC select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_KCOV select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API if PPC64 select ARCH_HAS_PTE_SPECIAL diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 8809e287b80d..cddadccf551d 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -143,19 +143,29 @@ endif obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o -# Disable GCOV & sanitizers in odd or sensitive code +# Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_prom_init.o := n +KCOV_INSTRUMENT_prom_init.o := n UBSAN_SANITIZE_prom_init.o := n GCOV_PROFILE_machine_kexec_64.o := n +KCOV_INSTRUMENT_machine_kexec_64.o := n UBSAN_SANITIZE_machine_kexec_64.o := n GCOV_PROFILE_machine_kexec_32.o := n +KCOV_INSTRUMENT_machine_kexec_32.o := n UBSAN_SANITIZE_machine_kexec_32.o := n GCOV_PROFILE_kprobes.o := n +KCOV_INSTRUMENT_kprobes.o := n UBSAN_SANITIZE_kprobes.o := n GCOV_PROFILE_kprobes-ftrace.o := n +KCOV_INSTRUMENT_kprobes-ftrace.o := n UBSAN_SANITIZE_kprobes-ftrace.o := n UBSAN_SANITIZE_vdso.o := n +# Necessary for booting with kcov enabled on book3e machines +KCOV_INSTRUMENT_cputable.o := n +KCOV_INSTRUMENT_setup_64.o := n +KCOV_INSTRUMENT_paca.o := n + extra-$(CONFIG_PPC_FPU) += fpu.o extra-$(CONFIG_ALTIVEC) += vector.o extra-$(CONFIG_PPC64) += entry_64.o diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile index b1725ad3e13d..858503775c58 100644 --- a/arch/powerpc/kernel/trace/Makefile +++ b/arch/powerpc/kernel/trace/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_PPC64) += $(obj64-y) obj-$(CONFIG_PPC32) += $(obj32-y) -# Disable GCOV & sanitizers in odd or sensitive code +# Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_ftrace.o := n +KCOV_INSTRUMENT_ftrace.o := n UBSAN_SANITIZE_ftrace.o := n diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile index 50112d4473bb..ce199f6e4256 100644 --- a/arch/powerpc/kernel/vdso32/Makefile +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -23,6 +23,7 @@ targets := $(obj-vdso32) vdso32.so vdso32.so.dbg obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) GCOV_PROFILE := n +KCOV_INSTRUMENT := n UBSAN_SANITIZE := n ccflags-y := -shared -fno-common -fno-builtin diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile index 69cecb346269..28e7d112aa2f 100644 --- a/arch/powerpc/kernel/vdso64/Makefile +++ b/arch/powerpc/kernel/vdso64/Makefile @@ -9,6 +9,7 @@ targets := $(obj-vdso64) vdso64.so vdso64.so.dbg obj-vdso64 := $(addprefix $(obj)/, 
$(obj-vdso64)) GCOV_PROFILE := n +KCOV_INSTRUMENT := n UBSAN_SANITIZE := n ccflags-y := -shared -fno-common -fno-builtin diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index ee1efa3b3382..d52ec118e09d 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -47,3 +47,8 @@ obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o obj-$(CONFIG_PPC_PTDUMP) += ptdump/ obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o + +# Disable kcov instrumentation on sensitive code +# This is necessary for booting with kcov enabled on book3e machines +KCOV_INSTRUMENT_tlb_nohash.o := n +KCOV_INSTRUMENT_fsl_booke_mmu.o := n diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 878f9c1d3615..3050f9323254 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -5,6 +5,7 @@ subdir-ccflags-y := $(call cc-disable-warning, builtin-requires-header) GCOV_PROFILE := n +KCOV_INSTRUMENT := n UBSAN_SANITIZE := n # Disable ftrace for the entire directory From e7fda7e569e1776d4dccbcef52d34882b62b0654 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 29 Nov 2018 17:42:24 +1100 Subject: [PATCH 154/200] powerpc/64s: Remove MSR_RI optimisation in system_call_exit() Currently in system_call_exit() we have an optimisation where we disable MSR_RI (recoverable interrupt) and MSR_EE (external interrupt enable) in a single mtmsrd instruction. Unfortunately this will no longer work with THREAD_INFO_IN_TASK, because then the load of TI_FLAGS might fault, and faulting with MSR_RI clear is treated as an unrecoverable exception, which leads to a panic(). So change the code to only clear MSR_EE prior to loading TI_FLAGS, leaving the clear of MSR_RI until later. We have some latitude in where we do the clear of MSR_RI. A bit of experimentation has shown that this location gives the least slow down. This still causes a noticeable slow down in our null_syscall performance. On a Power9 DD2.2: before 955 cycles, after 999 cycles, delta -44 cycles (-4.6%). On the plus side this does simplify the code somewhat, because we don't have to reenable MSR_RI on the restore_math() or syscall_exit_work() paths, which the optimisation previously necessitated. Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index a2c168b395d2..c17c1bed6148 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -236,18 +236,14 @@ system_call_exit: /* * Disable interrupts so current_thread_info()->flags can't change, * and so that we don't get interrupted after loading SRR0/1. + * + * Leave MSR_RI enabled for now, because with THREAD_INFO_IN_TASK we + * could fault on the load of the TI_FLAGS below. */ #ifdef CONFIG_PPC_BOOK3E wrteei 0 #else - /* - * For performance reasons we clear RI the same time that we - * clear EE. We only need to clear RI just before we restore r13 - * below, but batching it with EE saves us one expensive mtmsrd call. - * We have to be careful to restore RI if we branch anywhere from - * here (eg syscall_exit_work).
- */ - li r11,0 + li r11,MSR_RI mtmsrd r11,1 #endif /* CONFIG_PPC_BOOK3E */ @@ -263,15 +259,7 @@ system_call_exit: bne 3f #endif 2: addi r3,r1,STACK_FRAME_OVERHEAD -#ifdef CONFIG_PPC_BOOK3S - li r10,MSR_RI - mtmsrd r10,1 /* Restore RI */ -#endif bl restore_math -#ifdef CONFIG_PPC_BOOK3S - li r11,0 - mtmsrd r11,1 -#endif ld r8,_MSR(r1) ld r3,RESULT(r1) li r11,-MAX_ERRNO @@ -287,6 +275,16 @@ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) andi. r6,r8,MSR_PR ld r4,_LINK(r1) +#ifdef CONFIG_PPC_BOOK3S + /* + * Clear MSR_RI, MSR_EE is already and remains disabled. We could do + * this later, but testing shows that doing it here causes less slow + * down than doing it closer to the rfid. + */ + li r11,0 + mtmsrd r11,1 +#endif + beq- 1f ACCOUNT_CPU_USER_EXIT(r13, r11, r12) @@ -363,10 +361,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) b .Lsyscall_exit .Lsyscall_exit_work: -#ifdef CONFIG_PPC_BOOK3S - li r10,MSR_RI - mtmsrd r10,1 /* Restore RI */ -#endif /* If TIF_RESTOREALL is set, don't scribble on either r3 or ccr. If TIF_NOERROR is set, just save r3 as it is. */ From eafd825ed7106ac1ca84d20d8b3b86826f164df9 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 22 Feb 2019 23:58:21 +1100 Subject: [PATCH 155/200] powerpc/64: Simplify __secondary_start paca->kstack handling In __secondary_start() we load the thread_info of the idle task of the secondary CPU from current_set[cpu], and then convert it into a stack pointer before storing that back to paca->kstack. As pointed out in commit f761622e5943 ("powerpc: Initialise paca->kstack before early_setup_secondary") it's important that we initialise paca->kstack before calling the MMU setup code, in particular slb_initialize(), because it will bolt the SLB entry for the kstack into the SLB. However we have already setup paca->kstack in cpu_idle_thread_init(), since commit 3b5750644b2f ("[POWERPC] Bolt in SLB entry for kernel stack on secondary cpus") (May 2008). It's also in cpu_idle_thread_init() that we initialise current_set[cpu] with the thread_info pointer, so there is no issue of the timing being different between the two. Therefore the initialisation of paca->kstack in __setup_secondary() is completely redundant, so remove it. This has the added benefit of removing code that runs in real mode, and is therefore restricted by the RMO, and so opens the way for us to enable THREAD_INFO_IN_TASK. Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_64.S | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 4898e9491a1c..3fad8d499767 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -801,21 +801,19 @@ __secondary_start: /* Set thread priority to MEDIUM */ HMT_MEDIUM - /* Initialize the kernel stack */ - LOAD_REG_ADDR(r3, current_set) - sldi r28,r24,3 /* get current_set[cpu#] */ - ldx r14,r3,r28 - addi r14,r14,THREAD_SIZE-STACK_FRAME_OVERHEAD - std r14,PACAKSAVE(r13) - - /* Do early setup for that CPU (SLB and hash table pointer) */ + /* + * Do early setup for this CPU, in particular initialising the MMU so we + * can turn it on below. This is a call to C, which is OK, we're still + * running on the emergency stack. + */ bl early_setup_secondary /* - * setup the new stack pointer, but *don't* use this until - * translation is on. + * The primary has initialized our kernel stack for us in the paca, grab + * it and put it in r1. 
We must *not* use it until we turn on the MMU + * below, because it may not be inside the RMO. */ - mr r1, r14 + ld r1, PACAKSAVE(r13) /* Clear backchain so we get nice backtraces */ li r7,0 From c8e409a33cf8df5060064a70df3e1350841371e1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 31 Jan 2019 10:08:44 +0000 Subject: [PATCH 156/200] powerpc/irq: use memblock functions returning virtual address Since only the virtual address of allocated blocks is used, let's use the functions that return the virtual address directly. Those functions have the advantage of also zeroing the block. Suggested-by: Mike Rapoport Signed-off-by: Christophe Leroy Acked-by: Mike Rapoport Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/irq.c | 5 ----- arch/powerpc/kernel/setup_32.c | 26 ++++++++++++++++---------- arch/powerpc/kernel/setup_64.c | 19 +++++++------------ 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index bb299613a462..4a5dd8800946 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -725,18 +725,15 @@ void exc_lvl_ctx_init(void) #endif #endif - memset((void *)critirq_ctx[cpu_nr], 0, THREAD_SIZE); tp = critirq_ctx[cpu_nr]; tp->cpu = cpu_nr; tp->preempt_count = 0; #ifdef CONFIG_BOOKE - memset((void *)dbgirq_ctx[cpu_nr], 0, THREAD_SIZE); tp = dbgirq_ctx[cpu_nr]; tp->cpu = cpu_nr; tp->preempt_count = 0; - memset((void *)mcheckirq_ctx[cpu_nr], 0, THREAD_SIZE); tp = mcheckirq_ctx[cpu_nr]; tp->cpu = cpu_nr; tp->preempt_count = HARDIRQ_OFFSET; @@ -754,12 +751,10 @@ void irq_ctx_init(void) int i; for_each_possible_cpu(i) { - memset((void *)softirq_ctx[i], 0, THREAD_SIZE); tp = softirq_ctx[i]; tp->cpu = i; klp_init_thread_info(tp); - memset((void *)hardirq_ctx[i], 0, THREAD_SIZE); tp = hardirq_ctx[i]; tp->cpu = i; klp_init_thread_info(tp); diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 947f904688b0..1f0b7629c1a6 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -196,6 +196,17 @@ static int __init ppc_init(void) } arch_initcall(ppc_init); +static void *__init alloc_stack(void) +{ + void *ptr = memblock_alloc(THREAD_SIZE, THREAD_SIZE); + + if (!ptr) + panic("cannot allocate %d bytes for stack at %pS\n", + THREAD_SIZE, (void *)_RET_IP_); + + return ptr; +} + void __init irqstack_early_init(void) { unsigned int i; @@ -203,10 +214,8 @@ void __init irqstack_early_init(void) /* interrupt stacks must be in lowmem, we get that for free on ppc32 * as the memblock is limited to lowmem by default */ for_each_possible_cpu(i) { - softirq_ctx[i] = (struct thread_info *) - __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); - hardirq_ctx[i] = (struct thread_info *) - __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); + softirq_ctx[i] = alloc_stack(); + hardirq_ctx[i] = alloc_stack(); } } @@ -224,13 +233,10 @@ void __init exc_lvl_early_init(void) hw_cpu = 0; #endif - critirq_ctx[hw_cpu] = (struct thread_info *) - __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); + critirq_ctx[hw_cpu] = alloc_stack(); #ifdef CONFIG_BOOKE - dbgirq_ctx[hw_cpu] = (struct thread_info *) - __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); - mcheckirq_ctx[hw_cpu] = (struct thread_info *) - __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); + dbgirq_ctx[hw_cpu] = alloc_stack(); + mcheckirq_ctx[hw_cpu] = alloc_stack(); #endif } } diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 236c1151a3a7..080dd515d587 100644
--- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -634,19 +634,17 @@ __init u64 ppc64_bolted_size(void) static void *__init alloc_stack(unsigned long limit, int cpu) { - unsigned long pa; + void *ptr; BUILD_BUG_ON(STACK_INT_FRAME_SIZE % 16); - pa = memblock_alloc_base_nid(THREAD_SIZE, THREAD_SIZE, limit, - early_cpu_to_node(cpu), MEMBLOCK_NONE); - if (!pa) { - pa = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit); - if (!pa) - panic("cannot allocate stacks"); - } + ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_SIZE, + MEMBLOCK_LOW_LIMIT, limit, + early_cpu_to_node(cpu)); + if (!ptr) + panic("cannot allocate stacks"); - return __va(pa); + return ptr; } void __init irqstack_early_init(void) @@ -739,20 +737,17 @@ void __init emergency_stack_init(void) struct thread_info *ti; ti = alloc_stack(limit, i); - memset(ti, 0, THREAD_SIZE); emerg_stack_init_thread_info(ti, i); paca_ptrs[i]->emergency_sp = (void *)ti + THREAD_SIZE; #ifdef CONFIG_PPC_BOOK3S_64 /* emergency stack for NMI exception handling. */ ti = alloc_stack(limit, i); - memset(ti, 0, THREAD_SIZE); emerg_stack_init_thread_info(ti, i); paca_ptrs[i]->nmi_emergency_sp = (void *)ti + THREAD_SIZE; /* emergency stack for machine check exception handling. */ ti = alloc_stack(limit, i); - memset(ti, 0, THREAD_SIZE); emerg_stack_init_thread_info(ti, i); paca_ptrs[i]->mc_emergency_sp = (void *)ti + THREAD_SIZE; #endif From 92ab45c5f2db0caa68243be8cfa5e390a1de8c3a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 31 Jan 2019 10:08:48 +0000 Subject: [PATCH 157/200] powerpc: Avoid circular header inclusion in mmu-hash.h When activating CONFIG_THREAD_INFO_IN_TASK, linux/sched.h includes asm/current.h. This generates a circular dependency. To avoid that, asm/processor.h shall not be included in mmu-hash.h. In order to do that, this patch moves into a new header called asm/task_size_64/32.h all the TASK_SIZE related constants, which can then be included in mmu-hash.h directly. 
Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin [mpe: Split out all the TASK_SIZE constants not just 64-bit ones] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 2 +- arch/powerpc/include/asm/processor.h | 102 ++---------------- arch/powerpc/include/asm/task_size_32.h | 21 ++++ arch/powerpc/include/asm/task_size_64.h | 79 ++++++++++++++ arch/powerpc/kvm/book3s_hv_hmi.c | 1 + 5 files changed, 108 insertions(+), 97 deletions(-) create mode 100644 arch/powerpc/include/asm/task_size_32.h create mode 100644 arch/powerpc/include/asm/task_size_64.h diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 12e522807f9f..a28a28079edb 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -23,7 +23,7 @@ */ #include #include -#include +#include #include /* diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index e8682122ea3d..2edab34ee288 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -77,106 +77,16 @@ extern int _chrp_type; #ifdef __KERNEL__ +#ifdef CONFIG_PPC64 +#include +#else +#include +#endif + struct task_struct; void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp); void release_thread(struct task_struct *); -#ifdef CONFIG_PPC32 - -#if CONFIG_TASK_SIZE > CONFIG_KERNEL_START -#error User TASK_SIZE overlaps with KERNEL_START address -#endif -#define TASK_SIZE (CONFIG_TASK_SIZE) - -/* This decides where the kernel will search for a free chunk of vm - * space during mmap's. - */ -#define TASK_UNMAPPED_BASE (TASK_SIZE / 8 * 3) -#endif - -#ifdef CONFIG_PPC64 -/* - * 64-bit user address space can have multiple limits - * For now supported values are: - */ -#define TASK_SIZE_64TB (0x0000400000000000UL) -#define TASK_SIZE_128TB (0x0000800000000000UL) -#define TASK_SIZE_512TB (0x0002000000000000UL) -#define TASK_SIZE_1PB (0x0004000000000000UL) -#define TASK_SIZE_2PB (0x0008000000000000UL) -/* - * With 52 bits in the address we can support - * upto 4PB of range. - */ -#define TASK_SIZE_4PB (0x0010000000000000UL) - -/* - * For now 512TB is only supported with book3s and 64K linux page size. - */ -#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_64K_PAGES) -/* - * Max value currently used: - */ -#define TASK_SIZE_USER64 TASK_SIZE_4PB -#define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_128TB -#define TASK_CONTEXT_SIZE TASK_SIZE_512TB -#else -#define TASK_SIZE_USER64 TASK_SIZE_64TB -#define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_64TB -/* - * We don't need to allocate extended context ids for 4K page size, because - * we limit the max effective address on this config to 64TB. - */ -#define TASK_CONTEXT_SIZE TASK_SIZE_64TB -#endif - -/* - * 32-bit user address space is 4GB - 1 page - * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT - */ -#define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE)) - -#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \ - TASK_SIZE_USER32 : TASK_SIZE_USER64) -#define TASK_SIZE TASK_SIZE_OF(current) -/* This decides where the kernel will search for a free chunk of vm - * space during mmap's. - */ -#define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4)) -#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(DEFAULT_MAP_WINDOW_USER64 / 4)) - -#define TASK_UNMAPPED_BASE ((is_32bit_task()) ? 
\ - TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64 ) -#endif - -/* - * Initial task size value for user applications. For book3s 64 we start - * with 128TB and conditionally enable upto 512TB - */ -#ifdef CONFIG_PPC_BOOK3S_64 -#define DEFAULT_MAP_WINDOW ((is_32bit_task()) ? \ - TASK_SIZE_USER32 : DEFAULT_MAP_WINDOW_USER64) -#else -#define DEFAULT_MAP_WINDOW TASK_SIZE -#endif - -#ifdef __powerpc64__ - -#define STACK_TOP_USER64 DEFAULT_MAP_WINDOW_USER64 -#define STACK_TOP_USER32 TASK_SIZE_USER32 - -#define STACK_TOP (is_32bit_task() ? \ - STACK_TOP_USER32 : STACK_TOP_USER64) - -#define STACK_TOP_MAX TASK_SIZE_USER64 - -#else /* __powerpc64__ */ - -#define STACK_TOP TASK_SIZE -#define STACK_TOP_MAX STACK_TOP - -#endif /* __powerpc64__ */ - typedef struct { unsigned long seg; } mm_segment_t; diff --git a/arch/powerpc/include/asm/task_size_32.h b/arch/powerpc/include/asm/task_size_32.h new file mode 100644 index 000000000000..de7290ee770f --- /dev/null +++ b/arch/powerpc/include/asm/task_size_32.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_TASK_SIZE_32_H +#define _ASM_POWERPC_TASK_SIZE_32_H + +#if CONFIG_TASK_SIZE > CONFIG_KERNEL_START +#error User TASK_SIZE overlaps with KERNEL_START address +#endif + +#define TASK_SIZE (CONFIG_TASK_SIZE) + +/* + * This decides where the kernel will search for a free chunk of vm space during + * mmap's. + */ +#define TASK_UNMAPPED_BASE (TASK_SIZE / 8 * 3) + +#define DEFAULT_MAP_WINDOW TASK_SIZE +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX STACK_TOP + +#endif /* _ASM_POWERPC_TASK_SIZE_32_H */ diff --git a/arch/powerpc/include/asm/task_size_64.h b/arch/powerpc/include/asm/task_size_64.h new file mode 100644 index 000000000000..eab4779f6b84 --- /dev/null +++ b/arch/powerpc/include/asm/task_size_64.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_TASK_SIZE_64_H +#define _ASM_POWERPC_TASK_SIZE_64_H + +/* + * 64-bit user address space can have multiple limits + * For now supported values are: + */ +#define TASK_SIZE_64TB (0x0000400000000000UL) +#define TASK_SIZE_128TB (0x0000800000000000UL) +#define TASK_SIZE_512TB (0x0002000000000000UL) +#define TASK_SIZE_1PB (0x0004000000000000UL) +#define TASK_SIZE_2PB (0x0008000000000000UL) + +/* + * With 52 bits in the address we can support up to 4PB of range. + */ +#define TASK_SIZE_4PB (0x0010000000000000UL) + +/* + * For now 512TB is only supported with book3s and 64K linux page size. + */ +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_64K_PAGES) +/* + * Max value currently used: + */ +#define TASK_SIZE_USER64 TASK_SIZE_4PB +#define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_128TB +#define TASK_CONTEXT_SIZE TASK_SIZE_512TB +#else +#define TASK_SIZE_USER64 TASK_SIZE_64TB +#define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_64TB + +/* + * We don't need to allocate extended context ids for 4K page size, because we + * limit the max effective address on this config to 64TB. + */ +#define TASK_CONTEXT_SIZE TASK_SIZE_64TB +#endif + +/* + * 32-bit user address space is 4GB - 1 page + * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT + */ +#define TASK_SIZE_USER32 (0x0000000100000000UL - (1 * PAGE_SIZE)) + +#define TASK_SIZE_OF(tsk) \ + (test_tsk_thread_flag(tsk, TIF_32BIT) ? 
TASK_SIZE_USER32 : \ + TASK_SIZE_USER64) + +#define TASK_SIZE TASK_SIZE_OF(current) + +#define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4)) +#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(DEFAULT_MAP_WINDOW_USER64 / 4)) + +/* + * This decides where the kernel will search for a free chunk of vm space during + * mmap's. + */ +#define TASK_UNMAPPED_BASE \ + ((is_32bit_task()) ? TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64) + +/* + * Initial task size value for user applications. For book3s 64 we start + * with 128TB and conditionally enable upto 512TB + */ +#ifdef CONFIG_PPC_BOOK3S_64 +#define DEFAULT_MAP_WINDOW \ + ((is_32bit_task()) ? TASK_SIZE_USER32 : DEFAULT_MAP_WINDOW_USER64) +#else +#define DEFAULT_MAP_WINDOW TASK_SIZE +#endif + +#define STACK_TOP_USER64 DEFAULT_MAP_WINDOW_USER64 +#define STACK_TOP_USER32 TASK_SIZE_USER32 +#define STACK_TOP_MAX TASK_SIZE_USER64 +#define STACK_TOP (is_32bit_task() ? STACK_TOP_USER32 : STACK_TOP_USER64) + +#endif /* _ASM_POWERPC_TASK_SIZE_64_H */ diff --git a/arch/powerpc/kvm/book3s_hv_hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c index e3f738eb1cac..64b5011475c7 100644 --- a/arch/powerpc/kvm/book3s_hv_hmi.c +++ b/arch/powerpc/kvm/book3s_hv_hmi.c @@ -24,6 +24,7 @@ #include #include #include +#include void wait_for_subcore_guest_exit(void) { From 054860897cd35a4e9cec953ae955b429e31e74f7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 31 Jan 2019 10:08:50 +0000 Subject: [PATCH 158/200] powerpc: Only use task_struct 'cpu' field on SMP When moving to CONFIG_THREAD_INFO_IN_TASK, the thread_info 'cpu' field gets moved into task_struct and is only defined when CONFIG_SMP is set. This patch ensures that TI_CPU is only used when CONFIG_SMP is set and that the task_struct 'cpu' field is not used directly outside of SMP code.
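For reference, the generic accessor the xmon hunk below switches to looks roughly like this (paraphrased from include/linux/sched.h; the !SMP variant simply returns 0):

	static inline unsigned int task_cpu(const struct task_struct *p)
	{
	#ifdef CONFIG_THREAD_INFO_IN_TASK
		return p->cpu;			/* field lives in task_struct */
	#else
		return task_thread_info(p)->cpu;	/* field lives on the stack */
	#endif
	}

so callers stay correct on either side of the migration.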
Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_fsl_booke.S | 2 ++ arch/powerpc/kernel/misc_32.S | 4 ++++ arch/powerpc/xmon/xmon.c | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 2386ce2a9c6e..2c21e8642a00 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -243,8 +243,10 @@ set_ivor: li r0,0 stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) +#ifdef CONFIG_SMP CURRENT_THREAD_INFO(r22, r1) stw r24, TI_CPU(r22) +#endif bl early_init diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 57d2ffb2d45c..02b8cdd73792 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -183,10 +183,14 @@ _GLOBAL(low_choose_750fx_pll) or r4,r4,r5 mtspr SPRN_HID1,r4 +#ifdef CONFIG_SMP /* Store new HID1 image */ CURRENT_THREAD_INFO(r6, r1) lwz r6,TI_CPU(r6) slwi r6,r6,2 +#else + li r6, 0 +#endif addis r6,r6,nap_save_hid1@ha stw r4,nap_save_hid1@l(r6) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 757b8499aba2..a0f44f992360 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2997,7 +2997,7 @@ static void show_task(struct task_struct *tsk) printf("%px %016lx %6d %6d %c %2d %s\n", tsk, tsk->thread.ksp, tsk->pid, rcu_dereference(tsk->parent)->pid, - state, task_thread_info(tsk)->cpu, + state, task_cpu(tsk), tsk->comm); } From 018cce33c5e62dda265df8ae0ddf7f3a3357ad1f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 31 Jan 2019 10:08:52 +0000 Subject: [PATCH 159/200] powerpc: prep stack walkers for THREAD_INFO_IN_TASK [text copied from commit 9bbd4c56b0b6 ("arm64: prep stack walkers for THREAD_INFO_IN_TASK")] When CONFIG_THREAD_INFO_IN_TASK is selected, task stacks may be freed before a task is destroyed. To account for this, the stacks are refcounted, and when manipulating the stack of another task, it is necessary to get/put the stack to ensure it isn't freed and/or re-used while we do so. This patch reworks the powerpc stack walking code to account for this. When CONFIG_THREAD_INFO_IN_TASK is not selected these perform no refcounting, and this should only be a structural change that does not affect behaviour. 
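Why the !THREAD_INFO_IN_TASK case costs nothing is easiest to see from the generic helpers, sketched here from include/linux/sched/task_stack.h (the exact refcount primitive has varied between releases):

	#ifdef CONFIG_THREAD_INFO_IN_TASK
	static inline void *try_get_task_stack(struct task_struct *tsk)
	{
		/* Fails if the stack has already been freed. */
		return atomic_inc_not_zero(&tsk->stack_refcount) ?
			task_stack_page(tsk) : NULL;
	}
	extern void put_task_stack(struct task_struct *tsk);
	#else
	static inline void *try_get_task_stack(struct task_struct *tsk)
	{
		return task_stack_page(tsk);	/* always succeeds */
	}
	static inline void put_task_stack(struct task_struct *tsk) {}
	#endif

so the get/put pairs added below compile down to nothing when the option is off.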
Acked-by: Mark Rutland Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin [mpe: Move try_get_task_stack() below tsk == NULL check in show_stack()] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 26 +++++++++++++++++++++++--- arch/powerpc/kernel/stacktrace.c | 29 ++++++++++++++++++++++++++--- 2 files changed, 49 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 71bad4b6f80d..791bd8ea475d 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2027,7 +2027,7 @@ int validate_sp(unsigned long sp, struct task_struct *p, EXPORT_SYMBOL(validate_sp); -unsigned long get_wchan(struct task_struct *p) +static unsigned long __get_wchan(struct task_struct *p) { unsigned long ip, sp; int count = 0; @@ -2053,6 +2053,20 @@ unsigned long get_wchan(struct task_struct *p) return 0; } +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long ret; + + if (!try_get_task_stack(p)) + return 0; + + ret = __get_wchan(p); + + put_task_stack(p); + + return ret; +} + static int kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; void show_stack(struct task_struct *tsk, unsigned long *stack) @@ -2067,9 +2081,13 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) int curr_frame = 0; #endif - sp = (unsigned long) stack; if (tsk == NULL) tsk = current; + + if (!try_get_task_stack(tsk)) + return; + + sp = (unsigned long) stack; if (sp == 0) { if (tsk == current) sp = current_stack_pointer(); @@ -2081,7 +2099,7 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) printk("Call Trace:\n"); do { if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD)) - return; + break; stack = (unsigned long *) sp; newsp = stack[0]; @@ -2121,6 +2139,8 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) sp = newsp; } while (count++ < kstack_depth_to_print); + + put_task_stack(tsk); } #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index cf31ce6c1f53..f958f3bcba04 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -67,12 +67,17 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { unsigned long sp; + if (!try_get_task_stack(tsk)) + return; + if (tsk == current) sp = current_stack_pointer(); else sp = tsk->thread.ksp; save_context_stack(trace, sp, tsk, 0); + + put_task_stack(tsk); } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); @@ -90,9 +95,8 @@ EXPORT_SYMBOL_GPL(save_stack_trace_regs); * * If the task is not 'current', the caller *must* ensure the task is inactive. */ -int -save_stack_trace_tsk_reliable(struct task_struct *tsk, - struct stack_trace *trace) +static int __save_stack_trace_tsk_reliable(struct task_struct *tsk, + struct stack_trace *trace) { unsigned long sp; unsigned long newsp; @@ -197,6 +201,25 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, } return 0; } + +int save_stack_trace_tsk_reliable(struct task_struct *tsk, + struct stack_trace *trace) +{ + int ret; + + /* + * If the task doesn't have a stack (e.g., a zombie), the stack is + * "reliably" empty. 
+ */ + if (!try_get_task_stack(tsk)) + return 0; + + ret = __save_stack_trace_tsk_reliable(tsk, trace); + + put_task_stack(tsk); + + return ret; +} EXPORT_SYMBOL_GPL(save_stack_trace_tsk_reliable); #endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ From 8c1fc5abdccfb36102fa9647084eeb8c70e32562 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 31 Jan 2019 10:08:54 +0000 Subject: [PATCH 160/200] powerpc: Rename THREAD_INFO to TASK_STACK This patch renames THREAD_INFO to TASK_STACK, because it is in fact the offset of the pointer to the stack in task_struct so this pointer will not be impacted by the move of THREAD_INFO. Also make it available on 64-bit, as we'll need it there when we activate THREAD_INFO_IN_TASK. Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin [mpe: Make available on 64-bit] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/kernel/entry_32.S | 2 +- arch/powerpc/kernel/head_32.S | 2 +- arch/powerpc/kernel/head_40x.S | 4 ++-- arch/powerpc/kernel/head_8xx.S | 2 +- arch/powerpc/kernel/head_booke.h | 4 ++-- arch/powerpc/kernel/head_fsl_booke.S | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index d6f9bdb1eb2e..ca55027f47a4 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -90,13 +90,13 @@ int main(void) DEFINE(SIGSEGV, SIGSEGV); DEFINE(NMI_MASK, NMI_MASK); #else - OFFSET(THREAD_INFO, task_struct, stack); DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16)); OFFSET(KSP_LIMIT, thread_struct, ksp_limit); #ifdef CONFIG_PPC_RTAS OFFSET(RTAS_SP, thread_struct, rtas_sp); #endif #endif /* CONFIG_PPC64 */ + OFFSET(TASK_STACK, task_struct, stack); #ifdef CONFIG_LIVEPATCH OFFSET(TI_livepatch_sp, thread_info, livepatch_sp); diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 0c6977376233..063100df8325 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -1165,7 +1165,7 @@ ret_from_debug_exc: mfspr r9,SPRN_SPRG_THREAD lwz r10,SAVED_KSP_LIMIT(r1) stw r10,KSP_LIMIT(r9) - lwz r9,THREAD_INFO-THREAD(r9) + lwz r9,TASK_STACK-THREAD(r9) CURRENT_THREAD_INFO(r10, r1) lwz r10,TI_PREEMPT(r10) stw r10,TI_PREEMPT(r9) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 613900bb8c39..e07cfd5756d9 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -261,7 +261,7 @@ __secondary_hold_acknowledge: tophys(r11,r1); /* use tophys(r1) if kernel */ \ beq 1f; \ mfspr r11,SPRN_SPRG_THREAD; \ - lwz r11,THREAD_INFO-THREAD(r11); \ + lwz r11,TASK_STACK-THREAD(r11); \ addi r11,r11,THREAD_SIZE; \ tophys(r11,r11); \ 1: subi r11,r11,INT_FRAME_SIZE /* alloc exc. frame */ diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 11dd09d0ce1a..a9c934f2319b 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -115,7 +115,7 @@ _ENTRY(saved_ksp_limit) andi. 
r11,r11,MSR_PR; \ beq 1f; \ mfspr r1,SPRN_SPRG_THREAD; /* if from user, start at top of */\ - lwz r1,THREAD_INFO-THREAD(r1); /* this thread's kernel stack */\ + lwz r1,TASK_STACK-THREAD(r1); /* this thread's kernel stack */\ addi r1,r1,THREAD_SIZE; \ 1: subi r1,r1,INT_FRAME_SIZE; /* Allocate an exception frame */\ tophys(r11,r1); \ @@ -158,7 +158,7 @@ _ENTRY(saved_ksp_limit) beq 1f; \ /* COMING FROM USER MODE */ \ mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ - lwz r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\ + lwz r11,TASK_STACK-THREAD(r11); /* this thread's kernel stack */\ 1: addi r11,r11,THREAD_SIZE-INT_FRAME_SIZE; /* Alloc an excpt frm */\ tophys(r11,r11); \ stw r10,_CCR(r11); /* save various registers */\ diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 50303d25cbc1..03c73b4c6435 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -142,7 +142,7 @@ instruction_counter: tophys(r11,r1); /* use tophys(r1) if kernel */ \ beq 1f; \ mfspr r11,SPRN_SPRG_THREAD; \ - lwz r11,THREAD_INFO-THREAD(r11); \ + lwz r11,TASK_STACK-THREAD(r11); \ addi r11,r11,THREAD_SIZE; \ tophys(r11,r11); \ 1: subi r11,r11,INT_FRAME_SIZE /* alloc exc. frame */ diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 306e26c073a0..69e80e6d0d16 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -55,7 +55,7 @@ END_BTB_FLUSH_SECTION beq 1f; \ BOOKE_CLEAR_BTB(r11) \ /* if from user, start at top of this thread's kernel stack */ \ - lwz r11, THREAD_INFO-THREAD(r10); \ + lwz r11, TASK_STACK - THREAD(r10); \ ALLOC_STACK_FRAME(r11, THREAD_SIZE); \ 1 : subi r11, r11, INT_FRAME_SIZE; /* Allocate exception frame */ \ stw r13, _CCR(r11); /* save various registers */ \ @@ -142,7 +142,7 @@ END_BTB_FLUSH_SECTION BOOKE_CLEAR_BTB(r10) \ andi. r11,r11,MSR_PR; \ mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ - lwz r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\ + lwz r11, TASK_STACK - THREAD(r11); /* this thread's kernel stack */\ addi r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame */\ beq 1f; \ /* COMING FROM USER MODE */ \ diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 2c21e8642a00..42d8d6fc00cb 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -719,7 +719,7 @@ finish_tlb_load: /* Get the next_tlbcam_idx percpu var */ #ifdef CONFIG_SMP - lwz r12, THREAD_INFO-THREAD(r12) + lwz r12, TASK_STACK-THREAD(r12) lwz r15, TI_CPU(r12) lis r14, __per_cpu_offset@h ori r14, r14, __per_cpu_offset@l From 1e35f29c6b2eba72521d6f3c38f9c86f331cfd0a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 17 Jan 2019 23:17:56 +1100 Subject: [PATCH 161/200] powerpc: call_do_[soft]irq() takes a pointer to the stack The purpose of the pointer given to call_do_softirq() and call_do_irq() is to point to the new stack. Currently that's the same thing as the thread_info, but won't be with THREAD_INFO_IN_TASK. So change the parameter to void* and rename it 'sp'.
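(For orientation: the assembly trampolines behind these prototypes roughly save the return address, point r1 at sp + THREAD_SIZE minus the usual frame overhead, call the handler, then switch back. All they ever needed from the argument is the base of the stack area, never any thread_info field.)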
Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin [mpe: Split out of larger patch] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/irq.h | 4 ++-- arch/powerpc/kernel/misc_32.S | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index ee39ce56b2a2..2efbae8d93be 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -63,8 +63,8 @@ extern struct thread_info *hardirq_ctx[NR_CPUS]; extern struct thread_info *softirq_ctx[NR_CPUS]; extern void irq_ctx_init(void); -extern void call_do_softirq(struct thread_info *tp); -extern void call_do_irq(struct pt_regs *regs, struct thread_info *tp); +void call_do_softirq(void *sp); +void call_do_irq(struct pt_regs *regs, void *sp); extern void do_IRQ(struct pt_regs *regs); extern void __init init_IRQ(void); extern void __do_irq(struct pt_regs *regs); diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 02b8cdd73792..242f0c88010e 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -60,7 +60,7 @@ _GLOBAL(call_do_softirq) blr /* - * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp); + * void call_do_irq(struct pt_regs *regs, void *sp); */ _GLOBAL(call_do_irq) mflr r0 From 7306e83ccf5ce3a324546d274945ec1981d78f9a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 17 Jan 2019 23:23:57 +1100 Subject: [PATCH 162/200] powerpc: Don't use CURRENT_THREAD_INFO to find the stack A few places use CURRENT_THREAD_INFO, or the C version, to find the stack. This will no longer work with THREAD_INFO_IN_TASK so change them to find the stack in other ways. Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin [mpe: Split out of larger patch] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 2 +- arch/powerpc/kernel/irq.c | 2 +- arch/powerpc/kernel/misc_32.S | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index c17c1bed6148..21f1cb4d464e 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -689,7 +689,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) 2: #endif /* CONFIG_PPC_BOOK3S_64 */ - CURRENT_THREAD_INFO(r7, r8) /* base of new stack */ + clrrdi r7, r8, THREAD_SHIFT /* base of new stack */ /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE because we don't need to leave the 288-byte ABI gap at the top of the kernel stack. 
*/ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 4a5dd8800946..531e9ef153c0 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -663,7 +663,7 @@ void do_IRQ(struct pt_regs *regs) struct thread_info *curtp, *irqtp, *sirqtp; /* Switch to the irq stack to handle this */ - curtp = current_thread_info(); + curtp = (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); irqtp = hardirq_ctx[raw_smp_processor_id()]; sirqtp = softirq_ctx[raw_smp_processor_id()]; diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 242f0c88010e..b37b50fde828 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -603,7 +603,7 @@ EXPORT_SYMBOL(__bswapdi2) #ifdef CONFIG_SMP _GLOBAL(start_secondary_resume) /* Reset stack */ - CURRENT_THREAD_INFO(r1, r1) + rlwinm r1, r1, 0, 0, 31 - THREAD_SHIFT addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD li r3,0 stw r3,0(r1) /* Zero the stack frame pointer */ From 05b98791ec60f6a1862c58b3424f6aaeb00dfb72 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 17 Jan 2019 23:25:12 +1100 Subject: [PATCH 163/200] powerpc: Replace current_thread_info()->task with current We have a few places that use current_thread_info()->task to access current. This won't work with THREAD_INFO_IN_TASK so fix them now. Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin [mpe: Split out of larger patch] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 791bd8ea475d..dc2aaaf75c87 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1231,8 +1231,8 @@ struct task_struct *__switch_to(struct task_struct *prev, batch->active = 1; } - if (current_thread_info()->task->thread.regs) { - restore_math(current_thread_info()->task->thread.regs); + if (current->thread.regs) { + restore_math(current->thread.regs); /* * The copy-paste buffer can only store into foreign real @@ -1242,7 +1242,7 @@ struct task_struct *__switch_to(struct task_struct *prev, * mappings, we must issue a cp_abort to clear any state and * prevent snooping, corruption or a covert channel. */ - if (current_thread_info()->task->thread.used_vas) + if (current->thread.used_vas) asm volatile(PPC_CP_ABORT); } #endif /* CONFIG_PPC_BOOK3S_64 */ From 4e67bfd7aa21b4b737a43df627956dba9c742983 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 17 Jan 2019 23:25:53 +1100 Subject: [PATCH 164/200] powerpc: Update comments in preparation for THREAD_INFO_IN_TASK Update a few comments that talk about current_thread_info() in preparation for THREAD_INFO_IN_TASK. 
Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin [mpe: Split out of larger patch] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/reg.h | 2 +- arch/powerpc/kernel/head_32.S | 2 +- arch/powerpc/kernel/head_44x.S | 2 +- arch/powerpc/kernel/head_fsl_booke.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index c25880e6a16a..c5b2aff0ce8e 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1062,7 +1062,7 @@ * - SPRG9 debug exception scratch * * All 32-bit: - * - SPRG3 current thread_info pointer + * - SPRG3 current thread_struct physical addr pointer * (virtual on BookE, physical on others) * * 32-bit classic: diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index e07cfd5756d9..2112805ef1d1 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -831,7 +831,7 @@ __secondary_start: bl init_idle_6xx #endif /* CONFIG_PPC_BOOK3S_32 */ - /* get current_thread_info and current */ + /* get current's stack and current */ lis r1,secondary_ti@ha tophys(r1,r1) lwz r1,secondary_ti@l(r1) diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index bf23c19c92d6..4e8c8bf50413 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -1019,7 +1019,7 @@ _GLOBAL(start_secondary_47x) /* Now we can get our task struct and real stack pointer */ - /* Get current_thread_info and current */ + /* Get current's stack and current */ lis r1,secondary_ti@ha lwz r1,secondary_ti@l(r1) lwz r2,TI_TASK(r1) diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 42d8d6fc00cb..6301bb24889a 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -1091,7 +1091,7 @@ __secondary_start: mr r4,r24 /* Why? */ bl call_setup_cpu - /* get current_thread_info and current */ + /* get current's stack and current */ lis r1,secondary_ti@ha lwz r1,secondary_ti@l(r1) lwz r2,TI_TASK(r1) From 678c668a7732f3a11c25ae86d5737a019667e3c5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 17 Jan 2019 23:26:56 +1100 Subject: [PATCH 165/200] powerpc/64: Use task_stack_page() to initialise paca->kstack Rather than using the thread info use task_stack_page() to initialise paca->kstack, that way it will work with THREAD_INFO_IN_TASK. 
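For reference, task_stack_page() is essentially just (per include/linux/sched/task_stack.h):

	static inline void *task_stack_page(const struct task_struct *task)
	{
		return task->stack;	/* valid with or without THREAD_INFO_IN_TASK */
	}

and task_struct->stack points at the stack in both worlds, whereas the thread_info pointer only doubles as the stack base in the old layout.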
Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin [mpe: Split out of larger patch] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/smp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 5366d9e7bed4..829ef5411b50 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -984,7 +985,8 @@ static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_PPC64 paca_ptrs[cpu]->__current = idle; - paca_ptrs[cpu]->kstack = (unsigned long)ti + THREAD_SIZE - STACK_FRAME_OVERHEAD; + paca_ptrs[cpu]->kstack = (unsigned long)task_stack_page(idle) + + THREAD_SIZE - STACK_FRAME_OVERHEAD; #endif ti->cpu = cpu; secondary_ti = current_set[cpu] = ti; From 5497c2536f09e733bb68362ffeba147203295ae2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 17 Jan 2019 23:27:40 +1100 Subject: [PATCH 166/200] powerpc: Use sizeof(struct thread_info) in INIT_SP_LIMIT Currently INIT_SP_LIMIT uses sizeof(init_thread_info), but that symbol won't exist when we enable THREAD_INFO_IN_TASK. So just use the sizeof the type which is the same value but will continue to work. Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin [mpe: Split out of larger patch] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 2edab34ee288..c406ec3b4b3c 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -271,7 +271,7 @@ struct thread_struct { #define INIT_SP (sizeof(init_stack) + (unsigned long) &init_stack) #define INIT_SP_LIMIT \ - (_ALIGN_UP(sizeof(init_thread_info), 16) + (unsigned long) &init_stack) + (_ALIGN_UP(sizeof(struct thread_info), 16) + (unsigned long)&init_stack) #ifdef CONFIG_SPE #define SPEFSCR_INIT \ From 3733304048feb9bdfc3daff02ca4da8cfc9c4352 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 17 Jan 2019 23:27:28 +1100 Subject: [PATCH 167/200] powerpc: Use linux/thread_info.h in processor.h When we enable THREAD_INFO_IN_TASK we will remove our definition of current_thread_info(). Instead it will come from linux/thread_info.h So switch processor.h to include the latter, so that it can continue to find current_thread_info(). Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin [mpe: Split out of larger patch] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index c406ec3b4b3c..2c740042b8d3 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -40,7 +40,7 @@ #ifndef __ASSEMBLY__ #include -#include +#include #include #include From b72cc2e7aea1e42a82358bdc6c41dfaf7a5fa742 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 18 Jan 2019 18:40:34 +1100 Subject: [PATCH 168/200] powerpc: Use task_stack_page() in current_pt_regs() Change current_pt_regs() to use task_stack_page() rather than current_thread_info() so that it keeps working once we enable THREAD_INFO_IN_TASK. 
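The macro works because the user-mode register frame sits at the very top of the kernel stack; schematically:

	task_stack_page(current)			<- base (low address)
	  [ thread_info lives here in the old layout ]
	  ... kernel stack grows down ...
	  struct pt_regs				<- current_pt_regs()
	task_stack_page(current) + THREAD_SIZE		<- top (high address)

Only the way the base address is looked up changes, not the layout.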
Signed-off-by: Christophe Leroy [mpe: Split out of large patch] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/ptrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 0b8a735b6d85..64271e562fed 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -157,7 +157,7 @@ extern int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data); #define current_pt_regs() \ - ((struct pt_regs *)((unsigned long)current_thread_info() + THREAD_SIZE) - 1) + ((struct pt_regs *)((unsigned long)task_stack_page(current) + THREAD_SIZE) - 1) /* * We use the least-significant bit of the trap field to indicate * whether we have saved the full set of registers, or only a From 7aef376679a428c6b7792fb4ce93364fd02caad4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 4 Feb 2019 22:16:48 +1100 Subject: [PATCH 169/200] powerpc/idle/6xx: Use r1 with CURRENT_THREAD_INFO() Make sure CURRENT_THREAD_INFO() is used with r1, which is the virtual address of the stack, in order to ease the switch to r2 when we enable THREAD_INFO_IN_TASK, as we have no register holding the phys address of current. Signed-off-by: Christophe Leroy [mpe: Split out of larger patch] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/idle_6xx.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S index ff026c9d3cab..d9b6e7e0b5e3 100644 --- a/arch/powerpc/kernel/idle_6xx.S +++ b/arch/powerpc/kernel/idle_6xx.S @@ -159,7 +159,8 @@ _GLOBAL(power_save_ppc32_restore) stw r9,_NIP(r11) /* make it do a blr */ #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r12, r11) + CURRENT_THREAD_INFO(r12, r1) + tophys(r12, r12) lwz r11,TI_CPU(r12) /* get cpu number * 4 */ slwi r11,r11,2 #else From ed1cd6deb013a11959d17a94e35ce159197632da Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 31 Jan 2019 10:08:58 +0000 Subject: [PATCH 170/200] powerpc: Activate CONFIG_THREAD_INFO_IN_TASK This patch activates CONFIG_THREAD_INFO_IN_TASK which moves the thread_info into task_struct. Moving thread_info into task_struct has the following advantages: - It protects thread_info from corruption in the case of stack overflows. - Its address is harder to determine if stack addresses are leaked, making a number of attacks more difficult. This has the following consequences: - thread_info is now located at the beginning of task_struct. - The 'cpu' field is now in task_struct, and only exists when CONFIG_SMP is active. - thread_info no longer has the 'task' field. This patch: - Removes all re-copying of the thread_info struct when the stack changes. - Changes the CURRENT_THREAD_INFO() macro to point to current. - Selects CONFIG_THREAD_INFO_IN_TASK. - Modifies raw_smp_processor_id() to get ->cpu from current without including linux/sched.h to avoid circular inclusion, and without including asm/asm-offsets.h to avoid duplicating symbol names between ASM constants and C constants (a worked example follows this list). - Modifies klp_init_thread_info() to take a task_struct pointer argument.
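To make the _TASK_CPU trick concrete (offset value hypothetical): asm-offsets.c emits OFFSET(TI_CPU, task_struct, cpu), so include/generated/asm-offsets.h ends up containing a line like

	#define TI_CPU 2432 /* offsetof(struct task_struct, cpu) */

The Makefile rule added below extracts that third field with awk and feeds it back in as -D_TASK_CPU=2432, letting asm/smp.h compute

	raw_smp_processor_id() == *(unsigned int *)((void *)current + 2432)

without ever needing the definition of struct task_struct.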
Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin [mpe: Add task_stack.h to livepatch.h to fix build fails] Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 1 + arch/powerpc/Makefile | 7 +++ arch/powerpc/include/asm/irq.h | 4 -- arch/powerpc/include/asm/livepatch.h | 7 ++- arch/powerpc/include/asm/smp.h | 17 +++++- arch/powerpc/include/asm/thread_info.h | 17 +----- arch/powerpc/kernel/asm-offsets.c | 7 ++- arch/powerpc/kernel/entry_32.S | 9 ++- arch/powerpc/kernel/exceptions-64e.S | 11 ---- arch/powerpc/kernel/head_32.S | 6 +- arch/powerpc/kernel/head_44x.S | 4 +- arch/powerpc/kernel/head_booke.h | 8 +-- arch/powerpc/kernel/head_fsl_booke.S | 7 +-- arch/powerpc/kernel/irq.c | 79 +------------------------- arch/powerpc/kernel/kgdb.c | 28 --------- arch/powerpc/kernel/machine_kexec_64.c | 6 +- arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/setup-common.c | 2 +- arch/powerpc/kernel/setup_64.c | 21 ------- arch/powerpc/kernel/smp.c | 2 +- arch/powerpc/net/bpf_jit32.h | 5 +- 21 files changed, 56 insertions(+), 194 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 5b7945a7bd41..652c25260838 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -238,6 +238,7 @@ config PPC select RTC_LIB select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE + select THREAD_INFO_IN_TASK select VIRT_TO_BUS if !PPC64 # # Please keep this list sorted alphabetically. diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index ac033341ed55..53ffe935f3b0 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -427,6 +427,13 @@ else endif endif +ifdef CONFIG_SMP +prepare: task_cpu_prepare + +task_cpu_prepare: prepare0 + $(eval KBUILD_CFLAGS += -D_TASK_CPU=$(shell awk '{if ($$2 == "TI_CPU") print $$3;}' include/generated/asm-offsets.h)) +endif + # Check toolchain versions: # - gcc-4.6 is the minimum kernel-wide version so nothing required. 
checkbin: diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index 2efbae8d93be..28a7ace0a1b9 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -51,9 +51,6 @@ struct pt_regs; extern struct thread_info *critirq_ctx[NR_CPUS]; extern struct thread_info *dbgirq_ctx[NR_CPUS]; extern struct thread_info *mcheckirq_ctx[NR_CPUS]; -extern void exc_lvl_ctx_init(void); -#else -#define exc_lvl_ctx_init() #endif /* @@ -62,7 +59,6 @@ extern void exc_lvl_ctx_init(void); extern struct thread_info *hardirq_ctx[NR_CPUS]; extern struct thread_info *softirq_ctx[NR_CPUS]; -extern void irq_ctx_init(void); void call_do_softirq(void *sp); void call_do_irq(struct pt_regs *regs, void *sp); extern void do_IRQ(struct pt_regs *regs); diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index 47a03b9b528b..5070df19d463 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -21,6 +21,7 @@ #include #include +#include #ifdef CONFIG_LIVEPATCH static inline int klp_check_compiler_support(void) @@ -43,13 +44,13 @@ static inline unsigned long klp_get_ftrace_location(unsigned long faddr) return ftrace_location_range(faddr, faddr + 16); } -static inline void klp_init_thread_info(struct thread_info *ti) +static inline void klp_init_thread_info(struct task_struct *p) { /* + 1 to account for STACK_END_MAGIC */ - ti->livepatch_sp = (unsigned long *)(ti + 1) + 1; + task_thread_info(p)->livepatch_sp = end_of_stack(p) + 1; } #else -static void klp_init_thread_info(struct thread_info *ti) { } +static inline void klp_init_thread_info(struct task_struct *p) { } #endif /* CONFIG_LIVEPATCH */ #endif /* _ASM_POWERPC_LIVEPATCH_H */ diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 41695745032c..0de717e16dd6 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -83,7 +83,22 @@ int is_cpu_dead(unsigned int cpu); /* 32-bit */ extern int smp_hw_index[]; -#define raw_smp_processor_id() (current_thread_info()->cpu) +/* + * This is particularly ugly: it appears we can't actually get the definition + * of task_struct here, but we need access to the CPU this task is running on. + * Instead of using task_struct we're using _TASK_CPU which is extracted from + * asm-offsets.h by kbuild to get the current processor ID. + * + * This also needs to be safeguarded when building asm-offsets.s because at + * that time _TASK_CPU is not defined yet. 
It could have been guarded by + * _TASK_CPU itself, but we want the build to fail if _TASK_CPU is missing + * when building something else than asm-offsets.s + */ +#ifdef GENERATING_ASM_OFFSETS +#define raw_smp_processor_id() (0) +#else +#define raw_smp_processor_id() (*(unsigned int *)((void *)current + _TASK_CPU)) +#endif #define hard_smp_processor_id() (smp_hw_index[smp_processor_id()]) static inline int get_hard_smp_processor_id(int cpu) diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 544cac0474cb..d91523c2c7d8 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -18,9 +18,9 @@ #define THREAD_SIZE (1 << THREAD_SHIFT) #ifdef CONFIG_PPC64 -#define CURRENT_THREAD_INFO(dest, sp) stringify_in_c(clrrdi dest, sp, THREAD_SHIFT) +#define CURRENT_THREAD_INFO(dest, sp) stringify_in_c(ld dest, PACACURRENT(r13)) #else -#define CURRENT_THREAD_INFO(dest, sp) stringify_in_c(rlwinm dest, sp, 0, 0, 31-THREAD_SHIFT) +#define CURRENT_THREAD_INFO(dest, sp) stringify_in_c(mr dest, r2) #endif #ifndef __ASSEMBLY__ @@ -34,8 +34,6 @@ * low level task data. */ struct thread_info { - struct task_struct *task; /* main task structure */ - int cpu; /* cpu we're on */ int preempt_count; /* 0 => preemptable, <0 => BUG */ unsigned long local_flags; /* private flags for thread */ @@ -58,8 +56,6 @@ struct thread_info { */ #define INIT_THREAD_INFO(tsk) \ { \ - .task = &tsk, \ - .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .flags = 0, \ } @@ -67,15 +63,6 @@ struct thread_info { #define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT) /* how to get the thread information struct from C */ -static inline struct thread_info *current_thread_info(void) -{ - unsigned long val; - - asm (CURRENT_THREAD_INFO(%0,1) : "=r" (val)); - - return (struct thread_info *)val; -} - extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index ca55027f47a4..ca3fb836cbb9 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -13,6 +13,8 @@ * 2 of the License, or (at your option) any later version. */ +#define GENERATING_ASM_OFFSETS /* asm/smp.h */ + #include #include #include @@ -97,6 +99,9 @@ int main(void) #endif #endif /* CONFIG_PPC64 */ OFFSET(TASK_STACK, task_struct, stack); +#ifdef CONFIG_SMP + OFFSET(TI_CPU, task_struct, cpu); +#endif #ifdef CONFIG_LIVEPATCH OFFSET(TI_livepatch_sp, thread_info, livepatch_sp); @@ -164,8 +169,6 @@ int main(void) OFFSET(TI_FLAGS, thread_info, flags); OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags); OFFSET(TI_PREEMPT, thread_info, preempt_count); - OFFSET(TI_TASK, thread_info, task); - OFFSET(TI_CPU, thread_info, cpu); #ifdef CONFIG_PPC64 OFFSET(DCACHEL1BLOCKSIZE, ppc64_caches, l1d.block_size); diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 063100df8325..f3618353c1c4 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -1165,10 +1165,6 @@ ret_from_debug_exc: mfspr r9,SPRN_SPRG_THREAD lwz r10,SAVED_KSP_LIMIT(r1) stw r10,KSP_LIMIT(r9) - lwz r9,TASK_STACK-THREAD(r9) - CURRENT_THREAD_INFO(r10, r1) - lwz r10,TI_PREEMPT(r10) - stw r10,TI_PREEMPT(r9) RESTORE_xSRR(SRR0,SRR1); RESTORE_xSRR(CSRR0,CSRR1); RESTORE_MMU_REGS; @@ -1291,10 +1287,13 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_601) lwz r3,_TRAP(r1) andi. 
r0,r3,1 - beq 4f + beq 5f SAVE_NVGPRS(r1) rlwinm r3,r3,0,0,30 stw r3,_TRAP(r1) +5: mfspr r2,SPRN_SPRG_THREAD + addi r2,r2,-THREAD + tovirt(r2,r2) /* set back r2 to current */ 4: addi r3,r1,STACK_FRAME_OVERHEAD bl unrecoverable_exception /* shouldn't return */ diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index afb638778f44..20f14996281d 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -77,17 +77,6 @@ special_reg_save: andi. r3,r3,MSR_PR bnelr - /* Copy info into temporary exception thread info */ - ld r11,PACAKSAVE(r13) - CURRENT_THREAD_INFO(r11, r11) - CURRENT_THREAD_INFO(r12, r1) - ld r10,TI_FLAGS(r11) - std r10,TI_FLAGS(r12) - ld r10,TI_PREEMPT(r11) - std r10,TI_PREEMPT(r12) - ld r10,TI_TASK(r11) - std r10,TI_TASK(r12) - /* * Advance to the next TLB exception frame for handler * types that don't do it automatically. diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 2112805ef1d1..888fcff3f8cc 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -834,9 +834,9 @@ __secondary_start: /* get current's stack and current */ lis r1,secondary_ti@ha tophys(r1,r1) - lwz r1,secondary_ti@l(r1) - tophys(r2,r1) - lwz r2,TI_TASK(r2) + lwz r2,secondary_ti@l(r1) + tophys(r1,r2) + lwz r1,TASK_STACK(r1) /* stack */ addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 4e8c8bf50413..f94a93b6c2f2 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -1021,8 +1021,8 @@ _GLOBAL(start_secondary_47x) /* Get current's stack and current */ lis r1,secondary_ti@ha - lwz r1,secondary_ti@l(r1) - lwz r2,TI_TASK(r1) + lwz r2,secondary_ti@l(r1) + lwz r1,TASK_STACK(r2) /* Current stack pointer */ addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 69e80e6d0d16..1b22a8dea399 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -155,13 +155,7 @@ END_BTB_FLUSH_SECTION stw r10,GPR11(r11); \ b 2f; \ /* COMING FROM PRIV MODE */ \ -1: lwz r9,TI_FLAGS-EXC_LVL_FRAME_OVERHEAD(r11); \ - lwz r10,TI_PREEMPT-EXC_LVL_FRAME_OVERHEAD(r11); \ - stw r9,TI_FLAGS-EXC_LVL_FRAME_OVERHEAD(r8); \ - stw r10,TI_PREEMPT-EXC_LVL_FRAME_OVERHEAD(r8); \ - lwz r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r11); \ - stw r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r8); \ - mr r11,r8; \ +1: mr r11, r8; \ 2: mfspr r8,SPRN_SPRG_RSCRATCH_##exc_level; \ stw r12,GPR12(r11); /* save various registers */\ mflr r10; \ diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 6301bb24889a..11f38adbe020 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -719,8 +719,7 @@ finish_tlb_load: /* Get the next_tlbcam_idx percpu var */ #ifdef CONFIG_SMP - lwz r12, TASK_STACK-THREAD(r12) - lwz r15, TI_CPU(r12) + lwz r15, TI_CPU-THREAD(r12) lis r14, __per_cpu_offset@h ori r14, r14, __per_cpu_offset@l rlwinm r15, r15, 2, 0, 29 @@ -1093,8 +1092,8 @@ __secondary_start: /* get current's stack and current */ lis r1,secondary_ti@ha - lwz r1,secondary_ti@l(r1) - lwz r2,TI_TASK(r1) + lwz r2,secondary_ti@l(r1) + lwz r1,TASK_STACK(r2) /* stack */ addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 531e9ef153c0..85c48911938a 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -673,24 
+673,9 @@ void do_IRQ(struct pt_regs *regs) set_irq_regs(old_regs); return; } - - /* Prepare the thread_info in the irq stack */ - irqtp->task = curtp->task; - irqtp->flags = 0; - - /* Copy the preempt_count so that the [soft]irq checks work. */ - irqtp->preempt_count = curtp->preempt_count; - /* Switch stack and call */ call_do_irq(regs, irqtp); - /* Restore stack limit */ - irqtp->task = NULL; - - /* Copy back updates to the thread_info */ - if (irqtp->flags) - set_bits(irqtp->flags, &curtp->flags); - set_irq_regs(old_regs); } @@ -698,85 +683,23 @@ void __init init_IRQ(void) { if (ppc_md.init_IRQ) ppc_md.init_IRQ(); - - exc_lvl_ctx_init(); - - irq_ctx_init(); } #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) struct thread_info *critirq_ctx[NR_CPUS] __read_mostly; struct thread_info *dbgirq_ctx[NR_CPUS] __read_mostly; struct thread_info *mcheckirq_ctx[NR_CPUS] __read_mostly; - -void exc_lvl_ctx_init(void) -{ - struct thread_info *tp; - int i, cpu_nr; - - for_each_possible_cpu(i) { -#ifdef CONFIG_PPC64 - cpu_nr = i; -#else -#ifdef CONFIG_SMP - cpu_nr = get_hard_smp_processor_id(i); -#else - cpu_nr = 0; -#endif -#endif - - tp = critirq_ctx[cpu_nr]; - tp->cpu = cpu_nr; - tp->preempt_count = 0; - -#ifdef CONFIG_BOOKE - tp = dbgirq_ctx[cpu_nr]; - tp->cpu = cpu_nr; - tp->preempt_count = 0; - - tp = mcheckirq_ctx[cpu_nr]; - tp->cpu = cpu_nr; - tp->preempt_count = HARDIRQ_OFFSET; -#endif - } -} #endif struct thread_info *softirq_ctx[NR_CPUS] __read_mostly; struct thread_info *hardirq_ctx[NR_CPUS] __read_mostly; -void irq_ctx_init(void) -{ - struct thread_info *tp; - int i; - - for_each_possible_cpu(i) { - tp = softirq_ctx[i]; - tp->cpu = i; - klp_init_thread_info(tp); - - tp = hardirq_ctx[i]; - tp->cpu = i; - klp_init_thread_info(tp); - } -} - void do_softirq_own_stack(void) { - struct thread_info *curtp, *irqtp; + struct thread_info *irqtp; - curtp = current_thread_info(); irqtp = softirq_ctx[smp_processor_id()]; - irqtp->task = curtp->task; - irqtp->flags = 0; call_do_softirq(irqtp); - irqtp->task = NULL; - - /* Set any flag that may have been set on the - * alternate stack - */ - if (irqtp->flags) - set_bits(irqtp->flags, &curtp->flags); } irq_hw_number_t virq_to_hw(unsigned int virq) diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index e1865565f0ae..7dd55eb1259d 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -151,41 +151,13 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs) return 1; } -static DEFINE_PER_CPU(struct thread_info, kgdb_thread_info); static int kgdb_singlestep(struct pt_regs *regs) { - struct thread_info *thread_info, *exception_thread_info; - struct thread_info *backup_current_thread_info = - this_cpu_ptr(&kgdb_thread_info); - if (user_mode(regs)) return 0; - /* - * On Book E and perhaps other processors, singlestep is handled on - * the critical exception stack. This causes current_thread_info() - * to fail, since it it locates the thread_info by masking off - * the low bits of the current stack pointer. We work around - * this issue by copying the thread_info from the kernel stack - * before calling kgdb_handle_exception, and copying it back - * afterwards. On most processors the copy is avoided since - * exception_thread_info == thread_info. - */ - thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1)); - exception_thread_info = current_thread_info(); - - if (thread_info != exception_thread_info) { - /* Save the original current_thread_info. 
*/ - memcpy(backup_current_thread_info, exception_thread_info, sizeof *thread_info); - memcpy(exception_thread_info, thread_info, sizeof *thread_info); - } - kgdb_handle_exception(0, SIGTRAP, 0, regs); - if (thread_info != exception_thread_info) - /* Restore current_thread_info lastly. */ - memcpy(exception_thread_info, backup_current_thread_info, sizeof *thread_info); - return 1; } diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index a0f6f45005bd..75692c327ba0 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -317,10 +317,8 @@ void default_machine_kexec(struct kimage *image) * We setup preempt_count to avoid using VMX in memcpy. * XXX: the task struct will likely be invalid once we do the copy! */ - kexec_stack.thread_info.task = current_thread_info()->task; - kexec_stack.thread_info.flags = 0; - kexec_stack.thread_info.preempt_count = HARDIRQ_OFFSET; - kexec_stack.thread_info.cpu = current_thread_info()->cpu; + current_thread_info()->flags = 0; + current_thread_info()->preempt_count = HARDIRQ_OFFSET; /* We need a static PACA, too; copy this CPU's PACA over and switch to * it. Also poison per_cpu_offset and NULL lppaca to catch anyone using diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index dc2aaaf75c87..fd07711035bd 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1634,7 +1634,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE; struct thread_info *ti = task_thread_info(p); - klp_init_thread_info(ti); + klp_init_thread_info(p); /* Copy registers */ sp -= sizeof(struct pt_regs); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 9a6a0859c1ef..e7534f306c8e 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -937,7 +937,7 @@ void __init setup_arch(char **cmdline_p) /* Reserve large chunks of memory for use by CMA for KVM. */ kvm_cma_reserve(); - klp_init_thread_info(&init_thread_info); + klp_init_thread_info(&init_task); init_mm.start_code = (unsigned long)_stext; init_mm.end_code = (unsigned long) _etext; diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 080dd515d587..0912948a8ea6 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -689,24 +689,6 @@ void __init exc_lvl_early_init(void) } #endif -/* - * Emergency stacks are used for a range of things, from asynchronous - * NMIs (system reset, machine check) to synchronous, process context. - * We set preempt_count to zero, even though that isn't necessarily correct. To - * get the right value we'd need to copy it from the previous thread_info, but - * doing that might fault causing more problems. - * TODO: what to do with accounting? - */ -static void emerg_stack_init_thread_info(struct thread_info *ti, int cpu) -{ - ti->task = NULL; - ti->cpu = cpu; - ti->preempt_count = 0; - ti->local_flags = 0; - ti->flags = 0; - klp_init_thread_info(ti); -} - /* * Stack space used when we detect a bad kernel stack pointer, and * early in SMP boots before relocation is enabled. Exclusive emergency @@ -737,18 +719,15 @@ void __init emergency_stack_init(void) struct thread_info *ti; ti = alloc_stack(limit, i); - emerg_stack_init_thread_info(ti, i); paca_ptrs[i]->emergency_sp = (void *)ti + THREAD_SIZE; #ifdef CONFIG_PPC_BOOK3S_64 /* emergency stack for NMI exception handling. 
*/ ti = alloc_stack(limit, i); - emerg_stack_init_thread_info(ti, i); paca_ptrs[i]->nmi_emergency_sp = (void *)ti + THREAD_SIZE; /* emergency stack for machine check exception handling. */ ti = alloc_stack(limit, i); - emerg_stack_init_thread_info(ti, i); paca_ptrs[i]->mc_emergency_sp = (void *)ti + THREAD_SIZE; #endif } }
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 829ef5411b50..96c25a89e877 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -988,7 +988,7 @@ static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) paca_ptrs[cpu]->kstack = (unsigned long)task_stack_page(idle) + THREAD_SIZE - STACK_FRAME_OVERHEAD; #endif - ti->cpu = cpu; + idle->cpu = cpu; secondary_ti = current_set[cpu] = ti; }
diff --git a/arch/powerpc/net/bpf_jit32.h b/arch/powerpc/net/bpf_jit32.h index 6f4daacad296..dc50a8d4b3b9 100644 --- a/arch/powerpc/net/bpf_jit32.h +++ b/arch/powerpc/net/bpf_jit32.h @@ -106,9 +106,8 @@ DECLARE_LOAD_FUNC(sk_load_byte_msh); } while (0) #else #define PPC_BPF_LOAD_CPU(r) \ - do { BUILD_BUG_ON(FIELD_SIZEOF(struct thread_info, cpu) != 4); \ - PPC_LHZ_OFFS(r, (1 & ~(THREAD_SIZE - 1)), \ - offsetof(struct thread_info, cpu)); \ + do { BUILD_BUG_ON(FIELD_SIZEOF(struct task_struct, cpu) != 4); \ + PPC_LHZ_OFFS(r, 2, offsetof(struct task_struct, cpu)); \ } while(0) #endif #else
From a7916a1de526162d73e894b6d3ebd895d4302078 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 31 Jan 2019 10:09:00 +0000 Subject: [PATCH 171/200] powerpc: regain entire stack space
thread_info is no longer in the stack, so the entire stack can now be used. There is also no longer any risk of corrupting task_cpu(p) with a stack overflow, so the patch removes that test. When doing this, an explicit test for a NULL stack pointer is needed in validate_sp(), as it is no longer implicitly covered by the sizeof(thread_info) gap. In addition, since the previous patch the pointers to the stacks are no longer pointers to thread_info, so this patch changes them to void *.
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/irq.h | 10 +++++----- arch/powerpc/include/asm/processor.h | 3 +-- arch/powerpc/kernel/asm-offsets.c | 1 - arch/powerpc/kernel/entry_32.S | 14 ++++--------- arch/powerpc/kernel/irq.c | 19 +++++++++--------- arch/powerpc/kernel/misc_32.S | 6 ++---- arch/powerpc/kernel/process.c | 30 +++++++++++----------------- arch/powerpc/kernel/setup_64.c | 8 ++++---- 8 files changed, 37 insertions(+), 54 deletions(-)
diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index 28a7ace0a1b9..c91a60cda4fa 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -48,16 +48,16 @@ struct pt_regs; * Per-cpu stacks for handling critical, debug and machine check * level interrupts. */ -extern struct thread_info *critirq_ctx[NR_CPUS]; -extern struct thread_info *dbgirq_ctx[NR_CPUS]; -extern struct thread_info *mcheckirq_ctx[NR_CPUS]; +extern void *critirq_ctx[NR_CPUS]; +extern void *dbgirq_ctx[NR_CPUS]; +extern void *mcheckirq_ctx[NR_CPUS]; #endif /* * Per-cpu stacks for handling hard and soft interrupts.
*/ -extern struct thread_info *hardirq_ctx[NR_CPUS]; -extern struct thread_info *softirq_ctx[NR_CPUS]; +extern void *hardirq_ctx[NR_CPUS]; +extern void *softirq_ctx[NR_CPUS]; void call_do_softirq(void *sp); void call_do_irq(struct pt_regs *regs, void *sp); diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 2c740042b8d3..3351bcf42f2d 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -270,8 +270,7 @@ struct thread_struct { #define ARCH_MIN_TASKALIGN 16 #define INIT_SP (sizeof(init_stack) + (unsigned long) &init_stack) -#define INIT_SP_LIMIT \ - (_ALIGN_UP(sizeof(struct thread_info), 16) + (unsigned long)&init_stack) +#define INIT_SP_LIMIT ((unsigned long)&init_stack) #ifdef CONFIG_SPE #define SPEFSCR_INIT \ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index ca3fb836cbb9..1ad0cbcc5f13 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -92,7 +92,6 @@ int main(void) DEFINE(SIGSEGV, SIGSEGV); DEFINE(NMI_MASK, NMI_MASK); #else - DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16)); OFFSET(KSP_LIMIT, thread_struct, ksp_limit); #ifdef CONFIG_PPC_RTAS OFFSET(RTAS_SP, thread_struct, rtas_sp); diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index f3618353c1c4..424e7265e790 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -97,14 +97,11 @@ crit_transfer_to_handler: mfspr r0,SPRN_SRR1 stw r0,_SRR1(r11) - /* set the stack limit to the current stack - * and set the limit to protect the thread_info - * struct - */ + /* set the stack limit to the current stack */ mfspr r8,SPRN_SPRG_THREAD lwz r0,KSP_LIMIT(r8) stw r0,SAVED_KSP_LIMIT(r11) - rlwimi r0,r1,0,0,(31-THREAD_SHIFT) + rlwinm r0,r1,0,0,(31 - THREAD_SHIFT) stw r0,KSP_LIMIT(r8) /* fall through */ #endif @@ -121,14 +118,11 @@ crit_transfer_to_handler: mfspr r0,SPRN_SRR1 stw r0,crit_srr1@l(0) - /* set the stack limit to the current stack - * and set the limit to protect the thread_info - * struct - */ + /* set the stack limit to the current stack */ mfspr r8,SPRN_SPRG_THREAD lwz r0,KSP_LIMIT(r8) stw r0,saved_ksp_limit@l(0) - rlwimi r0,r1,0,0,(31-THREAD_SHIFT) + rlwinm r0,r1,0,0,(31 - THREAD_SHIFT) stw r0,KSP_LIMIT(r8) /* fall through */ #endif diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 85c48911938a..938944c6e2ee 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -618,9 +618,8 @@ static inline void check_stack_overflow(void) sp = current_stack_pointer() & (THREAD_SIZE-1); /* check for stack overflow: is there less than 2KB free? 
*/ - if (unlikely(sp < (sizeof(struct thread_info) + 2048))) { - pr_err("do_IRQ: stack overflow: %ld\n", - sp - sizeof(struct thread_info)); + if (unlikely(sp < 2048)) { + pr_err("do_IRQ: stack overflow: %ld\n", sp); dump_stack(); } #endif @@ -660,7 +659,7 @@ void __do_irq(struct pt_regs *regs) void do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - struct thread_info *curtp, *irqtp, *sirqtp; + void *curtp, *irqtp, *sirqtp; /* Switch to the irq stack to handle this */ curtp = (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); @@ -686,17 +685,17 @@ void __init init_IRQ(void) } #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) -struct thread_info *critirq_ctx[NR_CPUS] __read_mostly; -struct thread_info *dbgirq_ctx[NR_CPUS] __read_mostly; -struct thread_info *mcheckirq_ctx[NR_CPUS] __read_mostly; +void *critirq_ctx[NR_CPUS] __read_mostly; +void *dbgirq_ctx[NR_CPUS] __read_mostly; +void *mcheckirq_ctx[NR_CPUS] __read_mostly; #endif -struct thread_info *softirq_ctx[NR_CPUS] __read_mostly; -struct thread_info *hardirq_ctx[NR_CPUS] __read_mostly; +void *softirq_ctx[NR_CPUS] __read_mostly; +void *hardirq_ctx[NR_CPUS] __read_mostly; void do_softirq_own_stack(void) { - struct thread_info *irqtp; + void *irqtp; irqtp = softirq_ctx[smp_processor_id()]; call_do_softirq(irqtp); diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index b37b50fde828..6f6127c3760c 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -46,11 +46,10 @@ _GLOBAL(call_do_softirq) mflr r0 stw r0,4(r1) lwz r10,THREAD+KSP_LIMIT(r2) - addi r11,r3,THREAD_INFO_GAP + stw r3, THREAD+KSP_LIMIT(r2) stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3) mr r1,r3 stw r10,8(r1) - stw r11,THREAD+KSP_LIMIT(r2) bl __do_softirq lwz r10,8(r1) lwz r1,0(r1) @@ -66,11 +65,10 @@ _GLOBAL(call_do_irq) mflr r0 stw r0,4(r1) lwz r10,THREAD+KSP_LIMIT(r2) - addi r11,r4,THREAD_INFO_GAP + stw r4, THREAD+KSP_LIMIT(r2) stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4) mr r1,r4 stw r10,8(r1) - stw r11,THREAD+KSP_LIMIT(r2) bl __do_irq lwz r10,8(r1) lwz r1,0(r1) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index fd07711035bd..dd9e0d5386ee 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1691,8 +1691,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, sp -= STACK_FRAME_OVERHEAD; p->thread.ksp = sp; #ifdef CONFIG_PPC32 - p->thread.ksp_limit = (unsigned long)task_stack_page(p) + - _ALIGN_UP(sizeof(struct thread_info), 16); + p->thread.ksp_limit = (unsigned long)end_of_stack(p); #endif #ifdef CONFIG_HAVE_HW_BREAKPOINT p->thread.ptrace_bps[0] = NULL; @@ -1995,21 +1994,14 @@ static inline int valid_irq_stack(unsigned long sp, struct task_struct *p, unsigned long stack_page; unsigned long cpu = task_cpu(p); - /* - * Avoid crashing if the stack has overflowed and corrupted - * task_cpu(p), which is in the thread_info struct. 
- */ - if (cpu < NR_CPUS && cpu_possible(cpu)) { - stack_page = (unsigned long) hardirq_ctx[cpu]; - if (sp >= stack_page + sizeof(struct thread_struct) - && sp <= stack_page + THREAD_SIZE - nbytes) - return 1; + stack_page = (unsigned long)hardirq_ctx[cpu]; + if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes) + return 1; + + stack_page = (unsigned long)softirq_ctx[cpu]; + if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes) + return 1; - stack_page = (unsigned long) softirq_ctx[cpu]; - if (sp >= stack_page + sizeof(struct thread_struct) - && sp <= stack_page + THREAD_SIZE - nbytes) - return 1; - } return 0; } @@ -2018,8 +2010,10 @@ int validate_sp(unsigned long sp, struct task_struct *p, { unsigned long stack_page = (unsigned long)task_stack_page(p); - if (sp >= stack_page + sizeof(struct thread_struct) - && sp <= stack_page + THREAD_SIZE - nbytes) + if (sp < THREAD_SIZE) + return 0; + + if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes) return 1; return valid_irq_stack(sp, p, nbytes);
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 0912948a8ea6..2db1c5f7d141 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -716,19 +716,19 @@ void __init emergency_stack_init(void) limit = min(ppc64_bolted_size(), ppc64_rma_size); for_each_possible_cpu(i) { - struct thread_info *ti; + void *ti; ti = alloc_stack(limit, i); - paca_ptrs[i]->emergency_sp = (void *)ti + THREAD_SIZE; + paca_ptrs[i]->emergency_sp = ti + THREAD_SIZE; #ifdef CONFIG_PPC_BOOK3S_64 /* emergency stack for NMI exception handling. */ ti = alloc_stack(limit, i); - paca_ptrs[i]->nmi_emergency_sp = (void *)ti + THREAD_SIZE; + paca_ptrs[i]->nmi_emergency_sp = ti + THREAD_SIZE; /* emergency stack for machine check exception handling. */ ti = alloc_stack(limit, i); - paca_ptrs[i]->mc_emergency_sp = (void *)ti + THREAD_SIZE; + paca_ptrs[i]->mc_emergency_sp = ti + THREAD_SIZE; #endif } }
From 7c19c2e5f9c18e364a306253065474e5f6ad960c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 31 Jan 2019 10:09:02 +0000 Subject: [PATCH 172/200] powerpc: 'current_set' is now a table of task_struct pointers
The 'current_set' table of pointers has been used for retrieving the stack and current. The entries used to be thread_info pointers: they pointed to the stack, and current was taken from the 'task' field of the thread_info. Now the entries of the 'current_set' table are both pointers to task_struct and pointers to thread_info. As they are used to get current, and the stack pointer is retrieved from current's stack field, this patch changes their type to task_struct, and renames secondary_ti to secondary_current.
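As an informal illustration (not code from this patch; the local names are hypothetical), the secondary boot path now conceptually does the following in C, using the secondary_current pointer and the TASK_STACK offset that appear in the diffs below:

	/* Sketch of what __secondary_start does in assembly once
	 * current_set[] holds task_struct pointers: current (r2) comes
	 * straight from secondary_current, and the stack is reached via
	 * the task's stack field (TASK_STACK) rather than via thread_info.
	 */
	struct task_struct *p = secondary_current; /* set by cpu_idle_thread_init() */
	unsigned long sp = (unsigned long)p->stack
			 + THREAD_SIZE - STACK_FRAME_OVERHEAD;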
Reviewed-by: Nicholas Piggin Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/asm-prototypes.h | 4 ++-- arch/powerpc/kernel/head_32.S | 6 +++--- arch/powerpc/kernel/head_44x.S | 4 ++-- arch/powerpc/kernel/head_fsl_booke.S | 4 ++-- arch/powerpc/kernel/smp.c | 10 ++++------ 5 files changed, 13 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 1d911f68a23b..1484df6779ab 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -23,8 +23,8 @@ #include /* SMP */ -extern struct thread_info *current_set[NR_CPUS]; -extern struct thread_info *secondary_ti; +extern struct task_struct *current_set[NR_CPUS]; +extern struct task_struct *secondary_current; void start_secondary(void *unused); /* kexec */ diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 888fcff3f8cc..ce6a972f2584 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -832,9 +832,9 @@ __secondary_start: #endif /* CONFIG_PPC_BOOK3S_32 */ /* get current's stack and current */ - lis r1,secondary_ti@ha - tophys(r1,r1) - lwz r2,secondary_ti@l(r1) + lis r2,secondary_current@ha + tophys(r2,r2) + lwz r2,secondary_current@l(r2) tophys(r1,r2) lwz r1,TASK_STACK(r1) diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index f94a93b6c2f2..37117ab11584 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -1020,8 +1020,8 @@ _GLOBAL(start_secondary_47x) /* Now we can get our task struct and real stack pointer */ /* Get current's stack and current */ - lis r1,secondary_ti@ha - lwz r2,secondary_ti@l(r1) + lis r2,secondary_current@ha + lwz r2,secondary_current@l(r2) lwz r1,TASK_STACK(r2) /* Current stack pointer */ diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 11f38adbe020..4ed2a7c8e89b 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -1091,8 +1091,8 @@ __secondary_start: bl call_setup_cpu /* get current's stack and current */ - lis r1,secondary_ti@ha - lwz r2,secondary_ti@l(r1) + lis r2,secondary_current@ha + lwz r2,secondary_current@l(r2) lwz r1,TASK_STACK(r2) /* stack */ diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 96c25a89e877..e784342bdaa1 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -76,7 +76,7 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 }; #endif -struct thread_info *secondary_ti; +struct task_struct *secondary_current; bool has_big_cores; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); @@ -631,7 +631,7 @@ void smp_send_stop(void) } #endif /* CONFIG_NMI_IPI */ -struct thread_info *current_set[NR_CPUS]; +struct task_struct *current_set[NR_CPUS]; static void smp_store_cpu_info(int id) { @@ -896,7 +896,7 @@ void smp_prepare_boot_cpu(void) paca_ptrs[boot_cpuid]->__current = current; #endif set_numa_node(numa_cpu_lookup_table[boot_cpuid]); - current_set[boot_cpuid] = task_thread_info(current); + current_set[boot_cpuid] = current; } #ifdef CONFIG_HOTPLUG_CPU @@ -981,15 +981,13 @@ static bool secondaries_inhibited(void) static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) { - struct thread_info *ti = task_thread_info(idle); - #ifdef CONFIG_PPC64 paca_ptrs[cpu]->__current = idle; paca_ptrs[cpu]->kstack = (unsigned long)task_stack_page(idle) + THREAD_SIZE - STACK_FRAME_OVERHEAD; #endif idle->cpu = 
cpu; - secondary_ti = current_set[cpu] = ti; + secondary_current = current_set[cpu] = idle; } int __cpu_up(unsigned int cpu, struct task_struct *tidle)
From f7354ccac844da7b1af8cc4f09da330fa3e960e4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 31 Jan 2019 10:09:04 +0000 Subject: [PATCH 173/200] powerpc/32: Remove CURRENT_THREAD_INFO and rename TI_CPU
Now that thread_info is similar to task_struct, its address is in r2, so the CURRENT_THREAD_INFO() macro is useless. This patch removes it. This patch also moves the 'tovirt(r2, r2)' down just before the reactivation of MMU translation, so that we keep the physical address of 'current' in r2 until then. It avoids a few calls to tophys(). At the same time, as the 'cpu' field is no longer in thread_info, TI_CPU is renamed TASK_CPU by this patch. This also allows us to get rid of a couple of '#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE' blocks, as ACCOUNT_CPU_USER_ENTRY() and ACCOUNT_CPU_USER_EXIT() are empty when CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is not defined.
Signed-off-by: Christophe Leroy [mpe: Fix a missed conversion of TI_CPU in idle_6xx.S] Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 2 +- arch/powerpc/include/asm/thread_info.h | 2 - arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/kernel/entry_32.S | 55 +++++++++----------------- arch/powerpc/kernel/epapr_hcalls.S | 5 +-- arch/powerpc/kernel/head_fsl_booke.S | 5 +-- arch/powerpc/kernel/idle_6xx.S | 9 ++--- arch/powerpc/kernel/idle_e500.S | 8 ++-- arch/powerpc/kernel/misc_32.S | 3 +- arch/powerpc/mm/hash_low_32.S | 13 ++---- arch/powerpc/sysdev/6xx-suspend.S | 5 +-- 11 files changed, 37 insertions(+), 72 deletions(-)
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 53ffe935f3b0..7de49889bd5d 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -431,7 +431,7 @@ ifdef CONFIG_SMP prepare: task_cpu_prepare task_cpu_prepare: prepare0 - $(eval KBUILD_CFLAGS += -D_TASK_CPU=$(shell awk '{if ($$2 == "TI_CPU") print $$3;}' include/generated/asm-offsets.h)) + $(eval KBUILD_CFLAGS += -D_TASK_CPU=$(shell awk '{if ($$2 == "TASK_CPU") print $$3;}' include/generated/asm-offsets.h)) endif # Check toolchain versions:
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index d91523c2c7d8..c959b8d66cac 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -19,8 +19,6 @@ #ifdef CONFIG_PPC64 #define CURRENT_THREAD_INFO(dest, sp) stringify_in_c(ld dest, PACACURRENT(r13)) -#else -#define CURRENT_THREAD_INFO(dest, sp) stringify_in_c(mr dest, r2) #endif #ifndef __ASSEMBLY__
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 1ad0cbcc5f13..8b688b19776a 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -99,7 +99,7 @@ int main(void) #endif /* CONFIG_PPC64 */ OFFSET(TASK_STACK, task_struct, stack); #ifdef CONFIG_SMP - OFFSET(TI_CPU, task_struct, cpu); + OFFSET(TASK_CPU, task_struct, cpu); #endif #ifdef CONFIG_LIVEPATCH
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 424e7265e790..96dce6a4b61e 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -151,7 +151,6 @@ transfer_to_handler: stw r2,_XER(r11) mfspr r12,SPRN_SPRG_THREAD addi r2,r12,-THREAD - tovirt(r2,r2) /* set r2 to current */ beq 2f /* if from user, fix up THREAD.regs */ addi r11,r1,STACK_FRAME_OVERHEAD stw r11,PT_REGS(r12) @@ -161,11 +160,7 @@ transfer_to_handler: lwz r12,THREAD_DBCR0(r12)
andis. r12,r12,DBCR0_IDM@h #endif -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - CURRENT_THREAD_INFO(r9, r1) - tophys(r9, r9) - ACCOUNT_CPU_USER_ENTRY(r9, r11, r12) -#endif + ACCOUNT_CPU_USER_ENTRY(r2, r11, r12) #if defined(CONFIG_40x) || defined(CONFIG_BOOKE) beq+ 3f /* From user and task is ptraced - load up global dbcr0 */ @@ -175,8 +170,7 @@ transfer_to_handler: tophys(r11,r11) addi r11,r11,global_dbcr0@l #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r9, r1) - lwz r9,TI_CPU(r9) + lwz r9,TASK_CPU(r2) slwi r9,r9,3 add r11,r11,r9 #endif @@ -197,9 +191,7 @@ transfer_to_handler: ble- stack_ovf /* then the kernel stack overflowed */ 5: #if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) - CURRENT_THREAD_INFO(r9, r1) - tophys(r9,r9) /* check local flags */ - lwz r12,TI_LOCAL_FLAGS(r9) + lwz r12,TI_LOCAL_FLAGS(r2) mtcrf 0x01,r12 bt- 31-TLF_NAPPING,4f bt- 31-TLF_SLEEPING,7f @@ -208,6 +200,7 @@ transfer_to_handler: transfer_to_handler_cont: 3: mflr r9 + tovirt(r2, r2) /* set r2 to current */ lwz r11,0(r9) /* virtual address of handler */ lwz r9,4(r9) /* where to go when done */ #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) @@ -271,11 +264,11 @@ reenable_mmu: /* re-enable mmu so we can */ #if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) 4: rlwinm r12,r12,0,~_TLF_NAPPING - stw r12,TI_LOCAL_FLAGS(r9) + stw r12,TI_LOCAL_FLAGS(r2) b power_save_ppc32_restore 7: rlwinm r12,r12,0,~_TLF_SLEEPING - stw r12,TI_LOCAL_FLAGS(r9) + stw r12,TI_LOCAL_FLAGS(r2) lwz r9,_MSR(r11) /* if sleeping, clear MSR.EE */ rlwinm r9,r9,0,~MSR_EE lwz r12,_LINK(r11) /* and return to address in LR */ @@ -347,8 +340,7 @@ _GLOBAL(DoSyscall) mtmsr r11 1: #endif /* CONFIG_TRACE_IRQFLAGS */ - CURRENT_THREAD_INFO(r10, r1) - lwz r11,TI_FLAGS(r10) + lwz r11,TI_FLAGS(r2) andi. r11,r11,_TIF_SYSCALL_DOTRACE bne- syscall_dotrace syscall_dotrace_cont: @@ -381,13 +373,12 @@ ret_from_syscall: lwz r3,GPR3(r1) #endif mr r6,r3 - CURRENT_THREAD_INFO(r12, r1) /* disable interrupts so current_thread_info()->flags can't change */ LOAD_MSR_KERNEL(r10,MSR_KERNEL) /* doesn't include MSR_EE */ /* Note: We don't bother telling lockdep about it */ SYNC MTMSRD(r10) - lwz r9,TI_FLAGS(r12) + lwz r9,TI_FLAGS(r2) li r8,-MAX_ERRNO andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) bne- syscall_exit_work @@ -434,8 +425,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE andi. r4,r8,MSR_PR beq 3f - CURRENT_THREAD_INFO(r4, r1) - ACCOUNT_CPU_USER_EXIT(r4, r5, r7) + ACCOUNT_CPU_USER_EXIT(r2, r5, r7) 3: #endif lwz r4,_LINK(r1) @@ -528,7 +518,7 @@ syscall_exit_work: /* Clear per-syscall TIF flags if any are set. */ li r11,_TIF_PERSYSCALL_MASK - addi r12,r12,TI_FLAGS + addi r12,r2,TI_FLAGS 3: lwarx r8,0,r12 andc r8,r8,r11 #ifdef CONFIG_IBM405_ERR77 @@ -536,7 +526,6 @@ syscall_exit_work: #endif stwcx. r8,0,r12 bne- 3b - subi r12,r12,TI_FLAGS 4: /* Anything which requires enabling interrupts? */ andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP) @@ -815,8 +804,7 @@ ret_from_except: user_exc_return: /* r10 contains MSR_KERNEL here */ /* Check current_thread_info()->flags */ - CURRENT_THREAD_INFO(r9, r1) - lwz r9,TI_FLAGS(r9) + lwz r9,TI_FLAGS(r2) andi. r0,r9,_TIF_USER_WORK_MASK bne do_work @@ -828,18 +816,14 @@ restore_user: andis. r10,r0,DBCR0_IDM@h bnel- load_dbcr0 #endif -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - CURRENT_THREAD_INFO(r9, r1) - ACCOUNT_CPU_USER_EXIT(r9, r10, r11) -#endif + ACCOUNT_CPU_USER_EXIT(r2, r10, r11) b restore /* N.B. 
the only way to get here is from the beq following ret_from_except. */ resume_kernel: /* check current_thread_info, _TIF_EMULATE_STACK_STORE */ - CURRENT_THREAD_INFO(r9, r1) - lwz r8,TI_FLAGS(r9) + lwz r8,TI_FLAGS(r2) andis. r0,r8,_TIF_EMULATE_STACK_STORE@h beq+ 1f @@ -865,7 +849,7 @@ resume_kernel: /* Clear _TIF_EMULATE_STACK_STORE flag */ lis r11,_TIF_EMULATE_STACK_STORE@h - addi r5,r9,TI_FLAGS + addi r5,r2,TI_FLAGS 0: lwarx r8,0,r5 andc r8,r8,r11 #ifdef CONFIG_IBM405_ERR77 @@ -877,7 +861,7 @@ resume_kernel: #ifdef CONFIG_PREEMPT /* check current_thread_info->preempt_count */ - lwz r0,TI_PREEMPT(r9) + lwz r0,TI_PREEMPT(r2) cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ bne restore andi. r8,r8,_TIF_NEED_RESCHED @@ -893,8 +877,7 @@ resume_kernel: bl trace_hardirqs_off #endif 1: bl preempt_schedule_irq - CURRENT_THREAD_INFO(r9, r1) - lwz r3,TI_FLAGS(r9) + lwz r3,TI_FLAGS(r2) andi. r0,r3,_TIF_NEED_RESCHED bne- 1b #ifdef CONFIG_TRACE_IRQFLAGS @@ -1190,8 +1173,7 @@ load_dbcr0: lis r11,global_dbcr0@ha addi r11,r11,global_dbcr0@l #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r9, r1) - lwz r9,TI_CPU(r9) + lwz r9,TASK_CPU(r2) slwi r9,r9,3 add r11,r11,r9 #endif @@ -1231,8 +1213,7 @@ recheck: LOAD_MSR_KERNEL(r10,MSR_KERNEL) SYNC MTMSRD(r10) /* disable interrupts */ - CURRENT_THREAD_INFO(r9, r1) - lwz r9,TI_FLAGS(r9) + lwz r9,TI_FLAGS(r2) andi. r0,r9,_TIF_NEED_RESCHED bne- do_resched andi. r0,r9,_TIF_USER_WORK_MASK diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S index 52ca2471ee1a..d252f4663a23 100644 --- a/arch/powerpc/kernel/epapr_hcalls.S +++ b/arch/powerpc/kernel/epapr_hcalls.S @@ -21,10 +21,9 @@ #ifndef CONFIG_PPC64 /* epapr_ev_idle() was derived from e500_idle() */ _GLOBAL(epapr_ev_idle) - CURRENT_THREAD_INFO(r3, r1) - PPC_LL r4, TI_LOCAL_FLAGS(r3) /* set napping bit */ + PPC_LL r4, TI_LOCAL_FLAGS(r2) /* set napping bit */ ori r4, r4,_TLF_NAPPING /* so when we take an exception */ - PPC_STL r4, TI_LOCAL_FLAGS(r3) /* it will return to our caller */ + PPC_STL r4, TI_LOCAL_FLAGS(r2) /* it will return to our caller */ wrteei 1 diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 4ed2a7c8e89b..1881127682e9 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -244,8 +244,7 @@ set_ivor: stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r22, r1) - stw r24, TI_CPU(r22) + stw r24, TASK_CPU(r2) #endif bl early_init @@ -719,7 +718,7 @@ finish_tlb_load: /* Get the next_tlbcam_idx percpu var */ #ifdef CONFIG_SMP - lwz r15, TI_CPU-THREAD(r12) + lwz r15, TASK_CPU-THREAD(r12) lis r14, __per_cpu_offset@h ori r14, r14, __per_cpu_offset@l rlwinm r15, r15, 2, 0, 29 diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S index d9b6e7e0b5e3..c5e7f5bb2e66 100644 --- a/arch/powerpc/kernel/idle_6xx.S +++ b/arch/powerpc/kernel/idle_6xx.S @@ -136,10 +136,9 @@ BEGIN_FTR_SECTION DSSALL sync END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) - CURRENT_THREAD_INFO(r9, r1) - lwz r8,TI_LOCAL_FLAGS(r9) /* set napping bit */ + lwz r8,TI_LOCAL_FLAGS(r2) /* set napping bit */ ori r8,r8,_TLF_NAPPING /* so when we take an exception */ - stw r8,TI_LOCAL_FLAGS(r9) /* it will return to our caller */ + stw r8,TI_LOCAL_FLAGS(r2) /* it will return to our caller */ mfmsr r7 ori r7,r7,MSR_EE oris r7,r7,MSR_POW@h @@ -159,9 +158,7 @@ _GLOBAL(power_save_ppc32_restore) stw r9,_NIP(r11) /* make it do a blr */ #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r12, r1) - tophys(r12, r12) 
- lwz r11,TI_CPU(r12) /* get cpu number * 4 */ + lwz r11,TASK_CPU(r2) /* get cpu number * 4 */ slwi r11,r11,2 #else li r11,0
diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_e500.S index 583e55ac7d26..69dfcd2ca011 100644 --- a/arch/powerpc/kernel/idle_e500.S +++ b/arch/powerpc/kernel/idle_e500.S @@ -22,10 +22,9 @@ .text _GLOBAL(e500_idle) - CURRENT_THREAD_INFO(r3, r1) - lwz r4,TI_LOCAL_FLAGS(r3) /* set napping bit */ + lwz r4,TI_LOCAL_FLAGS(r2) /* set napping bit */ ori r4,r4,_TLF_NAPPING /* so when we take an exception */ - stw r4,TI_LOCAL_FLAGS(r3) /* it will return to our caller */ + stw r4,TI_LOCAL_FLAGS(r2) /* it will return to our caller */ #ifdef CONFIG_PPC_E500MC wrteei 1 @@ -88,8 +87,7 @@ _GLOBAL(power_save_ppc32_restore) stw r9,_NIP(r11) /* make it do a blr */ #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r12, r1) - lwz r11,TI_CPU(r12) /* get cpu number * 4 */ + lwz r11,TASK_CPU(r2) /* get cpu number * 4 */ slwi r11,r11,2 #else li r11,0
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 6f6127c3760c..0dda4f8e3d7a 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -183,8 +183,7 @@ _GLOBAL(low_choose_750fx_pll) #ifdef CONFIG_SMP /* Store new HID1 image */ - CURRENT_THREAD_INFO(r6, r1) - lwz r6,TI_CPU(r6) + lwz r6,TASK_CPU(r2) slwi r6,r6,2 #else li r6, 0
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index d94fef524ef5..1f13494efb2b 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -183,8 +183,7 @@ _GLOBAL(add_hash_page) add r3,r3,r0 /* note create_hpte trims to 24 bits */ #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r8, r1) /* use cpu number to make tag */ - lwz r8,TI_CPU(r8) /* to go in mmu_hash_lock */ + lwz r8,TASK_CPU(r2) /* to go in mmu_hash_lock */ oris r8,r8,12 #endif /* CONFIG_SMP */ @@ -540,9 +539,7 @@ _GLOBAL(flush_hash_pages) #ifdef CONFIG_SMP lis r9, (mmu_hash_lock - PAGE_OFFSET)@ha addi r9, r9, (mmu_hash_lock - PAGE_OFFSET)@l - CURRENT_THREAD_INFO(r8, r1) - tophys(r8, r8) - lwz r8,TI_CPU(r8) + lwz r8,TASK_CPU(r2) oris r8,r8,9 10: lwarx r0,0,r9 cmpi 0,r0,0 @@ -637,8 +634,7 @@ EXPORT_SYMBOL(flush_hash_pages) */ _GLOBAL(_tlbie) #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r8, r1) - lwz r8,TI_CPU(r8) + lwz r8,TASK_CPU(r2) oris r8,r8,11 mfmsr r10 SYNC @@ -675,8 +671,7 @@ _GLOBAL(_tlbie) */ _GLOBAL(_tlbia) #if defined(CONFIG_SMP) - CURRENT_THREAD_INFO(r8, r1) - lwz r8,TI_CPU(r8) + lwz r8,TASK_CPU(r2) oris r8,r8,10 mfmsr r10 SYNC
diff --git a/arch/powerpc/sysdev/6xx-suspend.S b/arch/powerpc/sysdev/6xx-suspend.S index cf48e9cb2575..6c4aec25c4ba 100644 --- a/arch/powerpc/sysdev/6xx-suspend.S +++ b/arch/powerpc/sysdev/6xx-suspend.S @@ -29,10 +29,9 @@ _GLOBAL(mpc6xx_enter_standby) ori r5, r5, ret_from_standby@l mtlr r5 - CURRENT_THREAD_INFO(r5, r1) - lwz r6, TI_LOCAL_FLAGS(r5) + lwz r6, TI_LOCAL_FLAGS(r2) ori r6, r6, _TLF_SLEEPING - stw r6, TI_LOCAL_FLAGS(r5) + stw r6, TI_LOCAL_FLAGS(r2) mfmsr r5 ori r5, r5, MSR_EE
From c911d2e128e8ab7e789a5488dcb63ae9fe130aca Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 12 Jan 2019 09:55:50 +0000 Subject: [PATCH 174/200] powerpc/64: Replace CURRENT_THREAD_INFO with PACA_THREAD_INFO
Now that current_thread_info is located at the beginning of the 'current' task struct, the CURRENT_THREAD_INFO macro is not really needed any more. This patch replaces it with loads of the value at PACA_THREAD_INFO(r13).
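In C terms (an informal sketch, not part of the patch), the new load amounts to the following; in assembly, the old 'clrrdi dest, sp, THREAD_SHIFT' mask becomes 'ld dest, PACA_THREAD_INFO(r13)', with PACA_THREAD_INFO generated by asm-offsets.c as shown in the diff below:

	/* Illustrative only: with thread_info embedded at the start of
	 * task_struct, the paca's __current pointer plus the (zero)
	 * offset of thread_info yields current's thread_info directly,
	 * with no dependence on the stack pointer.
	 */
	struct thread_info *ti = (struct thread_info *)
		((char *)local_paca->__current +
		 offsetof(struct task_struct, thread_info));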
Signed-off-by: Christophe Leroy [mpe: Add PACA_THREAD_INFO rather than using PACACURRENT] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 4 ++-- arch/powerpc/include/asm/thread_info.h | 4 ---- arch/powerpc/kernel/asm-offsets.c | 2 ++ arch/powerpc/kernel/entry_64.S | 10 +++++----- arch/powerpc/kernel/exceptions-64e.S | 2 +- arch/powerpc/kernel/exceptions-64s.S | 2 +- arch/powerpc/kernel/idle_book3e.S | 2 +- arch/powerpc/kernel/idle_power4.S | 2 +- arch/powerpc/kernel/trace/ftrace_64_mprofile.S | 6 +++--- 9 files changed, 16 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 3b4767ed3ec5..937bb630093f 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -671,7 +671,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define RUNLATCH_ON \ BEGIN_FTR_SECTION \ - CURRENT_THREAD_INFO(r3, r1); \ + ld r3, PACA_THREAD_INFO(r13); \ ld r4,TI_LOCAL_FLAGS(r3); \ andi. r0,r4,_TLF_RUNLATCH; \ beql ppc64_runlatch_on_trampoline; \ @@ -721,7 +721,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CTRL) #ifdef CONFIG_PPC_970_NAP #define FINISH_NAP \ BEGIN_FTR_SECTION \ - CURRENT_THREAD_INFO(r11, r1); \ + ld r11, PACA_THREAD_INFO(r13); \ ld r9,TI_LOCAL_FLAGS(r11); \ andi. r10,r9,_TLF_NAPPING; \ bnel power4_fixup_nap; \ diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index c959b8d66cac..8e1d0195ac36 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -17,10 +17,6 @@ #define THREAD_SIZE (1 << THREAD_SHIFT) -#ifdef CONFIG_PPC64 -#define CURRENT_THREAD_INFO(dest, sp) stringify_in_c(ld dest, PACACURRENT(r13)) -#endif - #ifndef __ASSEMBLY__ #include #include diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 8b688b19776a..86a61e5f8285 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -182,6 +182,8 @@ int main(void) OFFSET(PACAPROCSTART, paca_struct, cpu_start); OFFSET(PACAKSAVE, paca_struct, kstack); OFFSET(PACACURRENT, paca_struct, __current); + DEFINE(PACA_THREAD_INFO, offsetof(struct paca_struct, __current) + + offsetof(struct task_struct, thread_info)); OFFSET(PACASAVEDMSR, paca_struct, saved_msr); OFFSET(PACAR1, paca_struct, saved_r1); OFFSET(PACATOC, paca_struct, kernel_toc); diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 21f1cb4d464e..15c67d2c0534 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -166,7 +166,7 @@ system_call: /* label this so stack traces look sane */ li r10,IRQS_ENABLED std r10,SOFTE(r1) - CURRENT_THREAD_INFO(r11, r1) + ld r11, PACA_THREAD_INFO(r13) ld r10,TI_FLAGS(r11) andi. 
r11,r10,_TIF_SYSCALL_DOTRACE bne .Lsyscall_dotrace /* does not return */ @@ -213,7 +213,7 @@ system_call: /* label this so stack traces look sane */ ld r3,RESULT(r1) #endif - CURRENT_THREAD_INFO(r12, r1) + ld r12, PACA_THREAD_INFO(r13) ld r8,_MSR(r1) #ifdef CONFIG_PPC_BOOK3S @@ -346,7 +346,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) /* Repopulate r9 and r10 for the syscall path */ addi r9,r1,STACK_FRAME_OVERHEAD - CURRENT_THREAD_INFO(r10, r1) + ld r10, PACA_THREAD_INFO(r13) ld r10,TI_FLAGS(r10) cmpldi r0,NR_syscalls @@ -740,7 +740,7 @@ _GLOBAL(ret_from_except_lite) mtmsrd r10,1 /* Update machine state */ #endif /* CONFIG_PPC_BOOK3E */ - CURRENT_THREAD_INFO(r9, r1) + ld r9, PACA_THREAD_INFO(r13) ld r3,_MSR(r1) #ifdef CONFIG_PPC_BOOK3E ld r10,PACACURRENT(r13) @@ -854,7 +854,7 @@ resume_kernel: 1: bl preempt_schedule_irq /* Re-test flags and eventually loop */ - CURRENT_THREAD_INFO(r9, r1) + ld r9, PACA_THREAD_INFO(r13) ld r4,TI_FLAGS(r9) andi. r0,r4,_TIF_NEED_RESCHED bne 1b diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 20f14996281d..4549ce8d4637 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -493,7 +493,7 @@ exc_##n##_bad_stack: \ * interrupts happen before the wait instruction. */ #define CHECK_NAPPING() \ - CURRENT_THREAD_INFO(r11, r1); \ + ld r11, PACA_THREAD_INFO(r13); \ ld r10,TI_LOCAL_FLAGS(r11); \ andi. r9,r10,_TLF_NAPPING; \ beq+ 1f; \ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 9e253ce27e08..b179b8b5d3f0 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1629,7 +1629,7 @@ do_hash_page: ori r0,r0,DSISR_BAD_FAULT_64S@l and. r0,r4,r0 /* weird error? */ bne- handle_page_fault /* if not, try to insert a HPTE */ - CURRENT_THREAD_INFO(r11, r1) + ld r11, PACA_THREAD_INFO(r13) lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */ bne 77f /* then don't call hash_page now */ diff --git a/arch/powerpc/kernel/idle_book3e.S b/arch/powerpc/kernel/idle_book3e.S index 4e0d94d02030..31e732c378ad 100644 --- a/arch/powerpc/kernel/idle_book3e.S +++ b/arch/powerpc/kernel/idle_book3e.S @@ -63,7 +63,7 @@ _GLOBAL(\name) 1: /* Let's set the _TLF_NAPPING flag so interrupts make us return * to the right spot */ - CURRENT_THREAD_INFO(r11, r1) + ld r11, PACACURRENT(r13) ld r10,TI_LOCAL_FLAGS(r11) ori r10,r10,_TLF_NAPPING std r10,TI_LOCAL_FLAGS(r11) diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S index a09b3c7ca176..a2fdb0a34b75 100644 --- a/arch/powerpc/kernel/idle_power4.S +++ b/arch/powerpc/kernel/idle_power4.S @@ -68,7 +68,7 @@ BEGIN_FTR_SECTION DSSALL sync END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) - CURRENT_THREAD_INFO(r9, r1) + ld r9, PACA_THREAD_INFO(r13) ld r8,TI_LOCAL_FLAGS(r9) /* set napping bit */ ori r8,r8,_TLF_NAPPING /* so when we take an exception */ std r8,TI_LOCAL_FLAGS(r9) /* it will return to our caller */ diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index 32476a6e4e9c..01b1224add49 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -229,7 +229,7 @@ ftrace_call: * - r0, r11 & r12 are free */ livepatch_handler: - CURRENT_THREAD_INFO(r12, r1) + ld r12, PACA_THREAD_INFO(r13) /* Allocate 3 x 8 bytes */ ld r11, TI_livepatch_sp(r12) @@ -256,7 +256,7 @@ livepatch_handler: * restore it. 
*/ - CURRENT_THREAD_INFO(r12, r1) + ld r12, PACA_THREAD_INFO(r13) ld r11, TI_livepatch_sp(r12) @@ -273,7 +273,7 @@ livepatch_handler: ld r2, -24(r11) /* Pop livepatch stack frame */ - CURRENT_THREAD_INFO(r12, r1) + ld r12, PACA_THREAD_INFO(r13) subi r11, r11, 24 std r11, TI_livepatch_sp(r12) From d608898abc749424e26aa0e451d39e33cf3f4adc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 12 Jan 2019 09:55:53 +0000 Subject: [PATCH 175/200] powerpc: clean stack pointers naming Some stack pointers used to also be thread_info pointers and were called tp. Now that they are only stack pointers, rename them sp. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/irq.c | 17 +++++++---------- arch/powerpc/kernel/setup_64.c | 11 +++-------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 938944c6e2ee..8a936723c791 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -659,21 +659,21 @@ void __do_irq(struct pt_regs *regs) void do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - void *curtp, *irqtp, *sirqtp; + void *cursp, *irqsp, *sirqsp; /* Switch to the irq stack to handle this */ - curtp = (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); - irqtp = hardirq_ctx[raw_smp_processor_id()]; - sirqtp = softirq_ctx[raw_smp_processor_id()]; + cursp = (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); + irqsp = hardirq_ctx[raw_smp_processor_id()]; + sirqsp = softirq_ctx[raw_smp_processor_id()]; /* Already there ? */ - if (unlikely(curtp == irqtp || curtp == sirqtp)) { + if (unlikely(cursp == irqsp || cursp == sirqsp)) { __do_irq(regs); set_irq_regs(old_regs); return; } /* Switch stack and call */ - call_do_irq(regs, irqtp); + call_do_irq(regs, irqsp); set_irq_regs(old_regs); } @@ -695,10 +695,7 @@ void *hardirq_ctx[NR_CPUS] __read_mostly; void do_softirq_own_stack(void) { - void *irqtp; - - irqtp = softirq_ctx[smp_processor_id()]; - call_do_softirq(irqtp); + call_do_softirq(softirq_ctx[smp_processor_id()]); } irq_hw_number_t virq_to_hw(unsigned int virq) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 2db1c5f7d141..daa361fc6a24 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -716,19 +716,14 @@ void __init emergency_stack_init(void) limit = min(ppc64_bolted_size(), ppc64_rma_size); for_each_possible_cpu(i) { - void *ti; - - ti = alloc_stack(limit, i); - paca_ptrs[i]->emergency_sp = ti + THREAD_SIZE; + paca_ptrs[i]->emergency_sp = alloc_stack(limit, i) + THREAD_SIZE; #ifdef CONFIG_PPC_BOOK3S_64 /* emergency stack for NMI exception handling. */ - ti = alloc_stack(limit, i); - paca_ptrs[i]->nmi_emergency_sp = ti + THREAD_SIZE; + paca_ptrs[i]->nmi_emergency_sp = alloc_stack(limit, i) + THREAD_SIZE; /* emergency stack for machine check exception handling. 
*/ - ti = alloc_stack(limit, i); - paca_ptrs[i]->mc_emergency_sp = ti + THREAD_SIZE; + paca_ptrs[i]->mc_emergency_sp = alloc_stack(limit, i) + THREAD_SIZE; #endif } } From 930d6288a26787d2e7f633705434171a506db9c5 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 22 Feb 2019 12:23:27 +0530 Subject: [PATCH 176/200] powerpc: sstep: Add support for maddhd, maddhdu, maddld instructions This adds emulation support for the following integer instructions: * Multiply-Add High Doubleword (maddhd) * Multiply-Add High Doubleword Unsigned (maddhdu) * Multiply-Add Low Doubleword (maddld) As suggested by Michael, this uses a raw .long for specifying the instruction word when using inline assembly to retain compatibility with older binutils. Signed-off-by: Sandipan Das Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/ppc-opcode.h | 15 +++++++++++- arch/powerpc/lib/sstep.c | 35 ++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 87b73aa56b53..2bc949414669 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -335,6 +335,9 @@ #define PPC_INST_MULLW 0x7c0001d6 #define PPC_INST_MULHWU 0x7c000016 #define PPC_INST_MULLI 0x1c000000 +#define PPC_INST_MADDHD 0x10000030 +#define PPC_INST_MADDHDU 0x10000031 +#define PPC_INST_MADDLD 0x10000033 #define PPC_INST_DIVWU 0x7c000396 #define PPC_INST_DIVD 0x7c0003d2 #define PPC_INST_RLWINM 0x54000000 @@ -377,6 +380,7 @@ /* macros to insert fields into opcodes */ #define ___PPC_RA(a) (((a) & 0x1f) << 16) #define ___PPC_RB(b) (((b) & 0x1f) << 11) +#define ___PPC_RC(c) (((c) & 0x1f) << 6) #define ___PPC_RS(s) (((s) & 0x1f) << 21) #define ___PPC_RT(t) ___PPC_RS(t) #define ___PPC_R(r) (((r) & 0x1) << 16) @@ -396,7 +400,7 @@ #define __PPC_WS(w) (((w) & 0x1f) << 11) #define __PPC_SH(s) __PPC_WS(s) #define __PPC_SH64(s) (__PPC_SH(s) | (((s) & 0x20) >> 4)) -#define __PPC_MB(s) (((s) & 0x1f) << 6) +#define __PPC_MB(s) ___PPC_RC(s) #define __PPC_ME(s) (((s) & 0x1f) << 1) #define __PPC_MB64(s) (__PPC_MB(s) | ((s) & 0x20)) #define __PPC_ME64(s) __PPC_MB64(s) @@ -438,6 +442,15 @@ #define PPC_STQCX(t, a, b) stringify_in_c(.long PPC_INST_STQCX | \ ___PPC_RT(t) | ___PPC_RA(a) | \ ___PPC_RB(b)) +#define PPC_MADDHD(t, a, b, c) stringify_in_c(.long PPC_INST_MADDHD | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b) | ___PPC_RC(c)) +#define PPC_MADDHDU(t, a, b, c) stringify_in_c(.long PPC_INST_MADDHDU | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b) | ___PPC_RC(c)) +#define PPC_MADDLD(t, a, b, c) stringify_in_c(.long PPC_INST_MADDLD | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b) | ___PPC_RC(c)) #define PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \ ___PPC_RB(b)) #define PPC_MSGSYNC stringify_in_c(.long PPC_INST_MSGSYNC) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index d81568f783e5..67e69ebd6c00 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1169,7 +1169,7 @@ static nokprobe_inline int trap_compare(long v1, long v2) int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, unsigned int instr) { - unsigned int opcode, ra, rb, rd, spr, u; + unsigned int opcode, ra, rb, rc, rd, spr, u; unsigned long int imm; unsigned long int val, val2; unsigned int mb, me, sh; @@ -1292,6 +1292,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, rd = (instr >> 21) & 0x1f; ra = (instr >> 16) & 0x1f; rb = (instr >> 11) & 0x1f; + 
rc = (instr >> 6) & 0x1f; switch (opcode) { #ifdef __powerpc64__ @@ -1305,6 +1306,38 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, goto trap; return 1; +#ifdef __powerpc64__ + case 4: + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + + switch (instr & 0x3f) { + case 48: /* maddhd */ + asm volatile(PPC_MADDHD(%0, %1, %2, %3) : + "=r" (op->val) : "r" (regs->gpr[ra]), + "r" (regs->gpr[rb]), "r" (regs->gpr[rc])); + goto compute_done; + + case 49: /* maddhdu */ + asm volatile(PPC_MADDHDU(%0, %1, %2, %3) : + "=r" (op->val) : "r" (regs->gpr[ra]), + "r" (regs->gpr[rb]), "r" (regs->gpr[rc])); + goto compute_done; + + case 51: /* maddld */ + asm volatile(PPC_MADDLD(%0, %1, %2, %3) : + "=r" (op->val) : "r" (regs->gpr[ra]), + "r" (regs->gpr[rb]), "r" (regs->gpr[rc])); + goto compute_done; + } + + /* + * There are other instructions from ISA 3.0 with the same + * primary opcode which do not have emulation support yet. + */ + return -1; +#endif + case 7: /* mulli */ op->val = regs->gpr[ra] * (short) instr; goto compute_done; From a23987ef267a3549667fed5d69c0174e7fc15910 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 22 Feb 2019 12:23:28 +0530 Subject: [PATCH 177/200] powerpc: sstep: Add support for darn instruction This adds emulation support for the following integer instructions: * Deliver A Random Number (darn) As suggested by Michael, this uses a raw .long for specifying the instruction word when using inline assembly to retain compatibility with older binutils. Signed-off-by: Sandipan Das Signed-off-by: Michael Ellerman --- arch/powerpc/lib/sstep.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 67e69ebd6c00..ab575e02f9b8 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1728,6 +1728,28 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, (int) regs->gpr[rb]; goto arith_done; + case 755: /* darn */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + switch (ra & 0x3) { + case 0: + /* 32-bit conditioned */ + asm volatile(PPC_DARN(%0, 0) : "=r" (op->val)); + goto compute_done; + + case 1: + /* 64-bit conditioned */ + asm volatile(PPC_DARN(%0, 1) : "=r" (op->val)); + goto compute_done; + + case 2: + /* 64-bit raw */ + asm volatile(PPC_DARN(%0, 2) : "=r" (op->val)); + goto compute_done; + } + + return -1; + /* * Logical instructions From 32628b5cf3bcdf31d7e00b0e8229051ee2afe96e Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 22 Feb 2019 12:23:29 +0530 Subject: [PATCH 178/200] powerpc sstep: Add support for cnttzw, cnttzd instructions This adds emulation support for the following integer instructions: * Count Trailing Zeros Word (cnttzw[.]) * Count Trailing Zeros Doubleword (cnttzd[.]) Signed-off-by: Sandipan Das Signed-off-by: Michael Ellerman --- arch/powerpc/lib/sstep.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index ab575e02f9b8..94189da4c159 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1819,6 +1819,20 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, case 506: /* popcntd */ do_popcnt(regs, op, regs->gpr[rd], 64); goto logical_done_nocc; +#endif + case 538: /* cnttzw */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + val = (unsigned int) regs->gpr[rd]; + op->val = (val ? 
__builtin_ctz(val) : 32); + goto logical_done; +#ifdef __powerpc64__ + case 570: /* cnttzd */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + val = regs->gpr[rd]; + op->val = (val ? __builtin_ctzl(val) : 64); + goto logical_done; #endif case 922: /* extsh */ op->val = (signed short) regs->gpr[rd]; From 3e751acba2658d664dc593d284714073c38380a8 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 22 Feb 2019 12:23:30 +0530 Subject: [PATCH 179/200] powerpc sstep: Add support for extswsli instruction This adds emulation support for the following integer instructions: * Extend-Sign Word and Shift Left Immediate (extswsli[.]) Signed-off-by: Sandipan Das Signed-off-by: Michael Ellerman --- arch/powerpc/lib/sstep.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 94189da4c159..742298bdf30b 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1935,6 +1935,20 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->xerval &= ~XER_CA; set_ca32(op, op->xerval & XER_CA); goto logical_done; + + case 890: /* extswsli with sh_5 = 0 */ + case 891: /* extswsli with sh_5 = 1 */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + op->type = COMPUTE + SETREG; + sh = rb | ((instr & 2) << 4); + val = (signed int) regs->gpr[rd]; + if (sh) + op->val = ROTATE(val, sh) & MASK64(0, 63 - sh); + else + op->val = val; + goto logical_done; + #endif /* __powerpc64__ */ /* From 6c180071509aacb5989993b435e71e70500b72dd Mon Sep 17 00:00:00 2001 From: PrasannaKumar Muralidharan Date: Fri, 22 Feb 2019 12:23:31 +0530 Subject: [PATCH 180/200] powerpc sstep: Add support for modsw, moduw instructions This adds emulation support for the following integer instructions: * Modulo Signed Word (modsw) * Modulo Unsigned Word (moduw) Signed-off-by: PrasannaKumar Muralidharan Signed-off-by: Sandipan Das Signed-off-by: Michael Ellerman --- arch/powerpc/lib/sstep.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 742298bdf30b..9c65fb1da298 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1708,6 +1708,13 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, case 266: /* add */ op->val = regs->gpr[ra] + regs->gpr[rb]; goto arith_done; + + case 267: /* moduw */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + op->val = (unsigned int) regs->gpr[ra] % + (unsigned int) regs->gpr[rb]; + goto compute_done; #ifdef __powerpc64__ case 457: /* divdu */ op->val = regs->gpr[ra] / regs->gpr[rb]; @@ -1750,6 +1757,13 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, return -1; + case 779: /* modsw */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + op->val = (int) regs->gpr[ra] % + (int) regs->gpr[rb]; + goto compute_done; + /* * Logical instructions From 6324320de609766f79b85b681a53061885bc679d Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 22 Feb 2019 12:23:32 +0530 Subject: [PATCH 181/200] powerpc sstep: Add support for modsd, modud instructions This adds emulation support for the following integer instructions: * Modulo Signed Doubleword (modsd) * Modulo Unsigned Doubleword (modud) Signed-off-by: Sandipan Das Signed-off-by: Michael Ellerman --- arch/powerpc/lib/sstep.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 9c65fb1da298..3d33fb509ef4 100644 --- 
a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1704,7 +1704,13 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, (int) regs->gpr[rb]; goto arith_done; - +#ifdef __powerpc64__ + case 265: /* modud */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + op->val = regs->gpr[ra] % regs->gpr[rb]; + goto compute_done; +#endif case 266: /* add */ op->val = regs->gpr[ra] + regs->gpr[rb]; goto arith_done; @@ -1756,7 +1762,14 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, } return -1; - +#ifdef __powerpc64__ + case 777: /* modsd */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + op->val = (long int) regs->gpr[ra] % + (long int) regs->gpr[rb]; + goto compute_done; +#endif case 779: /* modsw */ if (!cpu_has_feature(CPU_FTR_ARCH_300)) return -1;
From 6cd96c5b684d9d6873e2bcbf17c43b32f3de80ef Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 26 Feb 2019 12:32:06 +1100 Subject: [PATCH 182/200] selftests/powerpc: Remove duplicate header Remove a duplicate header which is included twice. Signed-off-by: Sabyasachi Gupta Signed-off-by: Souptick Joarder [mpe: Split out of larger patch] Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c b/tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c index 167135bd92a8..af1b80265076 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c +++ b/tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c @@ -11,7 +11,6 @@ #include #include #include -#include #include "ebb.h"
From 5330367fa300742a97e20e953b1f77f48392faae Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 26 Feb 2019 10:09:34 +0530 Subject: [PATCH 183/200] powerpc/hugetlb: Handle mmap_min_addr correctly in get_unmapped_area callback After we ALIGN up the address we need to make sure we didn't overflow and end up with a zero address. In that case, we need to make sure that the returned address is greater than mmap_min_addr. This fixes the va_128TBswitch --run-hugetlb selftest reporting failures when run as a non-root user for mmap(-1, MAP_HUGETLB). The bug is that a non-root user requesting address -1 will be given address 0 which will then fail, whereas they should have been given something else that would have succeeded. With this change we also avoid the first mmap(-1, MAP_HUGETLB) returning a NULL address as the mmap address. So we think this is not a security issue, because it only affects whether we choose an address below mmap_min_addr, not whether we actually allow that address to be mapped. i.e. there are existing capability checks to prevent a user mapping below mmap_min_addr and those will still be honoured even without this fix.
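The overflow is easy to demonstrate with a rough user-space sketch (the ALIGN_UP macro and the 16MB huge page size below are illustrative stand-ins, not the kernel's definitions):

	#include <stdio.h>
	#include <stdint.h>

	/* round x up to the next multiple of a (a is a power of two) */
	#define ALIGN_UP(x, a) (((x) + ((a) - 1)) & ~((uintptr_t)(a) - 1))

	int main(void)
	{
		uintptr_t addr = (uintptr_t)-1;	/* hint from mmap(-1, ...) */
		uintptr_t huge = 16UL << 20;	/* assume 16MB huge pages */

		addr = ALIGN_UP(addr, huge);	/* wraps around to 0 */
		printf("aligned hint = %#lx\n", (unsigned long)addr);
		return 0;
	}

This prints 0, an address below mmap_min_addr, which is exactly the case the new checks guard against.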
Fixes: 484837601d4d ("powerpc/mm: Add radix support for hugetlb") Reviewed-by: Laurent Dufour Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hugetlbpage-radix.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c index 2486bee0f93e..97c7a39ebc00 100644 --- a/arch/powerpc/mm/hugetlbpage-radix.c +++ b/arch/powerpc/mm/hugetlbpage-radix.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include @@ -73,7 +74,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); - if (high_limit - len >= addr && + if (high_limit - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } @@ -83,7 +84,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, */ info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = PAGE_SIZE; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0;
From 3b4d07d2674f6b4a9281031f99d1f7efd325b16d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 26 Feb 2019 10:09:35 +0530 Subject: [PATCH 184/200] powerpc/mm/hash: Handle mmap_min_addr correctly in get_unmapped_area topdown search When doing a top-down search the low_limit is not PAGE_SIZE but rather max(PAGE_SIZE, mmap_min_addr). This handles cases in which mmap_min_addr > PAGE_SIZE. Fixes: fba2369e6ceb ("mm: use vm_unmapped_area() on powerpc architecture") Reviewed-by: Laurent Dufour Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slice.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 06898c13901d..aec91dbcdc0b 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -377,6 +378,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); unsigned long addr, found, prev; struct vm_unmapped_area_info info; + unsigned long min_addr = max(PAGE_SIZE, mmap_min_addr); info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; @@ -393,7 +395,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, if (high_limit > DEFAULT_MAP_WINDOW) addr += mm->context.slb_addr_limit - DEFAULT_MAP_WINDOW; - while (addr > PAGE_SIZE) { + while (addr > min_addr) { info.high_limit = addr; if (!slice_scan_available(addr - 1, available, 0, &addr)) continue; @@ -405,8 +407,8 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, * Check if we need to reduce the range, or if we can * extend it to cover the previous available slice.
*/ - if (addr < PAGE_SIZE) - addr = PAGE_SIZE; + if (addr < min_addr) + addr = min_addr; else if (slice_scan_available(addr - 1, available, 0, &prev)) { addr = prev; goto prev_slice; @@ -528,7 +530,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, addr = _ALIGN_UP(addr, page_size); slice_dbg(" aligned addr=%lx\n", addr); /* Ignore hint if it's too large or overlaps a VMA */ - if (addr > high_limit - len || + if (addr > high_limit - len || addr < mmap_min_addr || !slice_area_is_free(mm, addr, len)) addr = 0; } From ccd477028a202993b9ddca5d2404fdaca3b7a55c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 26 Feb 2019 18:51:07 +1000 Subject: [PATCH 185/200] powerpc/64s: Fix HV NMI vs HV interrupt recoverability test HV interrupts that use HSRR registers do not enter with MSR[RI] clear, but their entry code is not recoverable vs NMI, due to shared use of HSPRG1 as a scratch register to save r13. This means that a system reset or machine check that hits in HSRR interrupt entry can cause r13 to be silently corrupted. Fix this by marking NMIs non-recoverable if they land in HV interrupt ranges. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/asm-prototypes.h | 8 +++ arch/powerpc/include/asm/nmi.h | 2 + arch/powerpc/kernel/exceptions-64s.S | 8 +++ arch/powerpc/kernel/mce.c | 3 ++ arch/powerpc/kernel/traps.c | 66 +++++++++++++++++++++++ 5 files changed, 87 insertions(+) diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 1484df6779ab..e01f31fb0865 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -51,6 +51,14 @@ int exit_vmx_usercopy(void); int enter_vmx_ops(void); void *exit_vmx_ops(void *dest); +/* Exceptions */ +#ifdef CONFIG_PPC_POWERNV +extern unsigned long real_trampolines_start; +extern unsigned long real_trampolines_end; +extern unsigned long virt_trampolines_start; +extern unsigned long virt_trampolines_end; +#endif + /* Traps */ long machine_check_early(struct pt_regs *regs); long hmi_exception_realmode(struct pt_regs *regs); diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h index bd9ba8defd72..84b4cfe73edd 100644 --- a/arch/powerpc/include/asm/nmi.h +++ b/arch/powerpc/include/asm/nmi.h @@ -14,4 +14,6 @@ extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask, #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace #endif +extern void hv_nmi_check_nonrecoverable(struct pt_regs *regs); + #endif /* _ASM_NMI_H */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index b179b8b5d3f0..76442af8c191 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -68,6 +68,14 @@ OPEN_FIXED_SECTION(real_vectors, 0x0100, 0x1900) OPEN_FIXED_SECTION(real_trampolines, 0x1900, 0x4000) OPEN_FIXED_SECTION(virt_vectors, 0x4000, 0x5900) OPEN_FIXED_SECTION(virt_trampolines, 0x5900, 0x7000) + +#ifdef CONFIG_PPC_POWERNV + .globl real_trampolines_start + .globl real_trampolines_end + .globl virt_trampolines_start + .globl virt_trampolines_end +#endif + #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * Data area reserved for FWNMI option. 
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index d501b48f287e..b5fec1f9751a 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -31,6 +31,7 @@ #include #include +#include static DEFINE_PER_CPU(int, mce_nest_count); static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); @@ -490,6 +491,8 @@ long machine_check_early(struct pt_regs *regs) { long handled = 0; + hv_nmi_check_nonrecoverable(regs); + /* * See if platform is capable of handling machine check. */ diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index b25bc8af7d38..eee8f843f3d6 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -369,6 +369,70 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) force_sig_fault(signr, code, (void __user *)addr, current); } +/* + * The interrupt architecture has a quirk in that the HV interrupts excluding + * the NMIs (0x100 and 0x200) do not clear MSR[RI] at entry. The first thing + * that an interrupt handler must do is save off a GPR into a scratch register, + * and all interrupts on POWERNV (HV=1) use the HSPRG1 register as scratch. + * Therefore an NMI can clobber an HV interrupt's live HSPRG1 without noticing + * that it is non-reentrant, which leads to random data corruption. + * + * The solution is for NMI interrupts in HV mode to check if they originated + * from these critical HV interrupt regions. If so, then mark them not + * recoverable. + * + * An alternative would be for HV NMIs to use SPRG for scratch to avoid the + * HSPRG1 clobber, however this would cause guest SPRG to be clobbered. Linux + * guests should always have MSR[RI]=0 when their scratch SPRG is in use, so + * that would work. However any other guest OS that may have the SPRG live + * and MSR[RI]=1 could encounter silent corruption. + * + * Builds that do not support KVM could take this second option to increase + * the recoverability of NMIs. + */ +void hv_nmi_check_nonrecoverable(struct pt_regs *regs) +{ +#ifdef CONFIG_PPC_POWERNV + unsigned long kbase = (unsigned long)_stext; + unsigned long nip = regs->nip; + + if (!(regs->msr & MSR_RI)) + return; + if (!(regs->msr & MSR_HV)) + return; + if (regs->msr & MSR_PR) + return; + + /* + * Now test if the interrupt has hit a range that may be using + * HSPRG1 without having RI=0 (i.e., an HSRR interrupt). The + * problem ranges all run un-relocated. Test real and virt modes + * at the same time by dropping the high bit of the nip (virt mode + * entry points still have the +0x4000 offset). + */ + nip &= ~0xc000000000000000ULL; + if ((nip >= 0x500 && nip < 0x600) || (nip >= 0x4500 && nip < 0x4600)) + goto nonrecoverable; + if ((nip >= 0x980 && nip < 0xa00) || (nip >= 0x4980 && nip < 0x4a00)) + goto nonrecoverable; + if ((nip >= 0xe00 && nip < 0xec0) || (nip >= 0x4e00 && nip < 0x4ec0)) + goto nonrecoverable; + if ((nip >= 0xf80 && nip < 0xfa0) || (nip >= 0x4f80 && nip < 0x4fa0)) + goto nonrecoverable; + /* Trampoline code runs un-relocated so subtract kbase.
*/ + if (nip >= real_trampolines_start - kbase && + nip < real_trampolines_end - kbase) + goto nonrecoverable; + if (nip >= virt_trampolines_start - kbase && + nip < virt_trampolines_end - kbase) + goto nonrecoverable; + return; + +nonrecoverable: + regs->msr &= ~MSR_RI; +#endif +} + void system_reset_exception(struct pt_regs *regs) { /* @@ -379,6 +443,8 @@ void system_reset_exception(struct pt_regs *regs) if (!nested) nmi_enter(); + hv_nmi_check_nonrecoverable(regs); + __this_cpu_inc(irq_stat.sreset_irqs); /* See if any machine dependent calls
From cbf2ba952a70399c972f2a2126a4ac6f79437f37 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 26 Feb 2019 18:51:08 +1000 Subject: [PATCH 186/200] powerpc/64s: system reset interrupt preserve HSRRs Code that uses HSRR registers is not required to clear MSR[RI] by convention; however, the system reset NMI itself may use HSRR registers (e.g., to call OPAL) and clobber them. Rather than introduce the requirement to clear RI in order to use HSRRs, have the system reset interrupt save and restore HSRRs. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/traps.c | 25 ++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index eee8f843f3d6..a5757bef03cd 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -435,14 +435,32 @@ nonrecoverable: void system_reset_exception(struct pt_regs *regs) { + unsigned long hsrr0, hsrr1; + bool nested = in_nmi(); + bool saved_hsrrs = false; + /* * Avoid crashes in case of nested NMI exceptions. Recoverability * is determined by RI and in_nmi */ - bool nested = in_nmi(); if (!nested) nmi_enter(); + /* + * System reset can interrupt code where HSRRs are live and MSR[RI]=1. + * The system reset interrupt itself may clobber HSRRs (e.g., to call + * OPAL), so save them here and restore them before returning. + * + * Machine checks don't need to save HSRRs, as the real mode handler + * is careful to avoid them, and the regular handler is not delivered + * as an NMI. + */ + if (cpu_has_feature(CPU_FTR_HVMODE)) { + hsrr0 = mfspr(SPRN_HSRR0); + hsrr1 = mfspr(SPRN_HSRR1); + saved_hsrrs = true; + } + hv_nmi_check_nonrecoverable(regs); __this_cpu_inc(irq_stat.sreset_irqs); @@ -492,6 +510,11 @@ out: if (!(regs->msr & MSR_RI)) nmi_panic(regs, "Unrecoverable System Reset"); + if (saved_hsrrs) { + mtspr(SPRN_HSRR0, hsrr0); + mtspr(SPRN_HSRR1, hsrr1); + } + if (!nested) nmi_exit();
From e779fc93643c1181b0164745a537986a525850ca Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 26 Feb 2019 18:51:09 +1000 Subject: [PATCH 187/200] powerpc/64s: Prepare to handle data interrupts vs d-side MCE reentrancy A subsequent fix for data interrupts (those that set DAR / DSISR) requires some interrupt macros to be open-coded, and also requires the 0x300 interrupt handler to be moved out-of-line. This patch does that without changing behaviour, which makes the later fix a smaller change. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 48 ++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 76442af8c191..680197df4aea 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -574,8 +574,23 @@ EXC_COMMON_BEGIN(mce_return) RFI_TO_KERNEL b .
-EXC_REAL(data_access, 0x300, 0x80) -EXC_VIRT(data_access, 0x4300, 0x80, 0x300) +EXC_REAL_BEGIN(data_access, 0x300, 0x80) +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXGEN) + b tramp_real_data_access +EXC_REAL_END(data_access, 0x300, 0x80) + +TRAMP_REAL_BEGIN(tramp_real_data_access) +EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, 0x300) +EXCEPTION_PROLOG_2(data_access_common, EXC_STD) + +EXC_VIRT_BEGIN(data_access, 0x4300, 0x80) +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXGEN) +EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x300) +EXCEPTION_PROLOG_2_RELON(data_access_common, EXC_STD) +EXC_VIRT_END(data_access, 0x4300, 0x80) + TRAMP_KVM_SKIP(PACA_EXGEN, 0x300) EXC_COMMON_BEGIN(data_access_common) @@ -604,11 +619,20 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) -EXCEPTION_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, KVMTEST_PR, 0x380); +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXSLB) + b tramp_real_data_access_slb EXC_REAL_END(data_access_slb, 0x380, 0x80) +TRAMP_REAL_BEGIN(tramp_real_data_access_slb) +EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) +EXCEPTION_PROLOG_2(data_access_slb_common, EXC_STD) + EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) -EXCEPTION_RELON_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, NOTEST, 0x380); +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXSLB) +EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) +EXCEPTION_PROLOG_2_RELON(data_access_slb_common, EXC_STD) EXC_VIRT_END(data_access_slb, 0x4380, 0x80) TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) @@ -711,8 +735,20 @@ TRAMP_KVM_HV(PACA_EXGEN, 0x500) EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) -EXC_REAL(alignment, 0x600, 0x100) -EXC_VIRT(alignment, 0x4600, 0x100, 0x600) +EXC_REAL_BEGIN(alignment, 0x600, 0x100) +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXGEN) +EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, 0x600) +EXCEPTION_PROLOG_2(alignment_common, EXC_STD) +EXC_REAL_END(alignment, 0x600, 0x100) + +EXC_VIRT_BEGIN(alignment, 0x4600, 0x100) +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXGEN) +EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x600) +EXCEPTION_PROLOG_2_RELON(alignment_common, EXC_STD) +EXC_VIRT_END(alignment, 0x4600, 0x100) + TRAMP_KVM(PACA_EXGEN, 0x600) EXC_COMMON_BEGIN(alignment_common) mfspr r10,SPRN_DAR
From 38555434a910a657ba6d7d06a4fe0376c8b04685 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 26 Feb 2019 18:51:10 +1000 Subject: [PATCH 188/200] powerpc/64s: Fix data interrupts vs d-side MCE reentrancy Handlers for interrupts that set DAR / DSISR set MSR[RI] before those SPRs are read. If a d-side machine check hits in this window, DAR / DSISR will be clobbered silently, leading to random corruption. Fix this by having handlers save those registers before setting MSR[RI].
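The required ordering can be sketched in C (the names and structure here are invented for illustration; the real handlers are in assembly):

	/* toy model: a nested machine check may clobber the live SPRs */
	struct spr_copy { unsigned long dar, dsisr; };

	static void handler_entry(volatile struct spr_copy *live,
				  struct spr_copy *saved, int *msr_ri)
	{
		/* snapshot DAR/DSISR while still marked unrecoverable */
		saved->dar = live->dar;
		saved->dsisr = live->dsisr;
		/* only now advertise recoverability: an MCE from here on
		 * may clobber *live, but the copies are already safe */
		*msr_ri = 1;
	}

Doing the two steps in the opposite order opens the window described above.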
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 36 ++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 680197df4aea..99312328ee66 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -582,12 +582,25 @@ EXC_REAL_END(data_access, 0x300, 0x80) TRAMP_REAL_BEGIN(tramp_real_data_access) EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, 0x300) + /* + * DAR/DSISR must be read before setting MSR[RI], because + * a d-side MCE will clobber those registers so is not + * recoverable if they are live. + */ + mfspr r10,SPRN_DAR + mfspr r11,SPRN_DSISR + std r10,PACA_EXGEN+EX_DAR(r13) + stw r11,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_2(data_access_common, EXC_STD) EXC_VIRT_BEGIN(data_access, 0x4300, 0x80) SET_SCRATCH0(r13) /* save r13 */ EXCEPTION_PROLOG_0(PACA_EXGEN) EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x300) + mfspr r10,SPRN_DAR + mfspr r11,SPRN_DSISR + std r10,PACA_EXGEN+EX_DAR(r13) + stw r11,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_2_RELON(data_access_common, EXC_STD) EXC_VIRT_END(data_access, 0x4300, 0x80) @@ -598,11 +611,8 @@ EXC_COMMON_BEGIN(data_access_common) * Here r13 points to the paca, r9 contains the saved CR, * SRR0 and SRR1 are saved in r11 and r12, * r9 - r13 are saved in paca->exgen. + * EX_DAR and EX_DSISR have saved DAR/DSISR */ - mfspr r10,SPRN_DAR - std r10,PACA_EXGEN+EX_DAR(r13) - mfspr r10,SPRN_DSISR - stw r10,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN) RECONCILE_IRQ_STATE(r10, r11) ld r12,_MSR(r1) @@ -626,20 +636,22 @@ EXC_REAL_END(data_access_slb, 0x380, 0x80) TRAMP_REAL_BEGIN(tramp_real_data_access_slb) EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) + mfspr r10,SPRN_DAR + std r10,PACA_EXSLB+EX_DAR(r13) EXCEPTION_PROLOG_2(data_access_slb_common, EXC_STD) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) SET_SCRATCH0(r13) /* save r13 */ EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) + mfspr r10,SPRN_DAR + std r10,PACA_EXSLB+EX_DAR(r13) EXCEPTION_PROLOG_2_RELON(data_access_slb_common, EXC_STD) EXC_VIRT_END(data_access_slb, 0x4380, 0x80) TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) EXC_COMMON_BEGIN(data_access_slb_common) - mfspr r10,SPRN_DAR - std r10,PACA_EXSLB+EX_DAR(r13) EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB) ld r4,PACA_EXSLB+EX_DAR(r13) std r4,_DAR(r1) @@ -739,6 +751,10 @@ EXC_REAL_BEGIN(alignment, 0x600, 0x100) SET_SCRATCH0(r13) /* save r13 */ EXCEPTION_PROLOG_0(PACA_EXGEN) EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, 0x600) + mfspr r10,SPRN_DAR + mfspr r11,SPRN_DSISR + std r10,PACA_EXGEN+EX_DAR(r13) + stw r11,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_2(alignment_common, EXC_STD) EXC_REAL_END(alignment, 0x600, 0x100) @@ -746,15 +762,15 @@ EXC_VIRT_BEGIN(alignment, 0x4600, 0x100) SET_SCRATCH0(r13) /* save r13 */ EXCEPTION_PROLOG_0(PACA_EXGEN) EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x600) + mfspr r10,SPRN_DAR + mfspr r11,SPRN_DSISR + std r10,PACA_EXGEN+EX_DAR(r13) + stw r11,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_2_RELON(alignment_common, EXC_STD) EXC_VIRT_END(alignment, 0x4600, 0x100) TRAMP_KVM(PACA_EXGEN, 0x600) EXC_COMMON_BEGIN(alignment_common) - mfspr r10,SPRN_DAR - std r10,PACA_EXGEN+EX_DAR(r13) - mfspr r10,SPRN_DSISR - stw r10,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_COMMON(0x600, PACA_EXGEN) ld r3,PACA_EXGEN+EX_DAR(r13) lwz r4,PACA_EXGEN+EX_DSISR(r13) From 75d9fc7fd94eb43cdf0bec04499a27ced780af19 Mon Sep 17 
00:00:00 2001 From: Nicholas Piggin Date: Tue, 26 Feb 2019 19:30:35 +1000 Subject: [PATCH 189/200] powerpc/powernv: move OPAL call wrapper tracing and interrupt handling to C The OPAL call wrapper gets interrupt disabling wrong. It disables interrupts just by clearing MSR[EE], which has two problems: - It doesn't call into the IRQ tracing subsystem, which means tracing across OPAL calls does not always notice IRQs have been disabled. - It doesn't go through the IRQ soft-mask code, which causes a minor bug. MSR[EE] cannot be restored by saving the MSR then clearing MSR[EE], because a racing interrupt while soft-masked could clear MSR[EE] between the two steps. This can cause MSR[EE] to be incorrectly enabled when the OPAL call returns. Fortunately that should only result in another masked interrupt being taken to disable MSR[EE] again, but it's a bit sloppy. The existing code also saves MSR to PACA, which is not re-entrant if there is a nested OPAL call from different MSR contexts, which can happen these days with SRESET interrupts on bare metal. To fix these issues, move the tracing and IRQ handling code to C, and call into asm just for the low level call when everything is ready to go. Save the MSR on stack rather than PACA. Performance cost is kept to a minimum with a few optimisations: - The endian switch upon return is combined with the MSR restore, which avoids an expensive context synchronizing operation for LE kernels. This makes up for the additional mtmsrd to enable interrupts with local_irq_enable(). - blr is now used to return from the opal_* functions that are called as C functions, to avoid link stack corruption. This requires a skiboot fix as well to keep the call stack balanced. A NULL call is more costly after this (410ns->430ns on POWER9), but OPAL calls are generally not performance critical at this scale.
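The soft-mask bug can be sketched as follows (simplified pseudo-C of the old wrapper; mfmsr()/mtmsr() stand in for the actual asm MSR accesses):

	unsigned long msr = mfmsr();	/* snapshot, MSR[EE]=1 */
	/* a masked interrupt taken here clears MSR[EE] and relies on
	 * the soft mask to keep it clear until irqs are re-enabled */
	mtmsr(msr & ~MSR_EE);		/* enter OPAL with EE off */
	/* ... OPAL call ... */
	mtmsr(msr);			/* may set EE=1 again behind the
					 * soft-mask code's back */

The C wrapper instead brackets the call with local_save_flags() / hard_irq_disable() and local_irq_restore(), which keeps the soft-mask state consistent.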
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/asm-prototypes.h | 10 +- arch/powerpc/platforms/powernv/Makefile | 5 +- arch/powerpc/platforms/powernv/opal-call.c | 283 +++++++++++++++ .../powerpc/platforms/powernv/opal-wrappers.S | 342 ++---------------- 4 files changed, 327 insertions(+), 313 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/opal-call.c diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index e01f31fb0865..effdd096fa4c 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -37,13 +37,11 @@ void kexec_copy_flush(struct kimage *image); extern struct static_key hcall_tracepoint_key; void __trace_hcall_entry(unsigned long opcode, unsigned long *args); void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf); -/* OPAL tracing */ -#ifdef CONFIG_JUMP_LABEL -extern struct static_key opal_tracepoint_key; -#endif -void __trace_opal_entry(unsigned long opcode, unsigned long *args); -void __trace_opal_exit(long opcode, unsigned long retval); +/* OPAL */ +int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3, + int64_t a4, int64_t a5, int64_t a6, int64_t a7, + int64_t opcode, uint64_t msr); /* VMX copying */ int enter_vmx_usercopy(void); diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index b540ce8eec55..da2e99efbd04 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += setup.o opal-wrappers.o opal.o opal-async.o idle.o -obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o +obj-y += setup.o opal-call.o opal-wrappers.o opal.o opal-async.o +obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o @@ -11,7 +11,6 @@ obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o obj-$(CONFIG_PPC_SCOM) += opal-xscom.o obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o -obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o obj-$(CONFIG_PERF_EVENTS) += opal-imc.o obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c new file mode 100644 index 000000000000..578757d403ab --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#ifdef CONFIG_TRACEPOINTS +/* + * Since the tracing code might execute OPAL calls we need to guard against + * recursion. 
+ */ +static DEFINE_PER_CPU(unsigned int, opal_trace_depth); + +static void __trace_opal_entry(s64 a0, s64 a1, s64 a2, s64 a3, + s64 a4, s64 a5, s64 a6, s64 a7, + unsigned long opcode) +{ + unsigned int *depth; + unsigned long args[8]; + + depth = this_cpu_ptr(&opal_trace_depth); + + if (*depth) + return; + + args[0] = a0; + args[1] = a1; + args[2] = a2; + args[3] = a3; + args[4] = a4; + args[5] = a5; + args[6] = a6; + args[7] = a7; + + (*depth)++; + trace_opal_entry(opcode, &args[0]); + (*depth)--; +} + +static void __trace_opal_exit(unsigned long opcode, unsigned long retval) +{ + unsigned int *depth; + + depth = this_cpu_ptr(&opal_trace_depth); + + if (*depth) + return; + + (*depth)++; + trace_opal_exit(opcode, retval); + (*depth)--; +} + +static DEFINE_STATIC_KEY_FALSE(opal_tracepoint_key); + +int opal_tracepoint_regfunc(void) +{ + static_branch_inc(&opal_tracepoint_key); + return 0; +} + +void opal_tracepoint_unregfunc(void) +{ + static_branch_dec(&opal_tracepoint_key); +} + +static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3, + s64 a4, s64 a5, s64 a6, s64 a7, + unsigned long opcode, unsigned long msr) +{ + s64 ret; + + __trace_opal_entry(a0, a1, a2, a3, a4, a5, a6, a7, opcode); + ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr); + __trace_opal_exit(opcode, ret); + + return ret; +} + +#define DO_TRACE (static_branch_unlikely(&opal_tracepoint_key)) + +#else /* CONFIG_TRACEPOINTS */ + +static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3, + s64 a4, s64 a5, s64 a6, s64 a7, + unsigned long opcode, unsigned long msr) +{ + return 0; /* never called: DO_TRACE is false in this configuration */ +} + +#define DO_TRACE false +#endif /* CONFIG_TRACEPOINTS */ + +static int64_t opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3, + int64_t a4, int64_t a5, int64_t a6, int64_t a7, int64_t opcode) +{ + unsigned long flags; + unsigned long msr = mfmsr(); + bool mmu = (msr & (MSR_IR|MSR_DR)); + int64_t ret; + + msr &= ~MSR_EE; + + if (unlikely(!mmu)) + return __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr); + + local_save_flags(flags); + hard_irq_disable(); + + if (DO_TRACE) { + ret = __opal_call_trace(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr); + } else { + ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr); + } + + local_irq_restore(flags); + + return ret; +} + +#define OPAL_CALL(name, opcode) \ +int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3, \ + int64_t a4, int64_t a5, int64_t a6, int64_t a7) \ +{ \ + return opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode); \ +} + +OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL); +OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); +OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); +OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE); +OPAL_CALL(opal_rtc_read, OPAL_RTC_READ); +OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE); +OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN); +OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT); +OPAL_CALL(opal_cec_reboot2, OPAL_CEC_REBOOT2); +OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM); +OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM); +OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT); +OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS); +OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY); +OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY); +OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE); +OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD); +OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD); +OPAL_CALL(opal_pci_config_write_byte,
OPAL_PCI_CONFIG_WRITE_BYTE); +OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD); +OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD); +OPAL_CALL(opal_set_xive, OPAL_SET_XIVE); +OPAL_CALL(opal_get_xive, OPAL_GET_XIVE); +OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER); +OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS); +OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR); +OPAL_CALL(opal_pci_eeh_freeze_set, OPAL_PCI_EEH_FREEZE_SET); +OPAL_CALL(opal_pci_err_inject, OPAL_PCI_ERR_INJECT); +OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC); +OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE); +OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW); +OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW); +OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY); +OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE); +OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV); +OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE); +OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE); +OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE); +OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE); +OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32); +OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64); +OPAL_CALL(opal_start_cpu, OPAL_START_CPU); +OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS); +OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL); +OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW); +OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL); +OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET); +OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA); +OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA); +OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB); +OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT); +OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR); +OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS); +OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS); +OPAL_CALL(opal_get_dpo_status, OPAL_GET_DPO_STATUS); +OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED); +OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR); +OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL); +OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI); +OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2); +OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ); +OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE); +OPAL_CALL(opal_lpc_read, OPAL_LPC_READ); +OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE); +OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU); +OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS); +OPAL_CALL(opal_read_elog, OPAL_ELOG_READ); +OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK); +OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE); +OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND); +OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE); +OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE); +OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE); +OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE); +OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE); +OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN); +OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT); +OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO); +OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2); +OPAL_CALL(opal_dump_read, OPAL_DUMP_READ); +OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK); 
+OPAL_CALL(opal_get_msg, OPAL_GET_MSG); +OPAL_CALL(opal_write_oppanel_async, OPAL_WRITE_OPPANEL_ASYNC); +OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION); +OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND); +OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT); +OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); +OPAL_CALL(opal_get_param, OPAL_GET_PARAM); +OPAL_CALL(opal_set_param, OPAL_SET_PARAM); +OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI); +OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE); +OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG); +OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION); +OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION); +OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CAPI_MODE); +OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO); +OPAL_CALL(opal_tpo_read, OPAL_READ_TPO); +OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND); +OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV); +OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST); +OPAL_CALL(opal_flash_read, OPAL_FLASH_READ); +OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE); +OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE); +OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG); +OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR); +OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR); +OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH); +OPAL_CALL(opal_get_device_tree, OPAL_GET_DEVICE_TREE); +OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE); +OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE); +OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE); +OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR); +OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR); +OPAL_CALL(opal_int_eoi, OPAL_INT_EOI); +OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); +OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); +OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR); +OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET); +OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO); +OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG); +OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG); +OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO); +OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO); +OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE); +OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK); +OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK); +OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ); +OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ); +OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); +OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); +OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); +OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); +OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); +OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); +OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); +OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); +OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT); +OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START); +OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP); +OPAL_CALL(opal_pci_set_p2p, OPAL_PCI_SET_P2P); +OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP); +OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP); +OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO); +OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO); +OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR); 
+OPAL_CALL(opal_quiesce, OPAL_QUIESCE); +OPAL_CALL(opal_npu_spa_setup, OPAL_NPU_SPA_SETUP); +OPAL_CALL(opal_npu_spa_clear_cache, OPAL_NPU_SPA_CLEAR_CACHE); +OPAL_CALL(opal_npu_tl_set, OPAL_NPU_TL_SET); +OPAL_CALL(opal_pci_get_pbcq_tunnel_bar, OPAL_PCI_GET_PBCQ_TUNNEL_BAR); +OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR); +OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64); +OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE); +OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT); diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index f4875fe3f8ff..7d2052d8af9d 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -17,317 +17,51 @@ #include #include - .section ".text" - -#ifdef CONFIG_TRACEPOINTS -#ifdef CONFIG_JUMP_LABEL -#define OPAL_BRANCH(LABEL) \ - ARCH_STATIC_BRANCH(LABEL, opal_tracepoint_key) -#else - - .section ".toc","aw" - - .globl opal_tracepoint_refcount -opal_tracepoint_refcount: - .8byte 0 - - .section ".text" + .section ".text" /* - * We branch around this in early init by using an unconditional cpu - * feature. + * r3-r10 - OPAL call arguments + * STK_PARAM(R11) - OPAL opcode + * STK_PARAM(R12) - MSR to restore */ -#define OPAL_BRANCH(LABEL) \ -BEGIN_FTR_SECTION; \ - b 1f; \ -END_FTR_SECTION(0, 1); \ - ld r11,opal_tracepoint_refcount@toc(r2); \ - cmpdi r11,0; \ - bne- LABEL; \ -1: - -#endif - -#else -#define OPAL_BRANCH(LABEL) -#endif - -/* - * DO_OPAL_CALL assumes: - * r0 = opal call token - * r12 = msr - * LR has been saved - */ -#define DO_OPAL_CALL() \ - mfcr r11; \ - stw r11,8(r1); \ - li r11,0; \ - ori r11,r11,MSR_EE; \ - std r12,PACASAVEDMSR(r13); \ - andc r12,r12,r11; \ - mtmsrd r12,1; \ - LOAD_REG_ADDR(r11,opal_return); \ - mtlr r11; \ - li r11,MSR_DR|MSR_IR|MSR_LE;\ - andc r12,r12,r11; \ - mtspr SPRN_HSRR1,r12; \ - LOAD_REG_ADDR(r11,opal); \ - ld r12,8(r11); \ - ld r2,0(r11); \ - mtspr SPRN_HSRR0,r12; \ +_GLOBAL_TOC(__opal_call) + mflr r0 + std r0,PPC_LR_STKOFF(r1) + ld r12,STK_PARAM(R12)(r1) + li r0,MSR_IR|MSR_DR|MSR_LE + andc r12,r12,r0 + LOAD_REG_ADDR(r11, opal_return) + mtlr r11 + LOAD_REG_ADDR(r11, opal) + ld r2,0(r11) + ld r11,8(r11) + mtspr SPRN_HSRR0,r11 + mtspr SPRN_HSRR1,r12 + /* set token to r0 */ + ld r0,STK_PARAM(R11)(r1) hrfid - -#define OPAL_CALL(name, token) \ - _GLOBAL_TOC(name); \ - mfmsr r12; \ - mflr r0; \ - andi. r11,r12,MSR_IR|MSR_DR; \ - std r0,PPC_LR_STKOFF(r1); \ - li r0,token; \ - beq opal_real_call; \ - OPAL_BRANCH(opal_tracepoint_entry) \ - DO_OPAL_CALL() - - opal_return: /* - * Fixup endian on OPAL return... we should be able to simplify - * this by instead converting the below trampoline to a set of - * bytes (always BE) since MSR:LE will end up fixed up as a side - * effect of the rfid. + * Restore MSR on OPAL return. The MSR is set to big-endian. 
*/ - FIXUP_ENDIAN_HV - ld r2,PACATOC(r13); - lwz r4,8(r1); - ld r5,PPC_LR_STKOFF(r1); - ld r6,PACASAVEDMSR(r13); - mtcr r4; - mtspr SPRN_HSRR0,r5; - mtspr SPRN_HSRR1,r6; - hrfid +#ifdef __BIG_ENDIAN__ + ld r11,STK_PARAM(R12)(r1) + mtmsrd r11 +#else + /* Endian can only be switched with rfi, must byte reverse MSR load */ + .short 0x4039 /* li r10,STK_PARAM(R12) */ + .byte (STK_PARAM(R12) >> 8) & 0xff + .byte STK_PARAM(R12) & 0xff -opal_real_call: - mfcr r11 - stw r11,8(r1) - /* Set opal return address */ - LOAD_REG_ADDR(r11, opal_return_realmode) - mtlr r11 - li r11,MSR_LE - andc r12,r12,r11 - mtspr SPRN_HSRR1,r12 - LOAD_REG_ADDR(r11,opal) - ld r12,8(r11) - ld r2,0(r11) - mtspr SPRN_HSRR0,r12 - hrfid - -opal_return_realmode: - FIXUP_ENDIAN_HV - ld r2,PACATOC(r13); - lwz r11,8(r1); - ld r12,PPC_LR_STKOFF(r1) - mtcr r11; - mtlr r12 - blr - -#ifdef CONFIG_TRACEPOINTS -opal_tracepoint_entry: - stdu r1,-STACKFRAMESIZE(r1) - std r0,STK_REG(R23)(r1) - std r3,STK_REG(R24)(r1) - std r4,STK_REG(R25)(r1) - std r5,STK_REG(R26)(r1) - std r6,STK_REG(R27)(r1) - std r7,STK_REG(R28)(r1) - std r8,STK_REG(R29)(r1) - std r9,STK_REG(R30)(r1) - std r10,STK_REG(R31)(r1) - mr r3,r0 - addi r4,r1,STK_REG(R24) - bl __trace_opal_entry - ld r0,STK_REG(R23)(r1) - ld r3,STK_REG(R24)(r1) - ld r4,STK_REG(R25)(r1) - ld r5,STK_REG(R26)(r1) - ld r6,STK_REG(R27)(r1) - ld r7,STK_REG(R28)(r1) - ld r8,STK_REG(R29)(r1) - ld r9,STK_REG(R30)(r1) - ld r10,STK_REG(R31)(r1) - - /* setup LR so we return via tracepoint_return */ - LOAD_REG_ADDR(r11,opal_tracepoint_return) - std r11,16(r1) - - mfmsr r12 - DO_OPAL_CALL() - -opal_tracepoint_return: - std r3,STK_REG(R31)(r1) - mr r4,r3 - ld r3,STK_REG(R23)(r1) - bl __trace_opal_exit - ld r3,STK_REG(R31)(r1) - addi r1,r1,STACKFRAMESIZE - ld r0,16(r1) + .long 0x280c6a7d /* ldbrx r11,r10,r1 */ + .long 0x05009f42 /* bcl 20,31,$+4 */ + .long 0xa602487d /* mflr r10 */ + .long 0x14004a39 /* addi r10,r10,20 */ + .long 0xa64b5a7d /* mthsrr0 r10 */ + .long 0xa64b7b7d /* mthsrr1 r11 */ + .long 0x2402004c /* hrfid */ +#endif + ld r2,PACATOC(r13) + ld r0,PPC_LR_STKOFF(r1) mtlr r0 blr -#endif - - -OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL); -OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); -OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); -OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE); -OPAL_CALL(opal_rtc_read, OPAL_RTC_READ); -OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE); -OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN); -OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT); -OPAL_CALL(opal_cec_reboot2, OPAL_CEC_REBOOT2); -OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM); -OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM); -OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT); -OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS); -OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY); -OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY); -OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE); -OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD); -OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD); -OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE); -OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD); -OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD); -OPAL_CALL(opal_set_xive, OPAL_SET_XIVE); -OPAL_CALL(opal_get_xive, OPAL_GET_XIVE); -OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER); -OPAL_CALL(opal_pci_eeh_freeze_status, 
OPAL_PCI_EEH_FREEZE_STATUS); -OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR); -OPAL_CALL(opal_pci_eeh_freeze_set, OPAL_PCI_EEH_FREEZE_SET); -OPAL_CALL(opal_pci_err_inject, OPAL_PCI_ERR_INJECT); -OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC); -OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE); -OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW); -OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW); -OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY); -OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE); -OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV); -OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE); -OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE); -OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE); -OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE); -OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE); -OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE); -OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32); -OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64); -OPAL_CALL(opal_start_cpu, OPAL_START_CPU); -OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS); -OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL); -OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW); -OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL); -OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET); -OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA); -OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA); -OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB); -OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT); -OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR); -OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS); -OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS); -OPAL_CALL(opal_get_dpo_status, OPAL_GET_DPO_STATUS); -OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED); -OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR); -OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL); -OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI); -OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2); -OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ); -OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE); -OPAL_CALL(opal_lpc_read, OPAL_LPC_READ); -OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE); -OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU); -OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS); -OPAL_CALL(opal_read_elog, OPAL_ELOG_READ); -OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK); -OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE); -OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND); -OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE); -OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE); -OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE); -OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE); -OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE); -OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN); -OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT); -OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO); -OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2); -OPAL_CALL(opal_dump_read, OPAL_DUMP_READ); -OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK); -OPAL_CALL(opal_get_msg, OPAL_GET_MSG); -OPAL_CALL(opal_write_oppanel_async, OPAL_WRITE_OPPANEL_ASYNC); -OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION); -OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND); -OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT); -OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); -OPAL_CALL(opal_get_param, OPAL_GET_PARAM); 
-OPAL_CALL(opal_set_param, OPAL_SET_PARAM); -OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI); -OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE); -OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG); -OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION); -OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION); -OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CAPI_MODE); -OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO); -OPAL_CALL(opal_tpo_read, OPAL_READ_TPO); -OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND); -OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV); -OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST); -OPAL_CALL(opal_flash_read, OPAL_FLASH_READ); -OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE); -OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE); -OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG); -OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR); -OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR); -OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH); -OPAL_CALL(opal_get_device_tree, OPAL_GET_DEVICE_TREE); -OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE); -OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE); -OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE); -OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR); -OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR); -OPAL_CALL(opal_int_eoi, OPAL_INT_EOI); -OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); -OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); -OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR); -OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET); -OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO); -OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG); -OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG); -OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO); -OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO); -OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE); -OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK); -OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK); -OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ); -OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ); -OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); -OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); -OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); -OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); -OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); -OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); -OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); -OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); -OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT); -OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START); -OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP); -OPAL_CALL(opal_pci_set_p2p, OPAL_PCI_SET_P2P); -OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP); -OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP); -OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO); -OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO); -OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR); -OPAL_CALL(opal_quiesce, OPAL_QUIESCE); -OPAL_CALL(opal_npu_spa_setup, OPAL_NPU_SPA_SETUP); -OPAL_CALL(opal_npu_spa_clear_cache, OPAL_NPU_SPA_CLEAR_CACHE); -OPAL_CALL(opal_npu_tl_set, OPAL_NPU_TL_SET); -OPAL_CALL(opal_pci_get_pbcq_tunnel_bar, OPAL_PCI_GET_PBCQ_TUNNEL_BAR); -OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR); -OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64); 
-OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE); -OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT); From e7140639b1de65bba435a6bd772d134901141f86 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 25 Feb 2019 22:38:55 -0700 Subject: [PATCH 190/200] powerpc/xmon: Fix opcode being uninitialized in print_insn_powerpc When building with -Wsometimes-uninitialized, Clang warns: arch/powerpc/xmon/ppc-dis.c:157:7: warning: variable 'opcode' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized] if (cpu_has_feature(CPU_FTRS_POWER9)) ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/powerpc/xmon/ppc-dis.c:167:7: note: uninitialized use occurs here if (opcode == NULL) ^~~~~~ arch/powerpc/xmon/ppc-dis.c:157:3: note: remove the 'if' if its condition is always true if (cpu_has_feature(CPU_FTRS_POWER9)) ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/powerpc/xmon/ppc-dis.c:132:38: note: initialize the variable 'opcode' to silence this warning const struct powerpc_opcode *opcode; ^ = NULL 1 warning generated. This warning seems to make no sense on the surface because opcode is set to NULL right below this statement. However, there is a comma instead of semicolon to end the dialect assignment, meaning that the opcode assignment only happens in the if statement. Properly terminate that line so that Clang no longer warns. Fixes: 5b102782c7f4 ("powerpc/xmon: Enable disassembly files (compilation changes)") Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Signed-off-by: Michael Ellerman --- arch/powerpc/xmon/ppc-dis.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/xmon/ppc-dis.c b/arch/powerpc/xmon/ppc-dis.c index 9deea5ee13f6..27f1e6415036 100644 --- a/arch/powerpc/xmon/ppc-dis.c +++ b/arch/powerpc/xmon/ppc-dis.c @@ -158,7 +158,7 @@ int print_insn_powerpc (unsigned long insn, unsigned long memaddr) dialect |= (PPC_OPCODE_POWER5 | PPC_OPCODE_POWER6 | PPC_OPCODE_POWER7 | PPC_OPCODE_POWER8 | PPC_OPCODE_POWER9 | PPC_OPCODE_HTM | PPC_OPCODE_ALTIVEC | PPC_OPCODE_ALTIVEC2 - | PPC_OPCODE_VSX | PPC_OPCODE_VSX3), + | PPC_OPCODE_VSX | PPC_OPCODE_VSX3); /* Get the major opcode of the insn. */ opcode = NULL; From 7b62f9bd2246b7d3d086e571397c14ba52645ef1 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Wed, 27 Feb 2019 14:02:29 +1100 Subject: [PATCH 191/200] powerpc/powernv: Make opal log only readable by root Currently the opal log is globally readable. It is kernel policy to limit the visibility of physical addresses / kernel pointers to root. Given this and the fact the opal log may contain this information it would be better to limit the readability to root. 
Fixes: bfc36894a48b ("powerpc/powernv: Add OPAL message log interface") Cc: stable@vger.kernel.org # v3.15+ Signed-off-by: Jordan Niethe Reviewed-by: Stewart Smith Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-msglog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c index acd3206dfae3..06628c71cef6 100644 --- a/arch/powerpc/platforms/powernv/opal-msglog.c +++ b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -98,7 +98,7 @@ static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj, } static struct bin_attribute opal_msglog_attr = { - .attr = {.name = "msglog", .mode = 0444}, + .attr = {.name = "msglog", .mode = 0400}, .read = opal_msglog_read };
From 27da80719ef132cf8c80eb406d5aeb37dddf78cc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 26 Feb 2019 18:18:48 +0000 Subject: [PATCH 192/200] powerpc/fsl: Fix the flush of branch predictor. The commit identified below adds the MC_BTB_FLUSH macro only when CONFIG_PPC_FSL_BOOK3E is defined. This results in the following error on some configs (seen several times with kisskb randconfig_defconfig) arch/powerpc/kernel/exceptions-64e.S:576: Error: Unrecognized opcode: `mc_btb_flush' make[3]: *** [scripts/Makefile.build:367: arch/powerpc/kernel/exceptions-64e.o] Error 1 make[2]: *** [scripts/Makefile.build:492: arch/powerpc/kernel] Error 2 make[1]: *** [Makefile:1043: arch/powerpc] Error 2 make: *** [Makefile:152: sub-make] Error 2 This patch adds a blank definition of MC_BTB_FLUSH for other cases. Fixes: 10c5e83afd4a ("powerpc/fsl: Flush the branch predictor at each kernel entry (64bit)") Cc: Diana Craciun Signed-off-by: Christophe Leroy Reviewed-by: Daniel Axtens Reviewed-by: Diana Craciun Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64e.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 4549ce8d4637..49381f32b374 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -338,6 +338,7 @@ ret_from_mc_except: #define GEN_BTB_FLUSH #define CRIT_BTB_FLUSH #define DBG_BTB_FLUSH +#define MC_BTB_FLUSH #define GDBELL_BTB_FLUSH #endif
From 11f5acce2fa43b015a8120fa7620fa4efd0a2952 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 13 Feb 2019 14:38:18 +1100 Subject: [PATCH 193/200] powerpc/powernv/ioda: Fix locked_vm counting for memory used by IOMMU tables We store 2 multilevel tables in iommu_table - one for the hardware and one with the corresponding userspace addresses. Before allocating the tables, the iommu_table_group_ops::get_table_size() hook returns the combined size of the two and the VFIO SPAPR TCE IOMMU driver adjusts the locked_vm counter correctly. When the table is actually allocated, the amount of allocated memory is stored in iommu_table::it_allocated_size and used to decrement the locked_vm counter when we release the memory used by the table; .get_table_size() and .create_table() calculate it independently but the result is expected to be the same. However the allocator does not add the userspace table size to .it_allocated_size, so when we destroy the table because of VFIO PCI unplug (i.e. VFIO container is gone but the userspace keeps running), we decrement locked_vm by just half of the size of the memory we are releasing.
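With invented numbers, if each of the two tables takes S bytes, the accounting over one create/destroy cycle looks like:

	locked_vm += 2 * S;	/* increment uses .get_table_size(): both tables */
	locked_vm -= S;		/* decrement uses it_allocated_size: hw table only */

leaking S per cycle.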
To make things worse, since we enabled on-demand allocation of indirect levels, it_allocated_size contains only the amount of memory actually allocated at table creation time, which can be just a fraction of the full size. This is not a problem when incrementing locked_vm (as the get_table_size() value is used), but it is when decrementing. As a result, we leak locked_vm and may not be able to allocate more IOMMU tables after a few iterations of hotplug/unplug.

This sets it_allocated_size in the pnv_pci_ioda2_ops::create_table() hook to what pnv_pci_ioda2_get_table_size() returns, so from now on we have a single place which calculates the maximum memory a table can occupy. The original meaning of it_allocated_size is somewhat lost now, though.

We do not ditch it_allocated_size entirely here, and we do not call get_table_size() from vfio_iommu_spapr_tce.c when decrementing locked_vm, as we may have multiple IOMMU groups per container; even though they are all supposed to have the same get_table_size() implementation, there is a small chance of failure or confusion.

Fixes: 090bad39b237 ("powerpc/powernv: Add indirect levels to it_userspace")
Signed-off-by: Alexey Kardashevskiy
Reviewed-by: David Gibson
Signed-off-by: Michael Ellerman
---
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 1 -
 arch/powerpc/platforms/powernv/pci-ioda.c     | 7 ++++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index 697449afb3f7..e28f03e1eb5e 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -313,7 +313,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 			page_shift);
 	tbl->it_level_size = 1ULL << (level_shift - 3);
 	tbl->it_indirect_levels = levels - 1;
-	tbl->it_allocated_size = total_allocated;
 	tbl->it_userspace = uas;
 	tbl->it_nid = nid;

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index e9986c9f779d..f1ce39f64329 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2524,8 +2524,13 @@ static long pnv_pci_ioda2_create_table_userspace(
 		int num, __u32 page_shift, __u64 window_size, __u32 levels,
 		struct iommu_table **ptbl)
 {
-	return pnv_pci_ioda2_create_table(table_group,
+	long ret = pnv_pci_ioda2_create_table(table_group,
 			num, page_shift, window_size, levels, true, ptbl);
+
+	if (!ret)
+		(*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size(
+				page_shift, window_size, levels);
+	return ret;
 }

 static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)

From bd3524feac214f0ab9693c6d4c0cb5be8e1318b9 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Fri, 1 Mar 2019 22:56:36 +1000
Subject: [PATCH 194/200] powerpc/64s: Fix unrelocated interrupt trampoline address test

The recent commit got this test wrong: it declared the assembler symbols the wrong way, and also used the wrong symbol name (xxx_start rather than start_xxx, see asm/head-64.h).
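An aside on the idiom the fix below adopts: a symbol defined in assembly or a linker script should be declared in C as an incomplete character array, so that the symbol's address is what the code uses. Declaring it as a plain integer variable instead makes the compiler load whatever bytes happen to sit at that address. A minimal sketch, with hypothetical region names:

/* Hedged sketch of the declaration idiom; the symbol names are made up.
 * With the array form, start_example_region decays to the address of
 * the symbol itself.
 */
extern char start_example_region[];
extern char end_example_region[];

static int addr_in_example_region(unsigned long addr)
{
	return addr >= (unsigned long)start_example_region &&
	       addr < (unsigned long)end_example_region;
}

/* By contrast, "extern unsigned long start_example_region;" would make
 * the comparison read the value stored at the symbol rather than use
 * its address, which is exactly the bug this patch removes.
 */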
Fixes: ccd477028a ("powerpc/64s: Fix HV NMI vs HV interrupt recoverability test") Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/asm-prototypes.h | 8 -------- arch/powerpc/include/asm/sections.h | 7 +++++++ arch/powerpc/kernel/exceptions-64s.S | 8 ++++---- arch/powerpc/kernel/traps.c | 9 +++++---- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index effdd096fa4c..296584e6dd55 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -49,14 +49,6 @@ int exit_vmx_usercopy(void); int enter_vmx_ops(void); void *exit_vmx_ops(void *dest); -/* Exceptions */ -#ifdef CONFIG_PPC_POWERNV -extern unsigned long real_trampolines_start; -extern unsigned long real_trampolines_end; -extern unsigned long virt_trampolines_start; -extern unsigned long virt_trampolines_end; -#endif - /* Traps */ long machine_check_early(struct pt_regs *regs); long hmi_exception_realmode(struct pt_regs *regs); diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index e335a8f846af..4a1664a8658d 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -17,6 +17,13 @@ extern char __end_interrupts[]; extern char __prom_init_toc_start[]; extern char __prom_init_toc_end[]; +#ifdef CONFIG_PPC_POWERNV +extern char start_real_trampolines[]; +extern char end_real_trampolines[]; +extern char start_virt_trampolines[]; +extern char end_virt_trampolines[]; +#endif + static inline int in_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr < (unsigned long)__init_end) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 99312328ee66..a5b8fbae56a0 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -70,10 +70,10 @@ OPEN_FIXED_SECTION(virt_vectors, 0x4000, 0x5900) OPEN_FIXED_SECTION(virt_trampolines, 0x5900, 0x7000) #ifdef CONFIG_PPC_POWERNV - .globl real_trampolines_start - .globl real_trampolines_end - .globl virt_trampolines_start - .globl virt_trampolines_end + .globl start_real_trampolines + .globl end_real_trampolines + .globl start_virt_trampolines + .globl end_virt_trampolines #endif #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index a5757bef03cd..a21200c6aaea 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -419,12 +419,13 @@ void hv_nmi_check_nonrecoverable(struct pt_regs *regs) goto nonrecoverable; if ((nip >= 0xf80 && nip < 0xfa0) || (nip >= 0x4f80 && nip < 0x4fa0)) goto nonrecoverable; + /* Trampoline code runs un-relocated so subtract kbase. 
 */
-	if (nip >= real_trampolines_start - kbase &&
-			nip < real_trampolines_end - kbase)
+	if (nip >= (unsigned long)(start_real_trampolines - kbase) &&
+			nip < (unsigned long)(end_real_trampolines - kbase))
 		goto nonrecoverable;
-	if (nip >= virt_trampolines_start - kbase &&
-			nip < virt_trampolines_end - kbase)
+	if (nip >= (unsigned long)(start_virt_trampolines - kbase) &&
+			nip < (unsigned long)(end_virt_trampolines - kbase))
 		goto nonrecoverable;
 	return;

From 6b1200facc051a3e487a52cbabd745f7c8f4e9f8 Mon Sep 17 00:00:00 2001
From: Firoz Khan
Date: Wed, 2 Jan 2019 20:32:03 +0530
Subject: [PATCH 195/200] powerpc: remove nargs from __SYSCALL

The __SYSCALL macro's arguments are the system call number, the system call entry name, and the number of arguments for the system call. The nargs argument in __SYSCALL(nr, entry, nargs) is neither calculated nor used anywhere, so it is better to reduce the implementation to __SYSCALL(nr, entry). This also unifies the implementation with some other architectures.

Signed-off-by: Firoz Khan
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/syscalls/syscalltbl.sh  | 4 ++--
 arch/powerpc/kernel/systbl.S                | 6 +++---
 arch/powerpc/platforms/cell/spu_callbacks.c | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/syscalls/syscalltbl.sh b/arch/powerpc/kernel/syscalls/syscalltbl.sh
index fd620490a542..f7393a7b18aa 100644
--- a/arch/powerpc/kernel/syscalls/syscalltbl.sh
+++ b/arch/powerpc/kernel/syscalls/syscalltbl.sh
@@ -13,10 +13,10 @@ emit() {
 	t_entry="$3"

 	while [ $t_nxt -lt $t_nr ]; do
-		printf "__SYSCALL(%s,sys_ni_syscall, )\n" "${t_nxt}"
+		printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}"
 		t_nxt=$((t_nxt+1))
 	done
-	printf "__SYSCALL(%s,%s, )\n" "${t_nxt}" "${t_entry}"
+	printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}"
 }

 grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (

diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
index 23265a28740b..02f28faba125 100644
--- a/arch/powerpc/kernel/systbl.S
+++ b/arch/powerpc/kernel/systbl.S
@@ -25,11 +25,11 @@
 .globl sys_call_table
 sys_call_table:
 #ifdef CONFIG_PPC64
-#define __SYSCALL(nr, entry, nargs) .8byte DOTSYM(entry)
+#define __SYSCALL(nr, entry) .8byte DOTSYM(entry)
 #include
 #undef __SYSCALL
 #else
-#define __SYSCALL(nr, entry, nargs) .long entry
+#define __SYSCALL(nr, entry) .long entry
 #include
 #undef __SYSCALL
 #endif
@@ -38,7 +38,7 @@ sys_call_table:
 .globl compat_sys_call_table
 compat_sys_call_table:
 #define compat_sys_sigsuspend	sys_sigsuspend
-#define __SYSCALL(nr, entry, nargs) .8byte DOTSYM(entry)
+#define __SYSCALL(nr, entry) .8byte DOTSYM(entry)
 #include
 #undef __SYSCALL
 #endif

diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c
index 125f2a5f02de..b5f35cbe9e21 100644
--- a/arch/powerpc/platforms/cell/spu_callbacks.c
+++ b/arch/powerpc/platforms/cell/spu_callbacks.c
@@ -34,7 +34,7 @@
  */

 static void *spu_syscall_table[] = {
-#define __SYSCALL(nr, entry, nargs) entry,
+#define __SYSCALL(nr, entry) entry,
 #include
 #undef __SYSCALL
 };

From 790845e2f12709d273d08ea7a2af7c2593689519 Mon Sep 17 00:00:00 2001
From: Rashmica Gupta
Date: Wed, 13 Feb 2019 10:29:49 +1100
Subject: [PATCH 196/200] powerpc/mm: Check secondary hash page table

We were always calling base_hpte_find() with primary = true, even when we wanted to check the secondary table.

mpe: I broke this when refactoring Rashmica's original patch.
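Stepping back to the __SYSCALL change above for a moment: it is a classic X-macro, where one generated list of __SYSCALL() lines is included several times and each site defines the macro to expand to whatever entry format it needs. A minimal self-contained sketch, with hypothetical names and the list written inline rather than generated, might look like:

/* Hedged illustration of the X-macro technique; not kernel code. */
static long example_restart(void) { return 0; }
static long example_exit(void)    { return 1; }

/* The "generated list": in the kernel this lives in an included header. */
#define EXAMPLE_SYSCALL_LIST \
	__SYSCALL(0, example_restart) \
	__SYSCALL(1, example_exit)

/* Expand the list into a function-pointer table. */
#define __SYSCALL(nr, entry) [nr] = entry,
static long (*example_syscall_table[])(void) = {
	EXAMPLE_SYSCALL_LIST
};
#undef __SYSCALL

With this layout, dropping the unused nargs parameter only requires touching each #define site, which is exactly what the diffs above do.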
Fixes: 1515ab932156 ("powerpc/mm: Dump hash table")
Signed-off-by: Rashmica Gupta
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/ptdump/hashpagetable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c
index 869294695048..b430e4e08af6 100644
--- a/arch/powerpc/mm/ptdump/hashpagetable.c
+++ b/arch/powerpc/mm/ptdump/hashpagetable.c
@@ -342,7 +342,7 @@ static unsigned long hpte_find(struct pg_state *st, unsigned long ea, int psize)

 	/* Look in secondary table */
 	if (slot == -1)
-		slot = base_hpte_find(ea, psize, true, &v, &r);
+		slot = base_hpte_find(ea, psize, false, &v, &r);

 	/* No entry found */
 	if (slot == -1)

From 8132cf115efc3b3684bb5fd3bfdf6860886f0e47 Mon Sep 17 00:00:00 2001
From: Qian Cai
Date: Wed, 27 Feb 2019 21:35:05 -0500
Subject: [PATCH 197/200] powerpc/mm: Fix "sz" set but not used warning

Fix compiler warning:

arch/powerpc/mm/hugetlbpage-hash64.c: In function '__hash_page_huge':
arch/powerpc/mm/hugetlbpage-hash64.c:29:28: warning: variable 'sz' set but not used [-Wunused-but-set-variable]

mpe: The last usage of sz was removed in 0895ecda7942 ("powerpc/mm: Bring hugepage PTE accessor functions back into sync with normal accessors").

Signed-off-by: Qian Cai
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/hugetlbpage-hash64.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 2e6a8f9345d3..f6b09edc5e6e 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -26,7 +26,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 	real_pte_t rpte;
 	unsigned long vpn;
 	unsigned long old_pte, new_pte;
-	unsigned long rflags, pa, sz;
+	unsigned long rflags, pa;
 	long slot, offset;

 	BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
@@ -73,7 +73,6 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		offset = PTRS_PER_PMD;
 	rpte = __real_pte(__pte(old_pte), ptep, offset);

-	sz = ((1UL) << shift);
 	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
 		/* No CPU has hugepages but lacks no execute, so we
 		 * don't need to worry about that case */

From c38ca265525a00d635219450e8fcc858082ff630 Mon Sep 17 00:00:00 2001
From: Qian Cai
Date: Fri, 1 Mar 2019 09:20:40 -0500
Subject: [PATCH 198/200] powerpc/mm: fix "section_base" set but not used

Commit 24b6d4164348 ("mm: pass the vmem_altmap to vmemmap_free") removed the line

	altmap = to_vmem_altmap((unsigned long) section_base);

from vmemmap_free(), but left the section_base variable unused.
arch/powerpc/mm/init_64.c: In function 'vmemmap_free':
arch/powerpc/mm/init_64.c:277:16: error: variable 'section_base' set but not used [-Werror=unused-but-set-variable]

Signed-off-by: Qian Cai
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/init_64.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index a5091c034747..a4c155af1597 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -274,7 +274,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end,

 	for (; start < end; start += page_size) {
 		unsigned long nr_pages, addr;
-		struct page *section_base;
 		struct page *page;

 		/*
@@ -290,7 +289,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end,
 			continue;

 		page = pfn_to_page(addr >> PAGE_SHIFT);
-		section_base = pfn_to_page(vmemmap_section_start(start));
 		nr_pages = 1 << page_order;
 		base_pfn = PHYS_PFN(addr);

From 39070a96a1c2c502b2f77972ba8c2eba3ca6cd3a Mon Sep 17 00:00:00 2001
From: Joe Lawrence
Date: Fri, 1 Mar 2019 14:17:21 -0500
Subject: [PATCH 199/200] powerpc: Remove export of save_stack_trace_tsk_reliable()

As tglx points out, there are no in-tree module users of save_stack_trace_tsk_reliable() and its x86 counterpart is not exported, so remove the powerpc symbol export.

Suggested-by: Thomas Gleixner
Signed-off-by: Joe Lawrence
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/stacktrace.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
index f958f3bcba04..1e2276963f6d 100644
--- a/arch/powerpc/kernel/stacktrace.c
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -220,7 +220,6 @@ int save_stack_trace_tsk_reliable(struct task_struct *tsk,

 	return ret;
 }
-EXPORT_SYMBOL_GPL(save_stack_trace_tsk_reliable);

 #endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */

 #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI)

From 9580b71b5a7863c24a9bd18bcd2ad759b86b1eff Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Wed, 27 Feb 2019 11:45:30 +0000
Subject: [PATCH 200/200] powerpc/32: Clear on-stack exception marker upon exception return

Clear the on-stack STACK_FRAME_REGS_MARKER on exception exit in order to avoid a confusing stacktrace like the one below.
Call Trace: [c0e9dca0] [c01c42a0] print_address_description+0x64/0x2bc (unreliable) [c0e9dcd0] [c01c4684] kasan_report+0xfc/0x180 [c0e9dd10] [c0895130] memchr+0x24/0x74 [c0e9dd30] [c00a9e38] msg_print_text+0x124/0x574 [c0e9dde0] [c00ab710] console_unlock+0x114/0x4f8 [c0e9de40] [c00adc60] vprintk_emit+0x188/0x1c4 --- interrupt: c0e9df00 at 0x400f330 LR = init_stack+0x1f00/0x2000 [c0e9de80] [c00ae3c4] printk+0xa8/0xcc (unreliable) [c0e9df20] [c0c27e44] early_irq_init+0x38/0x108 [c0e9df50] [c0c15434] start_kernel+0x310/0x488 [c0e9dff0] [00003484] 0x3484 With this patch the trace becomes: Call Trace: [c0e9dca0] [c01c42c0] print_address_description+0x64/0x2bc (unreliable) [c0e9dcd0] [c01c46a4] kasan_report+0xfc/0x180 [c0e9dd10] [c0895150] memchr+0x24/0x74 [c0e9dd30] [c00a9e58] msg_print_text+0x124/0x574 [c0e9dde0] [c00ab730] console_unlock+0x114/0x4f8 [c0e9de40] [c00adc80] vprintk_emit+0x188/0x1c4 [c0e9de80] [c00ae3e4] printk+0xa8/0xcc [c0e9df20] [c0c27e44] early_irq_init+0x38/0x108 [c0e9df50] [c0c15434] start_kernel+0x310/0x488 [c0e9dff0] [00003484] 0x3484 Cc: stable@vger.kernel.org Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 96dce6a4b61e..b61cfd29c76f 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -730,6 +730,9 @@ fast_exception_return: mtcr r10 lwz r10,_LINK(r11) mtlr r10 + /* Clear the exception_marker on the stack to avoid confusing stacktrace */ + li r10, 0 + stw r10, 8(r11) REST_GPR(10, r11) #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) mtspr SPRN_NRI, r0 @@ -961,6 +964,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) mtcrf 0xFF,r10 mtlr r11 + /* Clear the exception_marker on the stack to avoid confusing stacktrace */ + li r10, 0 + stw r10, 8(r1) /* * Once we put values in SRR0 and SRR1, we are in a state * where exceptions are not recoverable, since taking an @@ -997,6 +1003,9 @@ exc_exit_restart_end: mtlr r11 lwz r10,_CCR(r1) mtcrf 0xff,r10 + /* Clear the exception_marker on the stack to avoid confusing stacktrace */ + li r10, 0 + stw r10, 8(r1) REST_2GPRS(9, r1) .globl exc_exit_restart exc_exit_restart: