From f1fe5252018075e18d7f0c0e23ee173546420361 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 1 May 2017 22:01:09 +1000 Subject: [PATCH 001/158] powerpc/64s: Fix FIXUP_ENDIAN non-maskable interrupt reentrancy FIXUP_ENDIAN uses SRR[01] with MSR_RI=1, which gets corrupted if there is an interleaving system reset or machine check interrupt. Set MSR_RI=0 before setting SRRs. The rfid will restore MSR. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/boot/ppc_asm.h | 12 +++++++----- arch/powerpc/include/asm/ppc_asm.h | 11 +++++++---- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/boot/ppc_asm.h b/arch/powerpc/boot/ppc_asm.h index b03373d8b386..68e388ee94fe 100644 --- a/arch/powerpc/boot/ppc_asm.h +++ b/arch/powerpc/boot/ppc_asm.h @@ -67,13 +67,15 @@ #define MSR_LE 0x0000000000000001 #define FIXUP_ENDIAN \ - tdi 0, 0, 0x48; /* Reverse endian of b . + 8 */ \ - b $+36; /* Skip trampoline if endian is good */ \ - .long 0x05009f42; /* bcl 20,31,$+4 */ \ - .long 0xa602487d; /* mflr r10 */ \ - .long 0x1c004a39; /* addi r10,r10,28 */ \ + tdi 0,0,0x48; /* Reverse endian of b . + 8 */ \ + b $+44; /* Skip trampoline if endian is good */ \ .long 0xa600607d; /* mfmsr r11 */ \ .long 0x01006b69; /* xori r11,r11,1 */ \ + .long 0x00004039; /* li r10,0 */ \ + .long 0x6401417d; /* mtmsrd r10,1 */ \ + .long 0x05009f42; /* bcl 20,31,$+4 */ \ + .long 0xa602487d; /* mflr r10 */ \ + .long 0x14004a39; /* addi r10,r10,20 */ \ .long 0xa6035a7d; /* mtsrr0 r10 */ \ .long 0xa6037b7d; /* mtsrr1 r11 */ \ .long 0x2400004c /* rfid */ diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 359c44341761..6baeeb9acd0d 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -770,15 +770,18 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) #else #define FIXUP_ENDIAN \ tdi 0,0,0x48; /* Reverse endian of b . + 8 */ \ - b $+36; /* Skip trampoline if endian is good */ \ - .long 0x05009f42; /* bcl 20,31,$+4 */ \ - .long 0xa602487d; /* mflr r10 */ \ - .long 0x1c004a39; /* addi r10,r10,28 */ \ + b $+44; /* Skip trampoline if endian is good */ \ .long 0xa600607d; /* mfmsr r11 */ \ .long 0x01006b69; /* xori r11,r11,1 */ \ + .long 0x00004039; /* li r10,0 */ \ + .long 0x6401417d; /* mtmsrd r10,1 */ \ + .long 0x05009f42; /* bcl 20,31,$+4 */ \ + .long 0xa602487d; /* mflr r10 */ \ + .long 0x14004a39; /* addi r10,r10,20 */ \ .long 0xa6035a7d; /* mtsrr0 r10 */ \ .long 0xa6037b7d; /* mtsrr1 r11 */ \ .long 0x2400004c /* rfid */ + #endif /* !CONFIG_PPC_BOOK3E */ #endif /* __ASSEMBLY__ */ From c07e1b8a27a1419cdd4af7a1def26d12bb24d2b1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 1 May 2017 22:01:10 +1000 Subject: [PATCH 002/158] powerpc/64s: Fix OPAL_CALL non-maskable interrupt reentrancy OPAL_CALL uses SRR[01] with MSR_RI=1, which gets corrupted if there is an interleaving system reset or machine check interrupt. Use HSRR[01] instead, which does not require MSR_RI=0. 
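As a rough illustration of the failure mode, here is a toy C model (not kernel code; all names are invented for this sketch). SRR0/SRR1 behave like a single pair of globals that system reset and machine check also write, while HSRR0/HSRR1 are a separate pair that those NMIs leave alone:

	/* Toy model of the SRR clobbering hazard; illustrative only. */
	static unsigned long srr0, srr1;   /* also written by system reset/MCE */
	static unsigned long hsrr0, hsrr1; /* not used by those NMIs */

	/* What the hardware effectively does when such an NMI arrives. */
	static void nmi_arrives(unsigned long pc, unsigned long msr)
	{
		srr0 = pc;	/* overwrites whatever SRR0/SRR1 held */
		srr1 = msr;
	}

	static void opal_return_via_srr(unsigned long pc, unsigned long msr)
	{
		srr0 = pc;
		srr1 = msr;
		/* If nmi_arrives() runs here, rfid consumes the NMI's
		 * values instead of ours; the return context is lost. */
	}

	static void opal_return_via_hsrr(unsigned long pc, unsigned long msr)
	{
		hsrr0 = pc;
		hsrr1 = msr;
		/* An intervening nmi_arrives() only touches srr0/srr1;
		 * hrfid consumes hsrr0/hsrr1, which remain intact. */
	}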
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-wrappers.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index f620572f891f..4ca6c26a56d5 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -99,10 +99,10 @@ opal_return: lwz r4,8(r1); ld r5,PPC_LR_STKOFF(r1); ld r6,PACASAVEDMSR(r13); - mtspr SPRN_SRR0,r5; - mtspr SPRN_SRR1,r6; mtcr r4; - rfid + mtspr SPRN_HSRR0,r5; + mtspr SPRN_HSRR1,r6; + hrfid opal_real_call: mfcr r11 From 67d20418088680d22037119d914982cd982b5c5a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 12 May 2017 01:15:20 +1000 Subject: [PATCH 003/158] powerpc/powernv: Fix CPU_HOTPLUG=n idle.c compile error Fixes: a7cd88da97 ("powerpc/powernv: Move CPU-Offline idle state invocation from smp.c to idle.c") Cc: Gautham R. Shenoy Signed-off-by: Nicholas Piggin Acked-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/idle.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 445f30a2c5ef..0c21747ed7e0 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -261,6 +261,7 @@ static u64 pnv_deepest_stop_psscr_val; static u64 pnv_deepest_stop_psscr_mask; static bool deepest_stop_found; +#ifdef CONFIG_HOTPLUG_CPU /* * pnv_cpu_offline: A function that puts the CPU into the deepest * available platform idle state on a CPU-Offline. @@ -293,6 +294,7 @@ unsigned long pnv_cpu_offline(unsigned int cpu) return srr1; } +#endif /* * Power ISA 3.0 idle initialization. From 09b6c1129f899c72d70b8bea36020644aa3b5a28 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 12 May 2017 10:47:07 +1000 Subject: [PATCH 004/158] powerpc/xmon: Fix compile error with PPC_8xx=y Rearrange the code so that mode and badaddr are only defined when they're used. Also unsplit the string for easier grepping, and switch from CONFIG_8xx which is deprecated to CONFIG_PPC_8xx. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/xmon/xmon.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index f11f65634aab..a728e1919613 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1242,14 +1242,14 @@ bpt_cmds(void) { int cmd; unsigned long a; - int mode, i; + int i; struct bpt *bp; - const char badaddr[] = "Only kernel addresses are permitted " - "for breakpoints\n"; cmd = inchar(); switch (cmd) { -#ifndef CONFIG_8xx +#ifndef CONFIG_PPC_8xx + static const char badaddr[] = "Only kernel addresses are permitted for breakpoints\n"; + int mode; case 'd': /* bd - hardware data breakpoint */ mode = 7; cmd = inchar(); From a3f952df3c6d205fe3b1e0e4848d3c3db4a28ef0 Mon Sep 17 00:00:00 2001 From: Andrew Jeffery Date: Fri, 12 May 2017 13:28:10 +0930 Subject: [PATCH 005/158] powerpc: Tweak copy selection parameter in __copy_tofrom_user_power7() Experiments with the netperf benchmark indicated that the size selecting VMX-based copies in __copy_tofrom_user_power7() was suboptimal on POWER8. Measurements showed that parity was in the neighbourhood of 3328 bytes, rather than greater than 4096. 
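In other words, after this patch the dispatch is roughly equivalent to the following C sketch (the function names are stand-ins for the assembly labels in the diff below, not real kernel functions):

	/* Approximate C model of the size dispatch in __copy_tofrom_user_power7(). */
	extern void short_copy(void), vmx_copy(void), nonvmx_copy(void);

	void copy_dispatch(unsigned long n)
	{
		if (n < 16)
			short_copy();	/* blt .Lshort_copy */
		else if (n >= 3328)	/* was n > 4096 before this patch */
			vmx_copy();	/* bge cr1,.Lvmx_copy */
		else
			nonvmx_copy();	/* fall through to the GPR-based copy */
	}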
The change gives a 1.5-2.0% improvement in performance for 4096-byte buffers, reducing the relative time spent in __copy_tofrom_user_power7() from approximately 7% to approximately 5% in the TCP_RR benchmark. Signed-off-by: Andrew Jeffery Acked-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/lib/copyuser_power7.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S index a24b4039352c..706b7cc19846 100644 --- a/arch/powerpc/lib/copyuser_power7.S +++ b/arch/powerpc/lib/copyuser_power7.S @@ -82,14 +82,14 @@ _GLOBAL(__copy_tofrom_user_power7) #ifdef CONFIG_ALTIVEC cmpldi r5,16 - cmpldi cr1,r5,4096 + cmpldi cr1,r5,3328 std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) blt .Lshort_copy - bgt cr1,.Lvmx_copy + bge cr1,.Lvmx_copy #else cmpldi r5,16 From 70a92003de599ff2f15815b3d9c4459411ca4ebf Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sun, 2 Apr 2017 12:05:36 +0200 Subject: [PATCH 006/158] powerpc/sequoia: Fix NAND partitions not to overlap Currently the DTS defines two partitions at the same addresses; if you use one, you will corrupt information on the other. Fix it by shifting the second partition. Signed-off-by: Pavel Machek [mpe: Reconstruct change log from email thread] Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/sequoia.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/boot/dts/sequoia.dts b/arch/powerpc/boot/dts/sequoia.dts index b1d329246b08..e41b88a5eaee 100644 --- a/arch/powerpc/boot/dts/sequoia.dts +++ b/arch/powerpc/boot/dts/sequoia.dts @@ -229,7 +229,7 @@ }; partition@84000 { label = "user"; - reg = <0x00000000 0x01f7c000>; + reg = <0x00084000 0x01f7c000>; }; }; }; From 518470fe962e23ca69a818e1e507eb4d28b6b09b Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 19 May 2017 15:46:41 +1000 Subject: [PATCH 007/158] powerpc: Add HAVE_IRQ_TIME_ACCOUNTING Allow us to enable IRQ_TIME_ACCOUNTING. Even though we currently use VIRT_CPU_ACCOUNTING_NATIVE, that option is quite heavyweight and IRQ_TIME_ACCOUNTING might be better in some cases. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index f7c8f9972f61..d090275ace44 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -208,6 +208,7 @@ config PPC select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING + select HAVE_IRQ_TIME_ACCOUNTING select IRQ_DOMAIN select IRQ_FORCED_THREADING select MODULES_USE_ELF_RELA From 5f221c3ca13dceaea8eefe21dbd85da91ed9b1e8 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Tue, 16 May 2017 14:19:43 +0530 Subject: [PATCH 008/158] powerpc/powernv/idle: Correctly initialize core_idle_state_ptr The lower 8 bits of core_idle_state_ptr track the number of non-idle threads in the core. This is supposed to be initialized to a bit-map corresponding to threads_per_core. However, currently it is initialized to PNV_CORE_IDLE_THREAD_BITS (0xFF). This is correct for POWER8 which has 8 threads per core, but not for POWER9 which has 4 threads per core. As a result, on POWER9, core_idle_state_ptr gets initialized to 0xFF. When all the threads of the core are idle, the bits corresponding to the non-existent threads remain set, so the value stays non-zero. 
As a result, the idle entry/exit code fails to save/restore per-core hypervisor state since it assumes that there are threads in the cores which are still active. Fix this by correctly initializing the lower bits of the core_idle_state_ptr on the basis of threads_per_core. Signed-off-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/idle.c | 29 ++++++++++++++++++--------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 0c21747ed7e0..502f3275d8c2 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -96,15 +96,24 @@ static void pnv_alloc_idle_core_states(void) u32 *core_idle_state; /* - * core_idle_state - First 8 bits track the idle state of each thread - * of the core. The 8th bit is the lock bit. Initially all thread bits - * are set. They are cleared when the thread enters deep idle state - * like sleep and winkle. Initially the lock bit is cleared. - * The lock bit has 2 purposes - * a. While the first thread is restoring core state, it prevents - * other threads in the core from switching to process context. - * b. While the last thread in the core is saving the core state, it - * prevents a different thread from waking up. + * core_idle_state - The lower 8 bits track the idle state of + * each thread of the core. + * + * The most significant bit is the lock bit. + * + * Initially all the bits corresponding to threads_per_core + * are set. They are cleared when the thread enters deep idle + * state like sleep and winkle/stop. + * + * Initially the lock bit is cleared. The lock bit has 2 + * purposes: + * a. While the first thread in the core waking up from + * idle is restoring core state, it prevents other + * threads in the core from switching to process + * context. + * b. While the last thread in the core is saving the + * core state, it prevents a different thread from + * waking up. */ for (i = 0; i < nr_cores; i++) { int first_cpu = i * threads_per_core; @@ -112,7 +121,7 @@ size_t paca_ptr_array_size; core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node); - *core_idle_state = PNV_CORE_IDLE_THREAD_BITS; + *core_idle_state = (1 << threads_per_core) - 1; paca_ptr_array_size = (threads_per_core * sizeof(struct paca_struct *)); From ec4867355244755fb5c06037ad2fff760701b465 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Tue, 16 May 2017 14:19:44 +0530 Subject: [PATCH 009/158] powerpc/powernv/idle: Decouple Timebase restore & Per-core SPRs restore On POWER8, in case of: - nap: both timebase and hypervisor state are retained. - fast-sleep: timebase is lost but the hypervisor state is retained. - winkle: both timebase and hypervisor state are lost. Hence, the current code for handling exit from an idle state assumes that if the timebase value is retained, then so is the hypervisor state. Thus, the current code doesn't restore per-core hypervisor state in such cases. But that is no longer the case on POWER9 where we do have stop states in which timebase value is retained, but the hypervisor state is lost. So we have to ensure that the per-core hypervisor state gets restored in such cases. Fix this by ensuring that even in the case when timebase is retained, we explicitly check if we are waking up from a deep stop that loses per-core hypervisor state (indicated by cr4 being eq or gt), and if this is the case, we restore the per-core hypervisor state. 
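Schematically, the wakeup path must treat the two kinds of state loss as independent conditions. A hedged C sketch of the intended logic (the real code is the cr3/cr4 assembly below; these helper names are invented):

	extern void resync_timebase(void);	/* i.e. opal_resync_timebase() */
	extern void restore_per_core_sprs(void);

	void idle_wakeup(int lost_timebase, int lost_per_core_state)
	{
		if (lost_timebase)
			resync_timebase();

		/* On POWER8 this was implied by lost_timebase; on POWER9 a
		 * stop state can retain the timebase yet lose per-core
		 * state, so it must be checked separately. */
		if (lost_per_core_state)
			restore_per_core_sprs();
	}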
Signed-off-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/idle_book3s.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 4898d676dcae..afd029f1039b 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -731,13 +731,14 @@ timebase_resync: * Use cr3 which indicates that we are waking up with atleast partial * hypervisor state loss to determine if TIMEBASE RESYNC is needed. */ - ble cr3,clear_lock + ble cr3,.Ltb_resynced /* Time base re-sync */ bl opal_resync_timebase; /* - * If waking up from sleep, per core state is not lost, skip to - * clear_lock. + * If waking up from sleep (POWER8), per core state + * is not lost, skip to clear_lock. */ +.Ltb_resynced: blt cr4,clear_lock /* From cb0be7ec03077a31712183bfbe7801061e2966b8 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Tue, 16 May 2017 14:19:45 +0530 Subject: [PATCH 010/158] powerpc/powernv/idle: Restore LPCR on wakeup from deep-stop On wakeup from a deep stop state which is supposed to lose the hypervisor state, we don't restore the LPCR to the old value but set it to a "sane" value via cur_cpu_spec->cpu_restore(). The problem is that the "sane" value doesn't include UPRT and the HR bits which are required to run correctly in Radix mode. Fix this on POWER9 onwards by restoring the LPCR to whatever value it had before executing the stop instruction. Signed-off-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/idle_book3s.S | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index afd029f1039b..6c9920d9221c 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -31,6 +31,7 @@ * registers for winkle support. */ #define _SDR1 GPR3 +#define _PTCR GPR3 #define _RPR GPR4 #define _SPURR GPR5 #define _PURR GPR6 @@ -39,7 +40,7 @@ #define _AMOR GPR9 #define _WORT GPR10 #define _WORC GPR11 -#define _PTCR GPR12 +#define _LPCR GPR12 #define PSSCR_EC_ESL_MASK_SHIFTED (PSSCR_EC | PSSCR_ESL) >> 16 @@ -55,12 +56,14 @@ save_sprs_to_stack: * here since any thread in the core might wake up first */ BEGIN_FTR_SECTION - mfspr r3,SPRN_PTCR - std r3,_PTCR(r1) /* * Note - SDR1 is dropped in Power ISA v3. Hence not restoring * SDR1 here */ + mfspr r3,SPRN_PTCR + std r3,_PTCR(r1) + mfspr r3,SPRN_LPCR + std r3,_LPCR(r1) FTR_SECTION_ELSE mfspr r3,SPRN_SDR1 std r3,_SDR1(r1) @@ -813,6 +816,10 @@ no_segments: mtctr r12 bctrl +BEGIN_FTR_SECTION + ld r4,_LPCR(r1) + mtspr SPRN_LPCR,r4 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) hypervisor_state_restored: mtspr SPRN_SRR1,r16 From 1e1601b38e6e0179fafafc4db2c4ccb71d0854c4 Mon Sep 17 00:00:00 2001 From: Akshay Adiga Date: Tue, 16 May 2017 14:19:46 +0530 Subject: [PATCH 011/158] powerpc/powernv/idle: Restore SPRs for deep idle states via stop API. Some of the SPR values (HID0, MSR, SPRG0) don't change during the run time of a booted kernel, once they have been initialized. The contents of these SPRs are lost when the CPUs enter deep stop states. So instead of saving and restoring SPRs from the kernel, use the stop-api provided by the firmware, by which the firmware can restore the contents of these SPRs to their initialized values after wakeup from a deep stop state. Apart from these, program the PSSCR value to that of the deepest stop state via the stop-api. 
This will be used to indicate to the underlying firmware which stop state the threads that have been woken up by a special-wakeup should be put into. And while we are at it, when programming SPRs via the stop-api, ensure that the HID1, HID4 and HID5 registers, which are only available on POWER8, are not requested to be restored by the firmware on POWER9. Signed-off-by: Akshay Adiga Signed-off-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/idle.c | 83 +++++++++++++++++---------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 502f3275d8c2..46946a587004 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -30,8 +30,33 @@ /* Power ISA 3.0 allows for stop states 0x0 - 0xF */ #define MAX_STOP_STATE 0xF +#define P9_STOP_SPR_MSR 2000 +#define P9_STOP_SPR_PSSCR 855 + static u32 supported_cpuidle_states; +/* + * The default stop state that will be used by ppc_md.power_save + * function on platforms that support stop instruction. + */ +static u64 pnv_default_stop_val; +static u64 pnv_default_stop_mask; +static bool default_stop_found; + +/* + * First deep stop state. Used to figure out when to save/restore + * hypervisor context. + */ +u64 pnv_first_deep_stop_state = MAX_STOP_STATE; + +/* + * psscr value and mask of the deepest stop idle state. + * Used when a cpu is offlined. + */ +static u64 pnv_deepest_stop_psscr_val; +static u64 pnv_deepest_stop_psscr_mask; +static bool deepest_stop_found; + static int pnv_save_sprs_for_deep_states(void) { int cpu; @@ -48,6 +73,8 @@ static int pnv_save_sprs_for_deep_states(void) uint64_t hid4_val = mfspr(SPRN_HID4); uint64_t hid5_val = mfspr(SPRN_HID5); uint64_t hmeer_val = mfspr(SPRN_HMEER); + uint64_t msr_val = MSR_IDLE; + uint64_t psscr_val = pnv_deepest_stop_psscr_val; for_each_possible_cpu(cpu) { uint64_t pir = get_hard_smp_processor_id(cpu); @@ -61,6 +88,18 @@ static int pnv_save_sprs_for_deep_states(void) if (rc != 0) return rc; + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + rc = opal_slw_set_reg(pir, P9_STOP_SPR_MSR, msr_val); + if (rc) + return rc; + + rc = opal_slw_set_reg(pir, + P9_STOP_SPR_PSSCR, psscr_val); + + if (rc) + return rc; + } + /* HIDs are per core registers */ if (cpu_thread_in_core(cpu) == 0) { @@ -72,17 +111,21 @@ if (rc != 0) return rc; - rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val); - if (rc != 0) - return rc; + /* Only p8 needs to set extra HID registers */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { - rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val); - if (rc != 0) - return rc; + rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val); + if (rc != 0) + return rc; - rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val); - if (rc != 0) - return rc; + rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val); + if (rc != 0) + return rc; + } } } @@ -240,14 +283,6 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600, show_fastsleep_workaround_applyonce, store_fastsleep_workaround_applyonce); -/* - * The default stop state that will be used by ppc_md.power_save - * function on platforms that support stop instruction. 
- */ -static u64 pnv_default_stop_val; -static u64 pnv_default_stop_mask; -static bool default_stop_found; - /* * Used for ppc_md.power_save which needs a function with no parameters */ @@ -256,20 +291,6 @@ static void power9_idle(void) power9_idle_stop(pnv_default_stop_val, pnv_default_stop_mask); } -/* - * First deep stop state. Used to figure out when to save/restore - * hypervisor context. - */ -u64 pnv_first_deep_stop_state = MAX_STOP_STATE; - -/* - * psscr value and mask of the deepest stop idle state. - * Used when a cpu is offlined. - */ -static u64 pnv_deepest_stop_psscr_val; -static u64 pnv_deepest_stop_psscr_mask; -static bool deepest_stop_found; - #ifdef CONFIG_HOTPLUG_CPU /* * pnv_cpu_offline: A function that puts the CPU into the deepest From 22c6663dc69a042a7b4158f162582b4b1ba7a4b7 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Tue, 16 May 2017 14:19:47 +0530 Subject: [PATCH 012/158] powerpc/powernv/idle: Use Requested Level for restoring state on P9 DD1 On Power9 DD1 due to a hardware bug the Power-Saving Level Status field (PLS) of the PSSCR for a thread waking up from a deep state can under-report if some other thread in the core is in a shallow stop state. The scenario in which this can manifest is as follows: 1) All the threads of the core are in deep stop. 2) One of the threads is woken up. The PLS for this thread will correctly reflect that it is waking up from deep stop. 3) The thread that has woken up now executes a shallow stop. 4) When some other thread in the core is woken, its PLS will reflect the shallow stop state. Thus, the subsequent thread for which the PLS is under-reporting the wakeup state will not restore the hypervisor resources. Hence, on DD1 systems, use the Requested Level (RL) field as a workaround to restore the contents of the hypervisor resources on the wakeup from the stop state. Signed-off-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/paca.h | 2 ++ arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kernel/idle_book3s.S | 13 ++++++++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 1c09f8fe2ee8..77f60a0f1405 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -177,6 +177,8 @@ struct paca_struct { * to the sibling threads' paca. 
*/ struct paca_struct **thread_sibling_pacas; + /* The PSSCR value that the kernel requested before going to stop */ + u64 requested_psscr; #endif #ifdef CONFIG_PPC_STD_MMU_64 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 709e23425317..e15c178ba079 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -742,6 +742,7 @@ int main(void) OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask); OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask); OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas); + OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr); #endif DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 6c9920d9221c..98a6d07ecb5c 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -379,6 +379,7 @@ _GLOBAL(power9_idle_stop) mfspr r5,SPRN_PSSCR andc r5,r5,r4 or r3,r3,r5 + std r3, PACA_REQ_PSSCR(r13) mtspr SPRN_PSSCR,r3 LOAD_REG_ADDR(r5,power_enter_stop) li r4,1 @@ -498,12 +499,22 @@ pnv_restore_hyp_resource_arch300: LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state) ld r4,ADDROFF(pnv_first_deep_stop_state)(r5) - mfspr r5,SPRN_PSSCR +BEGIN_FTR_SECTION_NESTED(71) + /* + * Assume that we are waking up from the state + * same as the Requested Level (RL) in the PSSCR + * which are Bits 60-63 + */ + ld r5,PACA_REQ_PSSCR(r13) + rldicl r5,r5,0,60 +FTR_SECTION_ELSE_NESTED(71) /* * 0-3 bits correspond to Power-Saving Level Status * which indicates the idle state we are waking up from */ + mfspr r5, SPRN_PSSCR rldicl r5,r5,4,60 +ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_POWER9_DD1, 71) cmpd cr4,r5,r4 bge cr4,pnv_wakeup_tb_loss /* returns to caller */ From f9122ee4f5580194014d4ca72092787ae33f29fb Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Tue, 16 May 2017 14:19:48 +0530 Subject: [PATCH 013/158] cpuidle-powernv: Allow Deep stop states that don't stop time The current code in the cpuidle-powernv initialization only allows deep stop states (indicated by OPAL_PM_STOP_INST_DEEP) which lose timebase (indicated by OPAL_PM_TIMEBASE_STOP). This assumption goes back to POWER8, where deep states used to lose the timebase. However, on POWER9, we do have stop states that are deep (they lose hypervisor state) but retain the timebase. Fix the initialization code in the cpuidle-powernv driver to allow such deep states. Further, there is a bug in the cpuidle-powernv driver with CONFIG_TICK_ONESHOT=n, where we end up incrementing nr_idle_states even if a platform idle state which loses the timebase was not added to the cpuidle table. Fix this by ensuring that the nr_idle_states variable gets incremented only when the platform idle state was added to the cpuidle table. Signed-off-by: Gautham R. 
Shenoy Signed-off-by: Michael Ellerman --- drivers/cpuidle/cpuidle-powernv.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 12409a519cc5..45eaf06462ae 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -354,6 +354,7 @@ static int powernv_add_idle_states(void) for (i = 0; i < dt_idle_states; i++) { unsigned int exit_latency, target_residency; + bool stops_timebase = false; /* * If an idle state has exit latency beyond * POWERNV_THRESHOLD_LATENCY_NS then don't use it @@ -381,6 +382,9 @@ static int powernv_add_idle_states(void) } } + if (flags[i] & OPAL_PM_TIMEBASE_STOP) + stops_timebase = true; + /* * For nap and fastsleep, use default target_residency * values if f/w does not expose it. @@ -392,8 +396,7 @@ static int powernv_add_idle_states(void) add_powernv_state(nr_idle_states, "Nap", CPUIDLE_FLAG_NONE, nap_loop, target_residency, exit_latency, 0, 0); - } else if ((flags[i] & OPAL_PM_STOP_INST_FAST) && - !(flags[i] & OPAL_PM_TIMEBASE_STOP)) { + } else if (has_stop_states && !stops_timebase) { add_powernv_state(nr_idle_states, names[i], CPUIDLE_FLAG_NONE, stop_loop, target_residency, exit_latency, @@ -405,8 +408,8 @@ static int powernv_add_idle_states(void) * within this config dependency check. */ #ifdef CONFIG_TICK_ONESHOT - if (flags[i] & OPAL_PM_SLEEP_ENABLED || - flags[i] & OPAL_PM_SLEEP_ENABLED_ER1) { + else if (flags[i] & OPAL_PM_SLEEP_ENABLED || + flags[i] & OPAL_PM_SLEEP_ENABLED_ER1) { if (!rc) target_residency = 300000; /* Add FASTSLEEP state */ @@ -414,14 +417,15 @@ static int powernv_add_idle_states(void) CPUIDLE_FLAG_TIMER_STOP, fastsleep_loop, target_residency, exit_latency, 0, 0); - } else if ((flags[i] & OPAL_PM_STOP_INST_DEEP) && - (flags[i] & OPAL_PM_TIMEBASE_STOP)) { + } else if (has_stop_states && stops_timebase) { add_powernv_state(nr_idle_states, names[i], CPUIDLE_FLAG_TIMER_STOP, stop_loop, target_residency, exit_latency, psscr_val[i], psscr_mask[i]); } #endif + else + continue; nr_idle_states++; } out: From c4b56b023daa91953e9ebe91143e6ca156f0bcb7 Mon Sep 17 00:00:00 2001 From: Ivan Mikhaylov Date: Mon, 15 May 2017 16:07:51 +0300 Subject: [PATCH 014/158] powerpc/44x/fsp2: Platform support for FSP2 (476fpe) board Add platform code support for FSP2 (476fpe) board. 
Signed-off-by: Ivan Mikhaylov Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/44x/Kconfig | 12 ++++++ arch/powerpc/platforms/44x/Makefile | 1 + arch/powerpc/platforms/44x/fsp2.c | 62 +++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+) create mode 100644 arch/powerpc/platforms/44x/fsp2.c diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig index 9b0afe935cc1..01cb109ebf17 100644 --- a/arch/powerpc/platforms/44x/Kconfig +++ b/arch/powerpc/platforms/44x/Kconfig @@ -199,6 +199,18 @@ config CURRITUCK help This option enables support for the IBM Currituck (476fpe) evaluation board +config FSP2 + bool "IBM FSP2 (476fpe) Support" + depends on PPC_47x + default n + select 476FPE + select IBM_EMAC_EMAC4 if IBM_EMAC + select IBM_EMAC_RGMII if IBM_EMAC + select COMMON_CLK + select DEFAULT_UIMAGE + help + This option enables support for the IBM FSP2 (476fpe) board + config AKEBONO bool "IBM Akebono (476gtr) Support" depends on PPC_47x diff --git a/arch/powerpc/platforms/44x/Makefile b/arch/powerpc/platforms/44x/Makefile index 26d35b5941f7..72b824160660 100644 --- a/arch/powerpc/platforms/44x/Makefile +++ b/arch/powerpc/platforms/44x/Makefile @@ -12,3 +12,4 @@ obj-$(CONFIG_ISS4xx) += iss4xx.o obj-$(CONFIG_CANYONLANDS)+= canyonlands.o obj-$(CONFIG_CURRITUCK) += ppc476.o obj-$(CONFIG_AKEBONO) += ppc476.o +obj-$(CONFIG_FSP2) += fsp2.o diff --git a/arch/powerpc/platforms/44x/fsp2.c b/arch/powerpc/platforms/44x/fsp2.c new file mode 100644 index 000000000000..92e98048404f --- /dev/null +++ b/arch/powerpc/platforms/44x/fsp2.c @@ -0,0 +1,62 @@ +/* + * FSP-2 board specific routines + * + * Based on earlier code: + * Matt Porter + * Copyright 2002-2005 MontaVista Software Inc. + * + * Eugene Surovegin or + * Copyright (c) 2003-2005 Zultys Technologies + * + * Rewritten and ported to the merged powerpc tree: + * Copyright 2007 David Gibson , IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static __initdata struct of_device_id fsp2_of_bus[] = { + { .compatible = "ibm,plb4", }, + { .compatible = "ibm,plb6", }, + { .compatible = "ibm,opb", }, + {}, +}; + +static int __init fsp2_device_probe(void) +{ + of_platform_bus_probe(NULL, fsp2_of_bus, NULL); + return 0; +} +machine_device_initcall(fsp2, fsp2_device_probe); + +static int __init fsp2_probe(void) +{ + unsigned long root = of_get_flat_dt_root(); + + if (!of_flat_dt_is_compatible(root, "ibm,fsp2")) + return 0; + return 1; +} + +define_machine(fsp2) { + .name = "FSP-2", + .probe = fsp2_probe, + .progress = udbg_progress, + .init_IRQ = uic_init_tree, + .get_irq = uic_get_irq, + .restart = ppc4xx_reset_system, + .calibrate_decr = generic_calibrate_decr, +}; From 9eec6cb142bddd35268c7c59b6c81be6c091aaf9 Mon Sep 17 00:00:00 2001 From: Ivan Mikhaylov Date: Mon, 15 May 2017 16:07:52 +0300 Subject: [PATCH 015/158] powerpc/44x/fsp2: Add device tree for FSP2 board Add a device tree for FSP2 board (476 based). 
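For context, platform and driver code consumes properties like the ones in this device tree through the standard OF helpers. A minimal, hypothetical sketch (not part of this patch; it simply reads the EMAC node defined in the DTS below):

	/* Hypothetical example of reading the EMAC node from this DTS. */
	#include <linux/of.h>
	#include <linux/errno.h>

	static int fsp2_emac_example(void)
	{
		struct device_node *np;
		const void *mac;
		int len;

		np = of_find_compatible_node(NULL, NULL, "ibm,emac4sync");
		if (!np)
			return -ENODEV;

		/* "local-mac-address" is filled in by cuboot at boot time. */
		mac = of_get_property(np, "local-mac-address", &len);
		of_node_put(np);
		return (mac && len == 6) ? 0 : -EINVAL;
	}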
Signed-off-by: Ivan Mikhaylov Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/fsp2.dts | 608 +++++++++++++++++++++++++++++++++ 1 file changed, 608 insertions(+) create mode 100644 arch/powerpc/boot/dts/fsp2.dts diff --git a/arch/powerpc/boot/dts/fsp2.dts b/arch/powerpc/boot/dts/fsp2.dts new file mode 100644 index 000000000000..475953ada707 --- /dev/null +++ b/arch/powerpc/boot/dts/fsp2.dts @@ -0,0 +1,608 @@ +/* + * Device Tree Source for FSP2 + * + * Copyright 2010,2012 IBM Corp. + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without + * any warranty of any kind, whether express or implied. + */ + + +/dts-v1/; + +/ { + #address-cells = <2>; + #size-cells = <1>; + model = "ibm,fsp2"; + compatible = "ibm,fsp2"; + dcr-parent = <&{/cpus/cpu@0}>; + + aliases { + ethernet0 = &EMAC0; + ethernet1 = &EMAC1; + serial0 = &UART0; + }; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + device_type = "cpu"; + model = "PowerPC, 476FSP2"; + reg = <0x0>; + clock-frequency = <0>; /* Filled in by cuboot */ + timebase-frequency = <0>; /* Filled in by cuboot */ + i-cache-line-size = <32>; + d-cache-line-size = <32>; + d-cache-size = <32768>; + i-cache-size = <32768>; + dcr-controller; + dcr-access-method = "native"; + }; + }; + + memory { + device_type = "memory"; + reg = <0x00000000 0x00000000 0x00000000>; /* Filled in by + cuboot */ + }; + + clocks { + mmc_clk: mmc_clk { + compatible = "fixed-clock"; + clock-frequency = <50000000>; + clock-output-names = "mmc_clk"; + }; + }; + + UIC0: uic0 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <0>; + dcr-reg = <0x2c0 0x8>; + }; + + /* "interrupts" field is + first pair is non-critical, second is critical */ + UIC1_0: uic1_0 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <1>; + dcr-reg = <0x2c8 0x8>; + interrupt-parent = <&UIC0>; + interrupts = <21 0x4 4 0x84>; + }; + + /* PSI and DMA */ + UIC1_1: uic1_1 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <2>; + dcr-reg = <0x350 0x8>; + interrupt-parent = <&UIC0>; + interrupts = <22 0x4 5 0x84>; + }; + + /* Ethernet and USB */ + UIC1_2: uic1_2 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <3>; + dcr-reg = <0x358 0x8>; + interrupt-parent = <&UIC0>; + interrupts = <23 0x4 6 0x84>; + }; + + /* PLB Errors */ + UIC1_3: uic1_3 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <4>; + dcr-reg = <0x360 0x8>; + interrupt-parent = <&UIC0>; + interrupts = <24 0x4 7 0x84>; + }; + + UIC1_4: uic1_4 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <5>; + dcr-reg = <0x368 0x8>; + interrupt-parent = <&UIC0>; + interrupts = <25 0x4 8 0x84>; + }; + + UIC1_5: uic1_5 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <6>; + dcr-reg = <0x370 0x8>; + interrupt-parent = <&UIC0>; + interrupts = <26 0x4 9 0x84>; + }; + + /* 2nd level UICs for FSI */ + UIC2_0: uic2_0 { + #address-cells = <0>; + #size-cells = <0>; 
+ #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <7>; + dcr-reg = <0x2d0 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <16 0x4 0 0x84>; + }; + + UIC2_1: uic2_1 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <8>; + dcr-reg = <0x2d8 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <17 0x4 1 0x84>; + }; + + UIC2_2: uic2_2 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <9>; + dcr-reg = <0x2e0 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <18 0x4 2 0x84>; + }; + + UIC2_3: uic2_3 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <10>; + dcr-reg = <0x2e8 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <19 0x4 3 0x84>; + }; + + UIC2_4: uic2_4 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <11>; + dcr-reg = <0x2f0 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <20 0x4 4 0x84>; + }; + + UIC2_5: uic2_5 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <12>; + dcr-reg = <0x2f8 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <21 0x4 5 0x84>; + }; + + UIC2_6: uic2_6 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <13>; + dcr-reg = <0x300 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <22 0x4 6 0x84>; + }; + + UIC2_7: uic2_7 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <14>; + dcr-reg = <0x308 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <23 0x4 7 0x84>; + }; + + UIC2_8: uic2_8 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <15>; + dcr-reg = <0x310 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <24 0x4 8 0x84>; + }; + + UIC2_9: uic2_9 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <16>; + dcr-reg = <0x318 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <25 0x4 9 0x84>; + }; + + UIC2_10: uic2_10 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <17>; + dcr-reg = <0x320 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <26 0x4 10 0x84>; + }; + + UIC2_11: uic2_11 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <18>; + dcr-reg = <0x328 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <27 0x4 11 0x84>; + }; + + UIC2_12: uic2_12 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <19>; + dcr-reg = <0x330 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <28 0x4 12 0x84>; + }; + + UIC2_13: uic2_13 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <20>; + dcr-reg = <0x338 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <29 0x4 
13 0x84>; + }; + + UIC2_14: uic2_14 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <21>; + dcr-reg = <0x340 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <30 0x4 14 0x84>; + }; + + UIC2_15: uic2_15 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <22>; + dcr-reg = <0x348 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <31 0x4 15 0x84>; + }; + + mmc0: sdhci@020c0000 { + compatible = "st,sdhci-stih407", "st,sdhci"; + status = "disabled"; + reg = <0x020c0000 0x20000>; + reg-names = "mmc"; + interrupt-parent = <&UIC1_3>; + interrupts = <21 0x4 22 0x4>; + interrupt-names = "mmcirq"; + pinctrl-names = "default"; + pinctrl-0 = <>; + clock-names = "mmc"; + clocks = <&mmc_clk>; + }; + + plb6 { + compatible = "ibm,plb6"; + #address-cells = <2>; + #size-cells = <1>; + ranges; + + MCW0: memory-controller-wrapper { + compatible = "ibm,cw-476fsp2"; + dcr-reg = <0x11111800 0x40>; + }; + + MCIF0: memory-controller { + compatible = "ibm,sdram-476fsp2", "ibm,sdram-4xx-ddr3"; + dcr-reg = <0x11120000 0x10000>; + mcer-device = <&MCW0>; + interrupt-parent = <&UIC0>; + interrupts = <10 0x84 /* ECC UE */ + 11 0x84>; /* ECC CE */ + }; + }; + + plb4 { + compatible = "ibm,plb4"; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0x00000000 0x00000010 0x00000000 0x80000000 + 0x80000000 0x00000010 0x80000000 0x80000000>; + clock-frequency = <333333334>; + + plb6-system-hung-irq { + compatible = "ibm,bus-error-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC0>; + interrupts = <0 0x84>; + }; + + l2-error-irq { + compatible = "ibm,bus-error-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC0>; + interrupts = <20 0x84>; + }; + + plb6-plb4-irq { + compatible = "ibm,bus-error-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC0>; + interrupts = <1 0x84>; + }; + + plb4-ahb-irq { + compatible = "ibm,bus-error-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC1_3>; + interrupts = <20 0x84>; + }; + + opbd-error-irq { + compatible = "ibm,opbd-error-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC1_4>; + interrupts = <5 0x84>; + }; + + cmu-error-irq { + compatible = "ibm,cmu-error-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC0>; + interrupts = <28 0x84>; + }; + + conf-error-irq { + compatible = "ibm,conf-error-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC1_4>; + interrupts = <11 0x84>; + }; + + mc-ue-irq { + compatible = "ibm,mc-ue-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC0>; + interrupts = <10 0x84>; + }; + + reset-warning-irq { + compatible = "ibm,reset-warning-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC0>; + interrupts = <17 0x84>; + }; + + MAL0: mcmal0 { + #interrupt-cells = <1>; + #address-cells = <0>; + #size-cells = <0>; + compatible = "ibm,mcmal"; + dcr-reg = <0x80 0x80>; + num-tx-chans = <1>; + num-rx-chans = <1>; + interrupt-parent = <&MAL0>; + interrupts = <0 1 2 3 4>; + /* index interrupt-parent interrupt# type */ + interrupt-map = ; + }; + + MAL1: mcmal1 { + #interrupt-cells = <1>; + #address-cells = <0>; + #size-cells = <0>; + compatible = "ibm,mcmal"; + dcr-reg = <0x100 0x80>; + num-tx-chans = <1>; + num-rx-chans = <1>; + interrupt-parent = <&MAL1>; + interrupts = <0 1 2 3 4>; + /* index interrupt-parent interrupt# type */ + interrupt-map = ; + }; + + opb { + compatible = "ibm,opb"; + #address-cells = <1>; + 
#size-cells = <1>; + ranges; // pass-thru to parent bus + clock-frequency = <83333334>; + + EMAC0: ethernet@b0000000 { + linux,network-index = <0>; + device_type = "network"; + compatible = "ibm,emac4sync"; + has-inverted-stacr-oc; + interrupt-parent = <&UIC1_2>; + interrupts = <1 0x4 0 0x4>; + reg = <0xb0000000 0x100>; + local-mac-address = [000000000000]; /* Filled in by + cuboot */ + mal-device = <&MAL0>; + mal-tx-channel = <0>; + mal-rx-channel = <0>; + cell-index = <0>; + max-frame-size = <1500>; + rx-fifo-size = <4096>; + tx-fifo-size = <4096>; + rx-fifo-size-gige = <16384>; + tx-fifo-size-gige = <8192>; + phy-address = <1>; + phy-mode = "rgmii"; + phy-map = <00000003>; + rgmii-device = <&RGMII>; + rgmii-channel = <0>; + }; + + EMAC1: ethernet@b0000100 { + linux,network-index = <1>; + device_type = "network"; + compatible = "ibm,emac4sync"; + has-inverted-stacr-oc; + interrupt-parent = <&UIC1_2>; + interrupts = <9 0x4 8 0x4>; + reg = <0xb0000100 0x100>; + local-mac-address = [000000000000]; /* Filled in by + cuboot */ + mal-device = <&MAL1>; + mal-tx-channel = <0>; + mal-rx-channel = <0>; + cell-index = <1>; + max-frame-size = <1500>; + rx-fifo-size = <4096>; + tx-fifo-size = <4096>; + rx-fifo-size-gige = <16384>; + tx-fifo-size-gige = <8192>; + phy-address = <2>; + phy-mode = "rgmii"; + phy-map = <00000003>; + rgmii-device = <&RGMII>; + rgmii-channel = <1>; + }; + + RGMII: rgmii@b0000600 { + compatible = "ibm,rgmii"; + has-mdio; + reg = <0xb0000600 0x8>; + }; + + UART0: serial@b0020000 { + device_type = "serial"; + compatible = "ns16550"; + reg = <0xb0020000 0x8>; + virtual-reg = <0xb0020000>; + clock-frequency = <20833333>; + current-speed = <115200>; + interrupt-parent = <&UIC0>; + interrupts = <31 0x4>; + }; + }; + + OHCI1: ohci@02040000 { + compatible = "ohci-le"; + reg = <0x02040000 0xa0>; + interrupt-parent = <&UIC1_3>; + interrupts = <28 0x8 29 0x8>; + }; + + OHCI2: ohci@02080000 { + compatible = "ohci-le"; + reg = <0x02080000 0xa0>; + interrupt-parent = <&UIC1_3>; + interrupts = <30 0x8 31 0x8>; + }; + + EHCI: ehci@02000000 { + compatible = "usb-ehci"; + reg = <0x02000000 0xa4>; + interrupt-parent = <&UIC1_3>; + interrupts = <23 0x4>; + }; + + }; + + chosen { + linux,stdout-path = "/plb/opb/serial@b0020000"; + bootargs = "console=ttyS0,115200 rw log_buf_len=32768 debug"; + }; +}; From 20975a0ae1ccc59a21a149e2fd2964ce88edb0a9 Mon Sep 17 00:00:00 2001 From: Ivan Mikhaylov Date: Mon, 15 May 2017 16:07:53 +0300 Subject: [PATCH 016/158] powerpc/44x/fsp2: Add defconfig for FSP2 board This patch adds default FSP2 config for main usage. 
Signed-off-by: Ivan Mikhaylov Signed-off-by: Michael Ellerman --- arch/powerpc/configs/44x/fsp2_defconfig | 126 ++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 arch/powerpc/configs/44x/fsp2_defconfig diff --git a/arch/powerpc/configs/44x/fsp2_defconfig b/arch/powerpc/configs/44x/fsp2_defconfig new file mode 100644 index 000000000000..e8e6a6999852 --- /dev/null +++ b/arch/powerpc/configs/44x/fsp2_defconfig @@ -0,0 +1,126 @@ +CONFIG_44x=y +# CONFIG_SWAP is not set +CONFIG_SYSVIPC=y +# CONFIG_CROSS_MEMORY_ATTACH is not set +# CONFIG_FHANDLE is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=16 +CONFIG_BLK_DEV_INITRD=y +# CONFIG_RD_LZMA is not set +# CONFIG_RD_XZ is not set +# CONFIG_RD_LZO is not set +# CONFIG_RD_LZ4 is not set +CONFIG_KALLSYMS_ALL=y +CONFIG_BPF_SYSCALL=y +CONFIG_EMBEDDED=y +CONFIG_PROFILING=y +CONFIG_OPROFILE=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_BLK_DEV_BSG is not set +CONFIG_PPC_47x=y +# CONFIG_EBONY is not set +CONFIG_FSP2=y +CONFIG_476FPE_ERR46=y +CONFIG_SWIOTLB=y +CONFIG_KEXEC=y +CONFIG_CRASH_DUMP=y +CONFIG_CMDLINE_BOOL=y +CONFIG_CMDLINE="ip=on rw" +# CONFIG_SUSPEND is not set +# CONFIG_PCI is not set +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +# CONFIG_INET_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET_XFRM_MODE_TUNNEL is not set +# CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_IPV6 is not set +CONFIG_VLAN_8021Q=m +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_CONNECTOR=y +CONFIG_MTD=y +CONFIG_MTD_BLOCK=y +CONFIG_MTD_JEDECPROBE=y +CONFIG_MTD_CFI_AMDSTD=y +CONFIG_MTD_PHYSMAP_OF=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=35000 +# CONFIG_SCSI_PROC_FS is not set +CONFIG_BLK_DEV_SD=y +# CONFIG_SCSI_LOWLEVEL is not set +CONFIG_ATA=y +# CONFIG_SATA_PMP is not set +# CONFIG_ATA_SFF is not set +CONFIG_NETDEVICES=y +CONFIG_BONDING=m +CONFIG_IBM_EMAC=m +# CONFIG_INPUT is not set +# CONFIG_SERIO is not set +# CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_DEVMEM is not set +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_NR_UARTS=32 +CONFIG_SERIAL_8250_RUNTIME_UARTS=32 +CONFIG_SERIAL_8250_EXTENDED=y +CONFIG_SERIAL_8250_SHARE_IRQ=y +CONFIG_SERIAL_OF_PLATFORM=y +# CONFIG_HW_RANDOM is not set +CONFIG_I2C=y +CONFIG_I2C_IBM_IIC=y +CONFIG_PTP_1588_CLOCK=y +# CONFIG_HWMON is not set +CONFIG_THERMAL=y +CONFIG_WATCHDOG=y +CONFIG_BOOKE_WDT=y +CONFIG_USB=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_OHCI_HCD=y +CONFIG_MMC=y +CONFIG_MMC_DEBUG=y +CONFIG_MMC_SDHCI=y +CONFIG_MMC_SDHCI_PLTFM=y +CONFIG_MMC_SDHCI_OF_ARASAN=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_DRV_M41T80=y +CONFIG_EXT2_FS=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_JFFS2_FS=y +CONFIG_JFFS2_FS_WBUF_VERIFY=y +CONFIG_JFFS2_SUMMARY=y +CONFIG_JFFS2_FS_XATTR=y +CONFIG_CRAMFS=y +CONFIG_NFS_FS=y +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=y +CONFIG_ROOT_NFS=y +CONFIG_NLS_DEFAULT="n" +CONFIG_XZ_DEC=y +CONFIG_PRINTK_TIME=y +CONFIG_MESSAGE_LOGLEVEL_DEFAULT=3 +CONFIG_DYNAMIC_DEBUG=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_FS=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_CRYPTO_CBC=y +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_PCBC=y +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_DES=y +# CONFIG_CRYPTO_HW is not set From 6e2f03e292ef46eed2b31b0a344a91d514f9cd81 Mon Sep 17 00:00:00 2001 From: Ivan Mikhaylov Date: Fri, 
19 May 2017 18:47:05 +0300 Subject: [PATCH 017/158] powerpc/[booke|4xx]: Don't clobber TCR[WP] when setting TCR[DIE] Prevent a kernel panic caused by unintentionally clearing TCR watchdog bits. At this point in the kernel boot, the watchdog may have already been enabled by u-boot. The original code's attempt to write to the TCR register results in an inadvertent clearing of the watchdog configuration bits, causing the 476 to reset. Signed-off-by: Ivan Mikhaylov Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/time.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 2b33cfaac7b8..60714b8c9a2f 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -739,12 +739,20 @@ static int __init get_freq(char *name, int cells, unsigned long *val) static void start_cpu_decrementer(void) { #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) + unsigned int tcr; + /* Clear any pending timer interrupts */ mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS); - /* Enable decrementer interrupt */ - mtspr(SPRN_TCR, TCR_DIE); -#endif /* defined(CONFIG_BOOKE) || defined(CONFIG_40x) */ + tcr = mfspr(SPRN_TCR); + /* + * The watchdog may have already been enabled by u-boot. So leave + * TCR[WP] (Watchdog Period) alone. + */ + tcr &= TCR_WP_MASK; /* Clear all bits except for TCR[WP] */ + tcr |= TCR_DIE; /* Enable decrementer */ + mtspr(SPRN_TCR, tcr); +#endif } void __init generic_calibrate_decr(void) From b87901e6ec782b881224e1723eb142ff4b61bf70 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Fri, 26 May 2017 16:19:46 +1000 Subject: [PATCH 018/158] powerpc: Use the asm-generic versions of some uapi includes These are completely obvious as all they do is include the asm-generic versions. 
Signed-off-by: Stephen Rothwell Acked-by: Arnd Bergmann Signed-off-by: Michael Ellerman --- arch/powerpc/include/uapi/asm/Kbuild | 5 +++++ arch/powerpc/include/uapi/asm/param.h | 1 - arch/powerpc/include/uapi/asm/poll.h | 1 - arch/powerpc/include/uapi/asm/resource.h | 1 - arch/powerpc/include/uapi/asm/statfs.h | 6 ------ 5 files changed, 5 insertions(+), 9 deletions(-) delete mode 100644 arch/powerpc/include/uapi/asm/param.h delete mode 100644 arch/powerpc/include/uapi/asm/poll.h delete mode 100644 arch/powerpc/include/uapi/asm/resource.h delete mode 100644 arch/powerpc/include/uapi/asm/statfs.h diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild index b15bf6bc0e94..9aba4af4122e 100644 --- a/arch/powerpc/include/uapi/asm/Kbuild +++ b/arch/powerpc/include/uapi/asm/Kbuild @@ -1,2 +1,7 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm + +generic-y += param.h +generic-y += poll.h +generic-y += resource.h +generic-y += statfs.h diff --git a/arch/powerpc/include/uapi/asm/param.h b/arch/powerpc/include/uapi/asm/param.h deleted file mode 100644 index 965d45427975..000000000000 --- a/arch/powerpc/include/uapi/asm/param.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/param.h> diff --git a/arch/powerpc/include/uapi/asm/poll.h b/arch/powerpc/include/uapi/asm/poll.h deleted file mode 100644 index c98509d3149e..000000000000 --- a/arch/powerpc/include/uapi/asm/poll.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/poll.h> diff --git a/arch/powerpc/include/uapi/asm/resource.h b/arch/powerpc/include/uapi/asm/resource.h deleted file mode 100644 index 04bc4db8921b..000000000000 --- a/arch/powerpc/include/uapi/asm/resource.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/resource.h> diff --git a/arch/powerpc/include/uapi/asm/statfs.h b/arch/powerpc/include/uapi/asm/statfs.h deleted file mode 100644 index 5244834583a4..000000000000 --- a/arch/powerpc/include/uapi/asm/statfs.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_POWERPC_STATFS_H -#define _ASM_POWERPC_STATFS_H - -#include <asm-generic/statfs.h> - -#endif From 308d263d3f28e816b0d7425ef8b35d69643b7ee0 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Fri, 26 May 2017 18:32:29 +1000 Subject: [PATCH 019/158] powerpc: Use uapi/asm-generic/sockios.h The arch version is identical except for comments and white space. Signed-off-by: Stephen Rothwell Acked-by: Arnd Bergmann Signed-off-by: Michael Ellerman --- arch/powerpc/include/uapi/asm/Kbuild | 1 + arch/powerpc/include/uapi/asm/sockios.h | 20 -------------------- 2 files changed, 1 insertion(+), 20 deletions(-) delete mode 100644 arch/powerpc/include/uapi/asm/sockios.h diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild index 9aba4af4122e..0d960ef78a9a 100644 --- a/arch/powerpc/include/uapi/asm/Kbuild +++ b/arch/powerpc/include/uapi/asm/Kbuild @@ -4,4 +4,5 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += param.h generic-y += poll.h generic-y += resource.h +generic-y += sockios.h generic-y += statfs.h diff --git a/arch/powerpc/include/uapi/asm/sockios.h b/arch/powerpc/include/uapi/asm/sockios.h deleted file mode 100644 index 55cef7675a31..000000000000 --- a/arch/powerpc/include/uapi/asm/sockios.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _ASM_POWERPC_SOCKIOS_H -#define _ASM_POWERPC_SOCKIOS_H - -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ -/* Socket-level I/O control calls. */ -#define FIOSETOWN 0x8901 -#define SIOCSPGRP 0x8902 -#define FIOGETOWN 0x8903 -#define SIOCGPGRP 0x8904 -#define SIOCATMARK 0x8905 -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ - -#endif /* _ASM_POWERPC_SOCKIOS_H */ From e8c688251d0e8baca1cd68992c9ef4078a0361c8 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 12 May 2017 01:56:48 +1000 Subject: [PATCH 020/158] powerpc/64: Place sfpr section explicitly with the linker script Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/vmlinux.lds.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 2f793be3d2b1..bcfda21c3179 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -115,6 +115,14 @@ SECTIONS KPROBES_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT + /* + * -Os builds call FP save/restore functions. The powerpc64 + * linker generates those on demand in the .sfpr section. + * .sfpr gets placed at the beginning of a group of input + * sections, which can break start-of-text offset if it is + * included with the main text sections, so put it by itself. + */ + *(.sfpr); MEM_KEEP(init.text) MEM_KEEP(exit.text) From baa25b571a168aff5a13bfdc973f1229e2b12b63 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 12 May 2017 01:56:49 +1000 Subject: [PATCH 021/158] powerpc/64: Do not link crtsavres.o in vmlinux The 64-bit linker creates save/restore functions on demand with final links, so vmlinux does not require crtsavres.o. Make crtsavres.o extra-y on 64-bit (it is still required by modules). Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/lib/Makefile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index ed7dfce331e0..0ca405a27ac1 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -9,8 +9,12 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) CFLAGS_REMOVE_code-patching.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE) -obj-y += string.o alloc.o crtsavres.o code-patching.o \ - feature-fixups.o +obj-y += string.o alloc.o code-patching.o feature-fixups.o + +# 64-bit linker creates .sfpr on demand for final link (vmlinux), +# so it is only needed for modules. +obj-$(CONFIG_PPC32) += crtsavres.o +extra-$(CONFIG_PPC64) += crtsavres.o obj-$(CONFIG_PPC32) += div64.o copy_32.o From 7c868b66f8cc29bb9a63632b324a6fa661bdec05 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 12 May 2017 01:56:50 +1000 Subject: [PATCH 022/158] powerpc/64: Do not link crtsavres.o in boot crtsavres.S is empty with 64-bit builds already, so just don't build and link it to match the vmlinux build. 
Signed-off-by: Nicholas Piggin [mpe: Use CONFIG_PPC64_BOOT_WRAPPER not CONFIG_PPC32 to fix BE build] Signed-off-by: Michael Ellerman --- arch/powerpc/boot/Makefile | 5 ++++- arch/powerpc/boot/crtsavres.S | 8 ++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index e82f333cc84a..a7814a7b1523 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -95,13 +95,16 @@ libfdtheader := fdt.h libfdt.h libfdt_internal.h $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \ $(addprefix $(obj)/,$(libfdtheader)) -src-wlib-y := string.S crt0.S crtsavres.S stdio.c decompress.c main.c \ +src-wlib-y := string.S crt0.S stdio.c decompress.c main.c \ $(libfdt) libfdt-wrapper.c \ ns16550.c serial.c simple_alloc.c div64.S util.S \ elf_util.c $(zlib-y) devtree.c stdlib.c \ oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \ uartlite.c mpc52xx-psc.c opal.c src-wlib-$(CONFIG_PPC64_BOOT_WRAPPER) += opal-calls.S +ifndef CONFIG_PPC64_BOOT_WRAPPER +src-wlib-y += crtsavres.S +endif src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c src-wlib-$(CONFIG_44x) += 4xx.c ebony.c bamboo.c src-wlib-$(CONFIG_8xx) += mpc8xx.c planetcore.c fsl-soc.c diff --git a/arch/powerpc/boot/crtsavres.S b/arch/powerpc/boot/crtsavres.S index f3d9b35c07d4..085fb2b9a8b8 100644 --- a/arch/powerpc/boot/crtsavres.S +++ b/arch/powerpc/boot/crtsavres.S @@ -37,12 +37,13 @@ * the executable file might be covered by the GNU General Public License. */ +#ifdef __powerpc64__ +#error "On PPC64, FPR save/restore functions are provided by the linker." +#endif + .file "crtsavres.S" .section ".text" -/* On PowerPC64 Linux, these functions are provided by the linker. */ -#ifndef __powerpc64__ - #define _GLOBAL(name) \ .type name,@function; \ .globl name; \ @@ -230,4 +231,3 @@ _GLOBAL(_rest32gpr_31_x) mtlr 0 mr 1,11 blr -#endif From cde9f2f4207f769c0f8f0f53a431348ca6f3c182 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 12 May 2017 01:56:51 +1000 Subject: [PATCH 023/158] powerpc/64: Do not create new section for save/restore functions There is no need to create a new section for these. Consolidate with 32-bit and just use .text. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/lib/crtsavres.S | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/lib/crtsavres.S b/arch/powerpc/lib/crtsavres.S index 18af0b3d3eb2..7e5e1c28e56a 100644 --- a/arch/powerpc/lib/crtsavres.S +++ b/arch/powerpc/lib/crtsavres.S @@ -44,10 +44,10 @@ #ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE -#ifndef CONFIG_PPC64 - .section ".text" +#ifndef CONFIG_PPC64 + /* Routines for saving integer registers, called by the compiler. */ /* Called with r11 pointing to the stack header word of the caller of the */ /* function, just beyond the end of the integer save area. */ @@ -314,8 +314,6 @@ _GLOBAL(_restvr_31) #else /* CONFIG_PPC64 */ - .section ".text.save.restore","ax",@progbits - .globl _savegpr0_14 _savegpr0_14: std r14,-144(r1) From efe0160cfd40a99c052a00e174787c1f4158a9cd Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 12 May 2017 01:56:52 +1000 Subject: [PATCH 024/158] powerpc/64: Linker on-demand sfpr functions for modules For final link, the powerpc64 linker generates fpr save/restore functions on-demand, placing them in the .sfpr section. Starting with binutils 2.25, these can be provided for non-final links with --save-restore-funcs. Use that where possible for module links. 
This saves about 200 bytes per module (~60kB) on powernv defconfig build. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 10 ++++++++++ arch/powerpc/lib/Makefile | 13 ++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 3e0f0e1fadef..eaa1865e4a8d 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -189,7 +189,17 @@ else CHECKFLAGS += -D__LITTLE_ENDIAN__ endif +ifdef CONFIG_PPC32 KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o +else +ifeq ($(call ld-ifversion, -ge, 225000000, y),y) +# Have the linker provide sfpr if possible. +# There is a corresponding test in arch/powerpc/lib/Makefile +KBUILD_LDFLAGS_MODULE += --save-restore-funcs +else +KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o +endif +endif ifeq ($(CONFIG_476FPE_ERR46),y) KBUILD_LDFLAGS_MODULE += --ppc476-workaround \ diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 0ca405a27ac1..2c56f4636c2b 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -11,12 +11,15 @@ CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE) obj-y += string.o alloc.o code-patching.o feature-fixups.o -# 64-bit linker creates .sfpr on demand for final link (vmlinux), -# so it is only needed for modules. -obj-$(CONFIG_PPC32) += crtsavres.o -extra-$(CONFIG_PPC64) += crtsavres.o +obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o -obj-$(CONFIG_PPC32) += div64.o copy_32.o +# See corresponding test in arch/powerpc/Makefile +# 64-bit linker creates .sfpr on demand for final link (vmlinux), +# so it is only needed for modules, and only for older linkers which +# do not support --save-restore-funcs +ifeq ($(call ld-ifversion, -lt, 225000000, y),y) +extra-$(CONFIG_PPC64) += crtsavres.o +endif obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ copyuser_power7.o string_64.o copypage_power7.o memcpy_power7.o \ From 4ea80652dc75482dca1739762075dd5caa57ae29 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 12 May 2017 03:40:38 +1000 Subject: [PATCH 025/158] powerpc/64s: Tool to flag direct branches from unrelocated interrupt vectors Direct branches from code below __end_interrupts to code above __end_interrupts when built with CONFIG_RELOCATABLE are disallowed because they will break when the kernel is not located at 0.
Sample output: WARNING: Unrelocated relative branches c000000000000118 bl-> 0xc000000000038fb8 c00000000000013c b-> 0xc0000000001068a4 c000000000000148 b-> 0xc00000000003919c c00000000000014c b-> 0xc00000000003923c c0000000000005a4 b-> 0xc000000000106ffc c000000000001af0 b-> 0xc000000000106ffc c000000000001b24 b-> 0xc000000000106ffc c000000000001b58 b-> 0xc000000000106ffc Signed-off-by: Balbir Singh Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile.postlink | 9 +++- arch/powerpc/tools/unrel_branch_check.sh | 57 ++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) create mode 100755 arch/powerpc/tools/unrel_branch_check.sh diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink index eccfcc88afae..50336930e6f7 100644 --- a/arch/powerpc/Makefile.postlink +++ b/arch/powerpc/Makefile.postlink @@ -11,7 +11,14 @@ __archpost: include scripts/Kbuild.include quiet_cmd_relocs_check = CHKREL $@ - cmd_relocs_check = $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$@" +ifdef CONFIG_PPC_BOOK3S_64 + cmd_relocs_check = \ + $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$@" ; \ + $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/unrel_branch_check.sh "$(OBJDUMP)" "$@" +else + cmd_relocs_check = \ + $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$@" +endif # `@true` prevents complaint when there is nothing to be done diff --git a/arch/powerpc/tools/unrel_branch_check.sh b/arch/powerpc/tools/unrel_branch_check.sh new file mode 100755 index 000000000000..1e972df3107e --- /dev/null +++ b/arch/powerpc/tools/unrel_branch_check.sh @@ -0,0 +1,57 @@ +# Copyright © 2016 IBM Corporation +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version +# 2 of the License, or (at your option) any later version. +# +# This script checks the relocations of a vmlinux for "suspicious" +# branches from unrelocated code (head_64.S code). + +# Turn this on if you want more debug output: +# set -x + +# Have Kbuild supply the path to objdump so we handle cross compilation. 
+objdump="$1" +vmlinux="$2" + +#__end_interrupts should be located within the first 64K + +end_intr=0x$( +"$objdump" -R "$vmlinux" -d --start-address=0xc000000000000000 \ + --stop-address=0xc000000000010000 | +grep '\<__end_interrupts>:' | +awk '{print $1}' +) + +BRANCHES=$( +"$objdump" -R "$vmlinux" -D --start-address=0xc000000000000000 \ + --stop-address=${end_intr} | +grep -e "^c[0-9a-f]*:[[:space:]]*\([0-9a-f][0-9a-f][[:space:]]\)\{4\}[[:space:]]*b" | +grep -v '\<__start_initialization_multiplatform>' | +grep -v -e 'b.\?.\?ctr' | +grep -v -e 'b.\?.\?lr' | +sed 's/://' | +awk '{ print $1 ":" $6 ":0x" $7 ":" $8 " "}' +) + +for tuple in $BRANCHES +do + from=`echo $tuple | cut -d':' -f1` + branch=`echo $tuple | cut -d':' -f2` + to=`echo $tuple | cut -d':' -f3 | sed 's/cr[0-7],//'` + sym=`echo $tuple | cut -d':' -f4` + + if (( $to > $end_intr )) + then + if [ -z "$bad_branches" ]; then + echo "WARNING: Unrelocated relative branches" + bad_branches="yes" + fi + echo "$from $branch-> $to $sym" + fi +done + +if [ -z "$bad_branches" ]; then + exit 0 +fi From 951eedebcdea06fdcc742c82dc347509ce0e1ba4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 29 May 2017 17:39:40 +1000 Subject: [PATCH 026/158] powerpc/64: Handle linker stubs in low .text code Very large kernels may require linker stubs for branches from HEAD text code. The linker may place these stubs before the HEAD text sections, which breaks the assumption that HEAD text is located at 0 (or the .text section being located at 0x7000/0x8000 on Book3S kernels). Provide an option to create a small section just before the .text section with an empty 256 - 4 bytes, and adjust the start of the .text section to match. The linker will tend to put stubs in that section and not break our relative-to-absolute offset assumptions. This causes a small waste of space on common kernels, but allows large kernels to build and boot. For now, it is an EXPERT config option, defaulting to =n, but a reference is provided for it in the build-time check for such breakage. This is good enough for allyesconfig and custom users / hackers. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 11 +++++++++++ arch/powerpc/include/asm/head-64.h | 18 ++++++++++++++++++ arch/powerpc/kernel/vmlinux.lds.S | 5 +++++ 3 files changed, 34 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d090275ace44..0153275e9f75 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -455,6 +455,17 @@ config PPC_TRANSACTIONAL_MEM ---help--- Support user-mode Transactional Memory on POWERPC. +config LD_HEAD_STUB_CATCH + bool "Reserve 256 bytes to cope with linker stubs in HEAD text" if EXPERT + depends on PPC64 + default n + help + Very large kernels can cause linker branch stubs to be generated by + code in head_64.S, which moves the head text sections out of their + specified location. This option can work around the problem. + + If unsure, say "N". + config DISABLE_MPROFILE_KERNEL bool "Disable use of mprofile-kernel for kernel tracing" depends on PPC64 && CPU_LITTLE_ENDIAN diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h index 86eb87382031..68828aa6e056 100644 --- a/arch/powerpc/include/asm/head-64.h +++ b/arch/powerpc/include/asm/head-64.h @@ -63,11 +63,29 @@ . = 0x0; \ start_##sname: +/* + * .linker_stub_catch section is used to catch linker stubs from being + * inserted in our .text section, above the start_text label (which breaks + * the ABS_ADDR calculation). 
See kernel/vmlinux.lds.S and tools/head_check.sh + * for more details. We would prefer to just keep a cacheline (0x80), but + * 0x100 seems to be how the linker aligns branch stub groups. + */ +#ifdef CONFIG_LD_HEAD_STUB_CATCH +#define OPEN_TEXT_SECTION(start) \ + .section ".linker_stub_catch","ax",@progbits; \ +linker_stub_catch: \ + . = 0x4; \ + text_start = (start) + 0x100; \ + .section ".text","ax",@progbits; \ + .balign 0x100; \ +start_text: +#else #define OPEN_TEXT_SECTION(start) \ text_start = (start); \ .section ".text","ax",@progbits; \ . = 0x0; \ start_text: +#endif #define ZERO_FIXED_SECTION(sname, start, end) \ sname##_start = (start); \ diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index bcfda21c3179..be8578e02ab6 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -103,6 +103,11 @@ SECTIONS * section placement to work. */ .text BLOCK(0) : AT(ADDR(.text) - LOAD_OFFSET) { +#ifdef CONFIG_LD_HEAD_STUB_CATCH + *(.linker_stub_catch); + . = . ; +#endif + #else .text : AT(ADDR(.text) - LOAD_OFFSET) { ALIGN_FUNCTION(); From c494adefef9fcd0de172132e20f102d44c62fa2f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 12 May 2017 03:40:39 +1000 Subject: [PATCH 027/158] powerpc/64: Tool to check head sections location sanity Use a tool to check that the location of "fixed sections" are where we expected them to be, which catches cases the linker script can't (stubs being added to start of .text section), and which ends up being neater. Sample output: ERROR: start_text address is c000000000008100, should be c000000000008000 ERROR: see comments in arch/powerpc/tools/head_check.sh Signed-off-by: Nicholas Piggin [mpe: Fold in fix from Nick for 4.6 era toolchains] Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile.postlink | 8 ++- arch/powerpc/include/asm/head-64.h | 4 +- arch/powerpc/kernel/vmlinux.lds.S | 22 --------- arch/powerpc/tools/head_check.sh | 78 ++++++++++++++++++++++++++++++ 4 files changed, 87 insertions(+), 25 deletions(-) create mode 100644 arch/powerpc/tools/head_check.sh diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink index 50336930e6f7..5db43ebbe2df 100644 --- a/arch/powerpc/Makefile.postlink +++ b/arch/powerpc/Makefile.postlink @@ -10,6 +10,9 @@ __archpost: -include include/config/auto.conf include scripts/Kbuild.include +quiet_cmd_head_check = CHKHEAD $@ + cmd_head_check = $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/head_check.sh "$(NM)" "$@" + quiet_cmd_relocs_check = CHKREL $@ ifdef CONFIG_PPC_BOOK3S_64 cmd_relocs_check = \ @@ -24,6 +27,9 @@ endif vmlinux: FORCE @true +ifdef CONFIG_PPC64 + $(call cmd,head_check) +endif ifdef CONFIG_RELOCATABLE $(call if_changed,relocs_check) endif @@ -32,7 +38,7 @@ endif @true clean: - @true + rm -f .tmp_symbols.txt PHONY += FORCE clean diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h index 68828aa6e056..7ab95798f170 100644 --- a/arch/powerpc/include/asm/head-64.h +++ b/arch/powerpc/include/asm/head-64.h @@ -49,8 +49,8 @@ * CLOSE_FIXED_SECTION() or elsewhere, there may be something * unexpected being added there. Remove the '. = x_len' line, rebuild, and * check what is pushing the section down. - * - If the build dies in linking, check arch/powerpc/kernel/vmlinux.lds.S - * for instructions. + * - If the build dies in linking, check arch/powerpc/tools/head_check.sh + * comments. 
* - If the kernel crashes or hangs in very early boot, it could be linker * stubs at the start of the main text. */ diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index be8578e02ab6..e69155f0db36 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -58,7 +58,6 @@ SECTIONS #ifdef CONFIG_PPC64 KEEP(*(.head.text.first_256B)); #ifdef CONFIG_PPC_BOOK3E -# define END_FIXED 0x100 #else KEEP(*(.head.text.real_vectors)); *(.head.text.real_trampolines); @@ -66,12 +65,8 @@ SECTIONS *(.head.text.virt_trampolines); # if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) KEEP(*(.head.data.fwnmi_page)); -# define END_FIXED 0x8000 -# else -# define END_FIXED 0x7000 # endif #endif - ASSERT((. == END_FIXED), "vmlinux.lds.S: fixed section overflow error"); #else /* !CONFIG_PPC64 */ HEAD_TEXT #endif @@ -79,23 +74,6 @@ SECTIONS __head_end = .; - /* - * If the build dies here, it's likely code in head_64.S is referencing - * labels it can't reach, and the linker inserting stubs without the - * assembler's knowledge. To debug, remove the above assert and - * rebuild. Look for branch stubs in the fixed section region. - * - * Linker stub generation could be allowed in "trampoline" - * sections if absolutely necessary, but this would require - * some rework of the fixed sections. Before resorting to this, - * consider references that have sufficient addressing range, - * (e.g., hand coded trampolines) so the linker does not have - * to add stubs. - * - * Linker stubs at the top of the main text section are currently not - * detected, and will result in a crash at boot due to offsets being - * wrong. - */ #ifdef CONFIG_PPC64 /* * BLOCK(0) overrides the default output section alignment because diff --git a/arch/powerpc/tools/head_check.sh b/arch/powerpc/tools/head_check.sh new file mode 100644 index 000000000000..ad9e57209aa4 --- /dev/null +++ b/arch/powerpc/tools/head_check.sh @@ -0,0 +1,78 @@ +# Copyright © 2016 IBM Corporation + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version +# 2 of the License, or (at your option) any later version. + +# This script checks the head of a vmlinux for linker stubs that +# break our placement of fixed-location code for 64-bit. + +# based on relocs_check.pl +# Copyright © 2009 IBM Corporation + +# NOTE! +# +# If the build dies here, it's likely code in head_64.S/exception-64*.S or +# nearby, is branching to labels it can't reach directly, which results in the +# linker inserting branch stubs. This can move code around in ways that break +# the fixed section calculations (head-64.h). To debug this, disassemble the +# vmlinux and look for branch stubs (long_branch, plt_branch, etc.) in the +# fixed section region (0 - 0x8000ish). Check what code is calling those stubs, +# and perhaps change so a direct branch can reach. +# +# A ".linker_stub_catch" section is used to catch some stubs generated by +# early .text code, which tend to get placed at the start of the section. +# If there are too many such stubs, they can overflow this section. Expanding +# it may help (or reducing the number of stub branches). +# +# Linker stubs use the TOC pointer, so even if fixed section code could +# tolerate them being inserted into head code, they can't be allowed in low +# level entry code (boot, interrupt vectors, etc) until r2 is set up. 
This +# could cause the kernel to die in early boot. + +# Turn this on if you want more debug output: +# set -x + +if [ $# -lt 2 ]; then + echo "$0 [path to nm] [path to vmlinux]" 1>&2 + exit 1 +fi + +# Have Kbuild supply the path to nm so we handle cross compilation. +nm="$1" +vmlinux="$2" + +# gcc-4.6-era toolchain make _stext an A (absolute) symbol rather than T +$nm "$vmlinux" | grep -e " [TA] _stext$" -e " t start_first_256B$" -e " a text_start$" -e " t start_text$" -m4 > .tmp_symbols.txt + + +vma=$(cat .tmp_symbols.txt | grep -e " [TA] _stext$" | cut -d' ' -f1) + +expected_start_head_addr=$vma + +start_head_addr=$(cat .tmp_symbols.txt | grep " t start_first_256B$" | cut -d' ' -f1) + +if [ "$start_head_addr" != "$expected_start_head_addr" ]; then + echo "ERROR: head code starts at $start_head_addr, should be $expected_start_head_addr" + echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" + echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" + + exit 1 +fi + +top_vma=$(echo $vma | cut -d'0' -f1) + +expected_start_text_addr=$(cat .tmp_symbols.txt | grep " a text_start$" | cut -d' ' -f1 | sed "s/^0/$top_vma/") + +start_text_addr=$(cat .tmp_symbols.txt | grep " t start_text$" | cut -d' ' -f1) + +if [ "$start_text_addr" != "$expected_start_text_addr" ]; then + echo "ERROR: start_text address is $start_text_addr, should be $expected_start_text_addr" + echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" + echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" + + exit 1 +fi + +rm -f .tmp_symbols.txt From 83a092cf95f28696ddc36c8add0cf03ac034897f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 12 May 2017 03:40:40 +1000 Subject: [PATCH 028/158] powerpc: Link warning for orphan sections Add --orphan-handling=warn to final link flags. This ensures we can handle all sections explicitly. This would have caught subtle breakage such as 7de3b27bac47da9de08409df1d69664acbb72197 at build-time. Also bring existing orphan sections into the fold: - .text.hot and .text.unlikely are compiler generated sections. - .sdata2, .dynsbss, .plt are used by PPC32 - We previously did not specify DWARF_DEBUG or STABS_DEBUG - DWARF_DEBUG did not include all DWARF sections that can be emitted - A number of sections are unused and can be discarded. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 1 + arch/powerpc/kernel/vmlinux.lds.S | 16 ++++++++++++++-- include/asm-generic/vmlinux.lds.h | 12 ++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index eaa1865e4a8d..8d4ed73d5490 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -98,6 +98,7 @@ endif LDFLAGS_vmlinux-y := -Bstatic LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie LDFLAGS_vmlinux := $(LDFLAGS_vmlinux-y) +LDFLAGS_vmlinux += $(call ld-option,--orphan-handling=warn) ifeq ($(CONFIG_PPC64),y) ifeq ($(call cc-option-yn,-mcmodel=medium),y) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index e69155f0db36..ace6b6579961 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -91,7 +91,7 @@ SECTIONS ALIGN_FUNCTION(); #endif /* careful! 
__ftr_alt_* sections need to be close to .text */ - *(.text .fixup __ftr_alt_* .ref.text) + *(.text.hot .text .text.fixup .text.unlikely .fixup __ftr_alt_* .ref.text); SCHED_TEXT CPUIDLE_TEXT LOCK_TEXT @@ -258,7 +258,9 @@ SECTIONS .data : AT(ADDR(.data) - LOAD_OFFSET) { DATA_DATA *(.sdata) + *(.sdata2) *(.got.plt) *(.got) + *(.plt) } #else .data : AT(ADDR(.data) - LOAD_OFFSET) { @@ -321,6 +323,16 @@ SECTIONS _end = . ; PROVIDE32 (end = .); - /* Sections to be discarded. */ + STABS_DEBUG + + DWARF_DEBUG + DISCARDS + /DISCARD/ : { + *(*.EMB.apuinfo) + *(.glink .iplt .plt .rela* .comment) + *(.gnu.version*) + *(.gnu.attributes) + *(.eh_frame) + } } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 314a0b9219c6..9862afb3ae05 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -595,6 +595,7 @@ #define SBSS(sbss_align) \ . = ALIGN(sbss_align); \ .sbss : AT(ADDR(.sbss) - LOAD_OFFSET) { \ + *(.dynsbss) \ *(.sbss) \ *(.scommon) \ } @@ -641,11 +642,22 @@ .debug_str 0 : { *(.debug_str) } \ .debug_loc 0 : { *(.debug_loc) } \ .debug_macinfo 0 : { *(.debug_macinfo) } \ + .debug_pubtypes 0 : { *(.debug_pubtypes) } \ + /* DWARF 3 */ \ + .debug_ranges 0 : { *(.debug_ranges) } \ /* SGI/MIPS DWARF 2 extensions */ \ .debug_weaknames 0 : { *(.debug_weaknames) } \ .debug_funcnames 0 : { *(.debug_funcnames) } \ .debug_typenames 0 : { *(.debug_typenames) } \ .debug_varnames 0 : { *(.debug_varnames) } \ + /* GNU DWARF 2 extensions */ \ + .debug_gnu_pubnames 0 : { *(.debug_gnu_pubnames) } \ + .debug_gnu_pubtypes 0 : { *(.debug_gnu_pubtypes) } \ + /* DWARF 4 */ \ + .debug_types 0 : { *(.debug_types) } \ + /* DWARF 5 */ \ + .debug_macro 0 : { *(.debug_macro) } \ + .debug_addr 0 : { *(.debug_addr) } /* Stabs debugging sections. 
*/ #define STABS_DEBUG \ From e63739b1687ea37390e5463f43d0547477d324e3 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Thu, 25 May 2017 13:36:50 +1000 Subject: [PATCH 029/158] powerpc/mm/ptdump: Dump the first entry of the linear mapping as well The check in hpte_find() should be < and not <= for PAGE_OFFSET. Signed-off-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/mm/dump_hashpagetable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c index c6b900f54c07..b1c144b03fcf 100644 --- a/arch/powerpc/mm/dump_hashpagetable.c +++ b/arch/powerpc/mm/dump_hashpagetable.c @@ -335,7 +335,7 @@ static unsigned long hpte_find(struct pg_state *st, unsigned long ea, int psize) unsigned long rpn, lp_bits; int base_psize = 0, actual_psize = 0; - if (ea <= PAGE_OFFSET) + if (ea < PAGE_OFFSET) return -1; /* Look in primary table */ From 9affa9e228d3ece66ed322909e84cfb08f6c4c64 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 29 May 2017 17:32:06 +0200 Subject: [PATCH 030/158] powerpc/mm: Remove __this_fixmap_does_not_exist() This function has not been used since commit 9494a1e8428ea ("powerpc: use generic fixmap.h") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable_32.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index a65c0b4c0669..cf32c5c6fe28 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -390,8 +390,3 @@ void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) map_page(address, phys, pgprot_val(flags)); fixmaps++; } - -void __this_fixmap_does_not_exist(void) -{ - WARN_ON(1); -} From e8de85ca32f572f5dee00733022d8a1ce87aed3d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 19 Apr 2017 14:56:24 +0200 Subject: [PATCH 031/158] powerpc/mm: Only call store_updates_sp() on stores in do_page_fault() Function store_updates_sp() checks whether the faulting instruction is a store updating r1. Therefore we can limit its calls to store exceptions. This patch is an improvement of commit a7a9dcd882a67 ("powerpc: Avoid taking a data miss on every userspace instruction miss"). With the same microbenchmark app, run with 500 as argument, on an MPC885 we get: Before this patch: 152000 DTLB misses After this patch: 147000 DTLB misses Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 3a7d580fdc59..67fefb59d40e 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -287,7 +287,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * can result in fault, which will cause a deadlock when called with * mmap_sem held */ - if (!is_exec && user_mode(regs)) + if (is_write && user_mode(regs)) store_update_sp = store_updates_sp(regs); if (user_mode(regs)) From 97a011e69b42bec8ac10f8510d3cd73b50882d88 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 19 Apr 2017 14:56:28 +0200 Subject: [PATCH 032/158] powerpc/mm: Remove a redundant test in do_page_fault() The result of (trap == 0x400) is already in is_exec.
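For illustration, the redundant test in context (pieced together from the do_page_fault() hunks elsewhere in this series; is_exec is computed once near the top of the function):

	int trap = TRAP(regs);
	int is_exec = trap == 0x400;
	...
	if (is_exec)		/* previously: if (trap == 0x400) */
		error_code &= 0x48200000;
	else
		is_write = error_code & DSISR_ISSTORE;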
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 67fefb59d40e..059e762e8995 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -216,7 +216,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * bits we are interested in. But there are some bits which * indicate errors in DSISR but can validly be set in SRR1. */ - if (trap == 0x400) + if (is_exec) error_code &= 0x48200000; else is_write = error_code & DSISR_ISSTORE; From da929f6af4689c75868dc373b4549f53945b5af0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 19 Apr 2017 14:56:30 +0200 Subject: [PATCH 033/158] powerpc/mm: Evaluate user_mode(regs) only once in do_page_fault() Analysis of the assembly code shows that when using user_mode(regs), at least the 'andi.' is redone all the time, and also the 'lwz ,132(r31)' most of the time. With the new form, the 'is_user' is mapped to cr4, then all further use of is_user results in just things like 'beq cr4,218 ' Without the patch: 50: 81 1e 00 84 lwz r8,132(r30) 54: 71 09 40 00 andi. r9,r8,16384 58: 40 82 00 0c bne 64 84: 81 3e 00 84 lwz r9,132(r30) 8c: 71 2a 40 00 andi. r10,r9,16384 90: 41 a2 01 64 beq 1f4 d4: 81 3e 00 84 lwz r9,132(r30) dc: 71 28 40 00 andi. r8,r9,16384 e0: 41 82 02 08 beq 2e8 108: 81 3e 00 84 lwz r9,132(r30) 110: 71 28 40 00 andi. r8,r9,16384 118: 41 82 02 28 beq 340 1e4: 81 3e 00 84 lwz r9,132(r30) 1e8: 71 2a 40 00 andi. r10,r9,16384 1ec: 40 82 01 68 bne 354 228: 81 3e 00 84 lwz r9,132(r30) 22c: 71 28 40 00 andi. r8,r9,16384 230: 41 82 ff c4 beq 1f4 288: 71 2a 40 00 andi. r10,r9,16384 294: 41 a2 fe 60 beq f4 50c: 81 3e 00 84 lwz r9,132(r30) 514: 71 2a 40 00 andi. r10,r9,16384 518: 40 a2 fc e0 bne 1f8 534: 81 3e 00 84 lwz r9,132(r30) 53c: 71 2a 40 00 andi. r10,r9,16384 540: 41 82 fc b8 beq 1f8 This patch creates a local var called 'is_user' which contains the result of user_mode(regs) With the patch: 20: 81 03 00 84 lwz r8,132(r3) 48: 55 09 97 fe rlwinm r9,r8,18,31,31 58: 2e 09 00 00 cmpwi cr4,r9,0 5c: 40 92 00 0c bne cr4,68 88: 41 b2 01 90 beq cr4,218 d4: 40 92 01 d0 bne cr4,2a4 120: 41 b2 00 f8 beq cr4,218 138: 41 b2 ff a0 beq cr4,d8 1d4: 40 92 00 e0 bne cr4,2b4 Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/fault.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 059e762e8995..ab9622120696 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -206,6 +206,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, int is_write = 0; int trap = TRAP(regs); int is_exec = trap == 0x400; + int is_user = user_mode(regs); int fault; int rc = 0, store_update_sp = 0; @@ -247,7 +248,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * The kernel should never take an execute fault nor should it * take a page fault to a kernel address. 
*/ - if (!user_mode(regs) && (is_exec || (address >= TASK_SIZE))) { + if (!is_user && (is_exec || (address >= TASK_SIZE))) { rc = SIGSEGV; goto bail; } @@ -266,7 +267,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, local_irq_enable(); if (faulthandler_disabled() || mm == NULL) { - if (!user_mode(regs)) { + if (!is_user) { rc = SIGSEGV; goto bail; } @@ -287,10 +288,10 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * can result in fault, which will cause a deadlock when called with * mmap_sem held */ - if (is_write && user_mode(regs)) + if (is_write && is_user) store_update_sp = store_updates_sp(regs); - if (user_mode(regs)) + if (is_user) flags |= FAULT_FLAG_USER; /* When running in the kernel we expect faults to occur only to @@ -309,7 +310,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * thus avoiding the deadlock. */ if (!down_read_trylock(&mm->mmap_sem)) { - if (!user_mode(regs) && !search_exception_tables(regs->nip)) + if (!is_user && !search_exception_tables(regs->nip)) goto bad_area_nosemaphore; retry: @@ -509,7 +510,7 @@ bad_area: bad_area_nosemaphore: /* User mode accesses cause a SIGSEGV */ - if (user_mode(regs)) { + if (is_user) { _exception(SIGSEGV, regs, code, address); goto bail; } From 92aa2fe039835a73fec335517652de32eeb58e0a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 19 Apr 2017 14:56:32 +0200 Subject: [PATCH 034/158] powerpc/mm: The 8xx doesn't call do_page_fault() for breakpoints The 8xx has a dedicated exception for breakpoints, that directly calls do_break() Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index ab9622120696..4c422632047b 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -254,7 +254,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, } #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \ - defined(CONFIG_PPC_BOOK3S_64)) + defined(CONFIG_PPC_BOOK3S_64) || defined(CONFIG_PPC_8xx)) if (error_code & DSISR_DABRMATCH) { /* breakpoint match */ do_break(regs, address, error_code); From 362957c27ed0d9ff485d3266ed22d944cbfea6cc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 5 Aug 2016 13:28:05 +0200 Subject: [PATCH 035/158] powerpc/40x: Clear MSR_DR in one insn instead of two Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/misc_32.S | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 84db14e435f5..3f7a9a2d2435 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -244,8 +244,7 @@ _GLOBAL(_nmask_and_or_msr) */ _GLOBAL(real_readb) mfmsr r7 - ori r0,r7,MSR_DR - xori r0,r0,MSR_DR + rlwinm r0,r7,0,~MSR_DR sync mtmsr r0 sync @@ -262,8 +261,7 @@ _GLOBAL(real_readb) */ _GLOBAL(real_writeb) mfmsr r7 - ori r0,r7,MSR_DR - xori r0,r0,MSR_DR + rlwinm r0,r7,0,~MSR_DR sync mtmsr r0 sync From 3c29b6038828c1f4c9ecbfec14d4fc5e25f1c947 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 10 Mar 2017 11:37:01 +0100 Subject: [PATCH 036/158] powerpc/8xx: fix mpc8xx_get_irq() return on no irq IRQ 0 is a valid HW interrupt. So get_irq() shall return 0 when there is no irq, instead of returning irq_linear_revmap(... 
,0) Fixes: f2a0bd3753dad ("[POWERPC] 8xx: powerpc port of core CPM PIC") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/sysdev/mpc8xx_pic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/mpc8xx_pic.c b/arch/powerpc/sysdev/mpc8xx_pic.c index 3e828b20c21e..2842f9d63d21 100644 --- a/arch/powerpc/sysdev/mpc8xx_pic.c +++ b/arch/powerpc/sysdev/mpc8xx_pic.c @@ -79,7 +79,7 @@ unsigned int mpc8xx_get_irq(void) irq = in_be32(&siu_reg->sc_sivec) >> 26; if (irq == PIC_VEC_SPURRIOUS) - irq = 0; + return 0; return irq_linear_revmap(mpc8xx_pic_host, irq); From 45cb08f4791ce6a15c54598b4cb73db4b4b8294f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 16 Mar 2017 09:55:45 +0100 Subject: [PATCH 037/158] powerpc: Handle simultaneous interrupts at once Simultaneous interrupts often occur, for instance with a double Ethernet attachment. With the current implementation, we suffer the cost of kernel entry/exit for each interrupt. This patch introduces a loop in __do_irq() to handle all interrupts at once before returning. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/irq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 5c291df30fe3..ab2ed9afd3c2 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -481,7 +481,11 @@ void __do_irq(struct pt_regs *regs) if (unlikely(!irq)) __this_cpu_inc(irq_stat.spurious_irqs); else - generic_handle_irq(irq); + do { + generic_handle_irq(irq); + + irq = ppc_md.get_irq(); + } while (irq); trace_irq_exit(regs); From f83647d642270f6b9d75736817fb5a66273ec903 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 21 Apr 2017 13:18:46 +0200 Subject: [PATCH 038/158] powerpc: Discard ffs()/__ffs() function and use builtin functions instead With the ffs() function as defined in arch/powerpc/include/asm/bitops.h, GCC will not optimise the code in the case of a constant parameter, as shown by the small example below.
int ffs_test(void) { return 4 << ffs(31); } c0012334 <ffs_test>: c0012334: 39 20 00 01 li r9,1 c0012338: 38 60 00 04 li r3,4 c001233c: 7d 29 00 34 cntlzw r9,r9 c0012340: 21 29 00 20 subfic r9,r9,32 c0012344: 7c 63 48 30 slw r3,r3,r9 c0012348: 4e 80 00 20 blr With this patch, the same function will compile as follows: c0012334 <ffs_test>: c0012334: 38 60 00 08 li r3,8 c0012338: 4e 80 00 20 blr The same happens with __ffs(). For non-constant calls, the generated code is doing the same, although it is slightly different on 64 bits for ffs(): unsigned long test__ffs(unsigned long x) { return __ffs(x); } int testffs(int x) { return ffs(x); } On PPC32, before the patch: 0000003c <test__ffs>: 3c: 7d 23 00 d0 neg r9,r3 40: 7d 23 18 38 and r3,r9,r3 44: 7c 63 00 34 cntlzw r3,r3 48: 20 63 00 1f subfic r3,r3,31 4c: 4e 80 00 20 blr 00000050 <testffs>: 50: 7d 23 00 d0 neg r9,r3 54: 7d 23 18 38 and r3,r9,r3 58: 7c 63 00 34 cntlzw r3,r3 5c: 20 63 00 20 subfic r3,r3,32 60: 4e 80 00 20 blr On PPC32, after the patch: 0000002c <test__ffs>: 2c: 7d 23 00 d0 neg r9,r3 30: 7d 23 18 38 and r3,r9,r3 34: 7c 63 00 34 cntlzw r3,r3 38: 20 63 00 1f subfic r3,r3,31 3c: 4e 80 00 20 blr 00000040 <testffs>: 40: 7d 23 00 d0 neg r9,r3 44: 7d 23 18 38 and r3,r9,r3 48: 7c 63 00 34 cntlzw r3,r3 4c: 20 63 00 20 subfic r3,r3,32 50: 4e 80 00 20 blr On PPC64, before the patch: 0000000000000060 <.test__ffs>: 60: 7c 03 00 d0 neg r0,r3 64: 7c 03 18 38 and r3,r0,r3 68: 7c 63 00 74 cntlzd r3,r3 6c: 20 63 00 3f subfic r3,r3,63 70: 7c 63 07 b4 extsw r3,r3 74: 4e 80 00 20 blr 0000000000000080 <.testffs>: 80: 7c 03 00 d0 neg r0,r3 84: 7c 03 18 38 and r3,r0,r3 88: 7c 63 00 74 cntlzd r3,r3 8c: 20 63 00 40 subfic r3,r3,64 90: 7c 63 07 b4 extsw r3,r3 94: 4e 80 00 20 blr On PPC64, after the patch: 0000000000000050 <.test__ffs>: 50: 7c 03 00 d0 neg r0,r3 54: 7c 03 18 38 and r3,r0,r3 58: 7c 63 00 74 cntlzd r3,r3 5c: 20 63 00 3f subfic r3,r3,63 60: 4e 80 00 20 blr 0000000000000070 <.testffs>: 70: 7c 03 00 d0 neg r0,r3 74: 7c 03 18 38 and r3,r0,r3 78: 7c 63 00 34 cntlzw r3,r3 7c: 20 63 00 20 subfic r3,r3,32 80: 7c 63 07 b4 extsw r3,r3 84: 4e 80 00 20 blr (ffs() operates on an int so cntlzw is equivalent to cntlzd) In addition, when reading the generated vmlinux, we can observe that with the builtin functions, GCC sometimes efficiently spreads the instructions within the generated functions while the inline assembly forces them to remain grouped together. __builtin_ffs() is already used in arch/powerpc/include/asm/page_32.h. Those builtins have been in GCC since at least 3.4.6 (see https://gcc.gnu.org/onlinedocs/gcc-3.4.6/gcc/Other-Builtins.html ) Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/bitops.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 33a24fdd7958..71b05685f3a7 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -253,21 +253,9 @@ static __inline__ unsigned long ffz(unsigned long x) return __ilog2(x & -x); } -static __inline__ unsigned long __ffs(unsigned long x) -{ - return __ilog2(x & -x); -} +#include <asm-generic/bitops/builtin-__ffs.h> -/* - * ffs: find first bit set. This is defined the same way as - * the libc and compiler builtin ffs routines, therefore - * differs in spirit from the above ffz (man ffs). - */ -static __inline__ int ffs(int x) -{ - unsigned long i = (unsigned long)x; - return __ilog2(i & -i) + 1; -} +#include <asm-generic/bitops/builtin-ffs.h> /* * fls: find last (most-significant) bit set.
From 2fcff790dcb419af1545cbd6bba7a04f2d90938f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 21 Apr 2017 13:18:48 +0200 Subject: [PATCH 039/158] powerpc: Use builtin functions for fls()/__fls()/fls64() With the fls() functions as defined in arch/powerpc/include/asm/bitops.h, GCC will not optimise the code in the case of a constant parameter. This patch replaces __fls() by the builtin function, and modifies fls() and fls64() to use builtins instead of inline assembly. For non-constant calls, the generated code is doing the same: int testfls(unsigned int x) { return fls(x); } unsigned long test__fls(unsigned long x) { return __fls(x); } int testfls64(__u64 x) { return fls64(x); } On PPC32, before the patch: 00000064 <testfls>: 64: 7c 63 00 34 cntlzw r3,r3 68: 20 63 00 20 subfic r3,r3,32 6c: 4e 80 00 20 blr 00000070 <test__fls>: 70: 7c 63 00 34 cntlzw r3,r3 74: 20 63 00 1f subfic r3,r3,31 78: 4e 80 00 20 blr 0000007c <testfls64>: 7c: 2c 03 00 00 cmpwi r3,0 80: 40 82 00 10 bne 90 84: 7c 83 00 34 cntlzw r3,r4 88: 20 63 00 20 subfic r3,r3,32 8c: 4e 80 00 20 blr 90: 7c 63 00 34 cntlzw r3,r3 94: 20 63 00 40 subfic r3,r3,64 98: 4e 80 00 20 blr On PPC32, after the patch: 00000054 <testfls>: 54: 7c 63 00 34 cntlzw r3,r3 58: 20 63 00 20 subfic r3,r3,32 5c: 4e 80 00 20 blr 00000060 <test__fls>: 60: 7c 63 00 34 cntlzw r3,r3 64: 20 63 00 1f subfic r3,r3,31 68: 4e 80 00 20 blr 0000006c <testfls64>: 6c: 2c 03 00 00 cmpwi r3,0 70: 41 82 00 10 beq 80 74: 7c 63 00 34 cntlzw r3,r3 78: 20 63 00 40 subfic r3,r3,64 7c: 4e 80 00 20 blr 80: 7c 83 00 34 cntlzw r3,r4 84: 20 63 00 40 subfic r3,r3,32 88: 4e 80 00 20 blr On PPC64, before the patch: 00000000000000a0 <.testfls>: a0: 7c 63 00 34 cntlzw r3,r3 a4: 20 63 00 20 subfic r3,r3,32 a8: 7c 63 07 b4 extsw r3,r3 ac: 4e 80 00 20 blr 00000000000000b0 <.test__fls>: b0: 7c 63 00 74 cntlzd r3,r3 b4: 20 63 00 3f subfic r3,r3,63 b8: 7c 63 07 b4 extsw r3,r3 bc: 4e 80 00 20 blr 00000000000000c0 <.testfls64>: c0: 7c 63 00 74 cntlzd r3,r3 c4: 20 63 00 40 subfic r3,r3,64 c8: 7c 63 07 b4 extsw r3,r3 cc: 4e 80 00 20 blr On PPC64, after the patch: 0000000000000090 <.testfls>: 90: 7c 63 00 34 cntlzw r3,r3 94: 20 63 00 20 subfic r3,r3,32 98: 7c 63 07 b4 extsw r3,r3 9c: 4e 80 00 20 blr 00000000000000a0 <.test__fls>: a0: 7c 63 00 74 cntlzd r3,r3 a4: 20 63 00 3f subfic r3,r3,63 a8: 4e 80 00 20 blr ac: 60 00 00 00 nop 00000000000000b0 <.testfls64>: b0: 7c 63 00 74 cntlzd r3,r3 b4: 20 63 00 40 subfic r3,r3,64 b8: 7c 63 07 b4 extsw r3,r3 bc: 4e 80 00 20 blr Those builtins have been in GCC since at least 3.4.6 (see https://gcc.gnu.org/onlinedocs/gcc-3.4.6/gcc/Other-Builtins.html ) Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/bitops.h | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 71b05685f3a7..af36b404dbe8 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -263,33 +263,15 @@ static __inline__ unsigned long ffz(unsigned long x) */ static __inline__ int fls(unsigned int x) { - int lz; - - asm ("cntlzw %0,%1" : "=r" (lz) : "r" (x)); - return 32 - lz; + return 32 - __builtin_clz(x); } -static __inline__ unsigned long __fls(unsigned long x) -{ - return __ilog2(x); -} +#include <asm-generic/bitops/builtin-__fls.h> -/* - * 64-bit can do this using one cntlzd (count leading zeroes doubleword) - * instruction; for 32-bit we use the generic version, which does two - * 32-bit fls calls.
- */ -#ifdef __powerpc64__ static __inline__ int fls64(__u64 x) { - int lz; - - asm ("cntlzd %0,%1" : "=r" (lz) : "r" (x)); - return 64 - lz; + return 64 - __builtin_clzll(x); } -#else -#include <asm-generic/bitops/fls64.h> -#endif /* __powerpc64__ */ #ifdef CONFIG_PPC64 unsigned int __arch_hweight8(unsigned int w); From 22ef33b368a3992b30b63606062902e9b139cd4e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 21 Apr 2017 13:18:50 +0200 Subject: [PATCH 040/158] powerpc: Replace ffz() by equivalent generic function With the ffz() function as defined in arch/powerpc/include/asm/bitops.h, GCC will not optimise the code in the case of a constant parameter. This patch replaces ffz() by the generic function. The generic ffz(x) expects to never be called with ~x == 0 as written in the comment in include/asm-generic/bitops/ffz.h. The only user of ffz() within arch/powerpc/ is platforms/512x/mpc5121_ads_cpld.c, which checks if x is not 0xff. For non-constant calls, the generated code is doing the same: unsigned long testffz(unsigned long x) { return ffz(x); } On PPC32, before the patch: 00000018 <testffz>: 18: 7c 63 18 f9 not. r3,r3 1c: 40 82 00 0c bne 28 20: 38 60 00 20 li r3,32 24: 4e 80 00 20 blr 28: 7d 23 00 d0 neg r9,r3 2c: 7d 23 18 38 and r3,r9,r3 30: 7c 63 00 34 cntlzw r3,r3 34: 20 63 00 1f subfic r3,r3,31 38: 4e 80 00 20 blr On PPC32, after the patch: 00000018 <testffz>: 18: 39 23 00 01 addi r9,r3,1 1c: 7d 23 18 78 andc r3,r9,r3 20: 7c 63 00 34 cntlzw r3,r3 24: 20 63 00 1f subfic r3,r3,31 28: 4e 80 00 20 blr On PPC64, before the patch: 0000000000000030 <.testffz>: 30: 7c 60 18 f9 not. r0,r3 34: 38 60 00 40 li r3,64 38: 4d 82 00 20 beqlr 3c: 7c 60 00 d0 neg r3,r0 40: 7c 63 00 38 and r3,r3,r0 44: 7c 63 00 74 cntlzd r3,r3 48: 20 63 00 3f subfic r3,r3,63 4c: 7c 63 07 b4 extsw r3,r3 50: 4e 80 00 20 blr On PPC64, after the patch: 0000000000000030 <.testffz>: 30: 38 03 00 01 addi r0,r3,1 34: 7c 03 18 78 andc r3,r0,r3 38: 7c 63 00 74 cntlzd r3,r3 3c: 20 63 00 3f subfic r3,r3,63 40: 4e 80 00 20 blr Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/bitops.h | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index af36b404dbe8..d835cd697d6b 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -233,25 +233,7 @@ int __ilog2_u64(u64 n) } #endif -/* - * Determines the bit position of the least significant 0 bit in the - * specified double word. The returned bit position will be - * zero-based, starting from the right side (63/31 - 0). - */ -static __inline__ unsigned long ffz(unsigned long x) -{ - /* no zero exists anywhere in the 8 byte area. */ - if ((x = ~x) == 0) - return BITS_PER_LONG; - - /* - * Calculate the bit position of the least significant '1' bit in x - * (since x has been changed this will actually be the least significant - * '0' bit in * the original x). Note: (x & -x) gives us a mask that - * is the least significant * (RIGHT-most) 1-bit of the value in x. - */ - return __ilog2(x & -x); -} +#include <asm-generic/bitops/ffz.h> #include <asm-generic/bitops/builtin-__ffs.h> From f782ddf297318d544bf71ee1e8afbac8d55f1878 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 21 Apr 2017 13:18:52 +0200 Subject: [PATCH 041/158] powerpc: Remove __ilog2()s and use generic ones With the __ilog2() function as defined in arch/powerpc/include/asm/bitops.h, GCC will not optimise the code in the case of a constant parameter.
The generic ilog2() function in include/linux/log2.h is written to handle the case of the constant parameter. This patch discards the three __ilog2() functions and defines __ilog2() as ilog2(). For non-constant calls, the generated code is doing the same: int test__ilog2(unsigned long x) { return __ilog2(x); } int test__ilog2_u32(u32 n) { return __ilog2_u32(n); } int test__ilog2_u64(u64 n) { return __ilog2_u64(n); } On PPC32 before the patch: 00000000 <test__ilog2>: 0: 7c 63 00 34 cntlzw r3,r3 4: 20 63 00 1f subfic r3,r3,31 8: 4e 80 00 20 blr 0000000c <test__ilog2_u32>: c: 7c 63 00 34 cntlzw r3,r3 10: 20 63 00 1f subfic r3,r3,31 14: 4e 80 00 20 blr On PPC32 after the patch: 00000000 <test__ilog2>: 0: 7c 63 00 34 cntlzw r3,r3 4: 20 63 00 1f subfic r3,r3,31 8: 4e 80 00 20 blr 0000000c <test__ilog2_u32>: c: 7c 63 00 34 cntlzw r3,r3 10: 20 63 00 1f subfic r3,r3,31 14: 4e 80 00 20 blr On PPC64 before the patch: 0000000000000000 <.test__ilog2>: 0: 7c 63 00 74 cntlzd r3,r3 4: 20 63 00 3f subfic r3,r3,63 8: 7c 63 07 b4 extsw r3,r3 c: 4e 80 00 20 blr 0000000000000010 <.test__ilog2_u32>: 10: 7c 63 00 34 cntlzw r3,r3 14: 20 63 00 1f subfic r3,r3,31 18: 7c 63 07 b4 extsw r3,r3 1c: 4e 80 00 20 blr 0000000000000020 <.test__ilog2_u64>: 20: 7c 63 00 74 cntlzd r3,r3 24: 20 63 00 3f subfic r3,r3,63 28: 7c 63 07 b4 extsw r3,r3 2c: 4e 80 00 20 blr On PPC64 after the patch: 0000000000000000 <.test__ilog2>: 0: 7c 63 00 74 cntlzd r3,r3 4: 20 63 00 3f subfic r3,r3,63 8: 7c 63 07 b4 extsw r3,r3 c: 4e 80 00 20 blr 0000000000000010 <.test__ilog2_u32>: 10: 7c 63 00 34 cntlzw r3,r3 14: 20 63 00 1f subfic r3,r3,31 18: 7c 63 07 b4 extsw r3,r3 1c: 4e 80 00 20 blr 0000000000000020 <.test__ilog2_u64>: 20: 7c 63 00 74 cntlzd r3,r3 24: 20 63 00 3f subfic r3,r3,63 28: 7c 63 07 b4 extsw r3,r3 2c: 4e 80 00 20 blr Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 8 -------- arch/powerpc/include/asm/bitops.h | 27 +--------------------------- 2 files changed, 1 insertion(+), 34 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 0153275e9f75..b50d46d214f6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -109,14 +109,6 @@ config GENERIC_LOCKBREAK default y depends on SMP && PREEMPT -config ARCH_HAS_ILOG2_U32 - bool - default y - -config ARCH_HAS_ILOG2_U64 - bool - default y if 64BIT - config GENERIC_HWEIGHT bool default y diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index d835cd697d6b..b750ffef83c7 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -206,32 +206,7 @@ static __inline__ void __clear_bit_unlock(int nr, volatile unsigned long *addr) * Return the zero-based bit position (LE, not IBM bit numbering) of * the most significant 1-bit in a double word.
*/ -static __inline__ __attribute__((const)) -int __ilog2(unsigned long x) -{ - int lz; - - asm (PPC_CNTLZL "%0,%1" : "=r" (lz) : "r" (x)); - return BITS_PER_LONG - 1 - lz; -} - -static inline __attribute__((const)) -int __ilog2_u32(u32 n) -{ - int bit; - asm ("cntlzw %0,%1" : "=r" (bit) : "r" (n)); - return 31 - bit; -} - -#ifdef __powerpc64__ -static inline __attribute__((const)) -int __ilog2_u64(u64 n) -{ - int bit; - asm ("cntlzd %0,%1" : "=r" (bit) : "r" (n)); - return 63 - bit; -} -#endif +#define __ilog2(x) ilog2(x) #include <asm-generic/bitops/ffz.h> From 98b8cd7f75643e0a442d7a4c1cef2c9d53b7e92b Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Sat, 27 May 2017 17:46:15 +0200 Subject: [PATCH 042/158] powerpc/fadump: Return error when fadump registration fails - log an error message when registration fails and no error code listed in the switch is returned - translate the hv error code to posix error code and return it from fw_register - return the posix error code from fw_register to the process writing to sysfs - return EEXIST on re-registration - return success on deregistration when fadump is not registered - return ENODEV when no memory is reserved for fadump Signed-off-by: Michal Suchanek Tested-by: Hari Bathini [mpe: Use pr_err() to shrink the error print] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/fadump.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 466569e26278..8a5d0e029b93 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -377,9 +377,9 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); -static void register_fw_dump(struct fadump_mem_struct *fdm) +static int register_fw_dump(struct fadump_mem_struct *fdm) { - int rc; + int rc, err; unsigned int wait_time; pr_debug("Registering for firmware-assisted kernel dump...\n"); @@ -396,7 +396,11 @@ static void register_fw_dump(struct fadump_mem_struct *fdm) } while (wait_time); + err = -EIO; switch (rc) { + default: + pr_err("Failed to register. Unknown Error(%d).\n", rc); + break; case -1: printk(KERN_ERR "Failed to register firmware-assisted kernel" " dump. Hardware Error(%d).\n", rc); break; case -3: printk(KERN_ERR "Failed to register firmware-assisted kernel" " dump. Parameter Error(%d).\n", rc); + err = -EINVAL; break; case -9: printk(KERN_ERR "firmware-assisted kernel dump is already " " registered."); fw_dump.dump_registered = 1; + err = -EEXIST; break; case 0: printk(KERN_INFO "firmware-assisted kernel dump registration" " is successful\n"); fw_dump.dump_registered = 1; + err = 0; break; } + return err; } void crash_fadump(struct pt_regs *regs, const char *str) @@ -956,7 +964,7 @@ static unsigned long init_fadump_header(unsigned long addr) return addr; } -static void register_fadump(void) +static int register_fadump(void) { unsigned long addr; void *vaddr; @@ -966,7 +974,7 @@ static void register_fadump(void) * assisted dump. */ if (!fw_dump.reserve_dump_area_size) - return; + return -ENODEV; fadump_setup_crash_memory_ranges(); @@ -979,7 +987,7 @@ static void register_fadump(void) fadump_create_elfcore_headers(vaddr); /* register the future kernel dump with firmware.
*/ - register_fw_dump(&fdm); + return register_fw_dump(&fdm); } static int fadump_unregister_dump(struct fadump_mem_struct *fdm) @@ -1161,7 +1169,6 @@ static ssize_t fadump_register_store(struct kobject *kobj, switch (buf[0]) { case '0': if (fw_dump.dump_registered == 0) { - ret = -EINVAL; goto unlock_out; } /* Un-register Firmware-assisted dump */ @@ -1169,11 +1176,11 @@ static ssize_t fadump_register_store(struct kobject *kobj, break; case '1': if (fw_dump.dump_registered == 1) { - ret = -EINVAL; + ret = -EEXIST; goto unlock_out; } /* Register Firmware-assisted dump */ - register_fadump(); + ret = register_fadump(); break; default: ret = -EINVAL; From 81d9eca502fcc360950ef476124626d97856e139 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Mon, 22 May 2017 15:04:23 +0530 Subject: [PATCH 043/158] powerpc/fadump: Add a warning when 'fadump_reserve_mem=' is used With commit 11550dc0a00b ("powerpc/fadump: reuse crashkernel parameter for fadump memory reservation"), 'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter. Add a warning if 'fadump_reserve_mem=' is still used. Fixes: 11550dc0a00b ("powerpc/fadump: reuse crashkernel parameter for fadump memory reservation") Suggested-by: Prarit Bhargava Signed-off-by: Hari Bathini [mpe: Unsplit long printk strings] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/fadump.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 8a5d0e029b93..1bdbe0b257e0 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -212,6 +212,9 @@ static inline unsigned long fadump_calculate_reserve_size(void) int ret; unsigned long long base, size; + if (fw_dump.reserve_bootvar) + pr_warn("'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter.\n"); + /* * Check if the size is specified through crashkernel= cmdline * option. If yes, then use that but ignore base as fadump @@ -220,8 +223,17 @@ static inline unsigned long fadump_calculate_reserve_size(void) ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &size, &base); if (ret == 0 && size > 0) { + if (fw_dump.reserve_bootvar) + pr_info("Using 'crashkernel=' parameter for memory reservation.\n"); + fw_dump.reserve_bootvar = (unsigned long)size; return fw_dump.reserve_bootvar; + } else if (fw_dump.reserve_bootvar) { + /* + * 'fadump_reserve_mem=' is being used to reserve memory + * for firmware-assisted dump. + */ + return fw_dump.reserve_bootvar; } /* divide by 20 to get 5% of value */ @@ -377,6 +389,19 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); +/* + * Look for fadump_reserve_mem= cmdline option + * TODO: Remove references to 'fadump_reserve_mem=' parameter, + * the sooner 'crashkernel=' parameter is accustomed to. + */ +static int __init early_fadump_reserve_mem(char *p) +{ + if (p) + fw_dump.reserve_bootvar = memparse(p, &p); + return 0; +} +early_param("fadump_reserve_mem", early_fadump_reserve_mem); + static int register_fw_dump(struct fadump_mem_struct *fdm) { int rc, err; From e7467dc6947d7074417aa4cda44b851010fd0795 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Mon, 22 May 2017 15:04:47 +0530 Subject: [PATCH 044/158] powerpc/fadump: Update comment about offset where fadump is reserved With commit f6e6bedb7731 ("powerpc/fadump: Reserve memory at an offset closer to bottom of RAM"), memory for fadump is no longer reserved at the top of RAM. 
But there are still a few places which say so. Change them appropriately. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman --- Documentation/powerpc/firmware-assisted-dump.txt | 4 ++-- arch/powerpc/kernel/fadump.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 9cabaf8a207e..bdd344aa18d9 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -61,8 +61,8 @@ as follows: boot successfully. For syntax of crashkernel= parameter, refer to Documentation/kdump/kdump.txt. If any offset is provided in crashkernel= parameter, it will be ignored - as fadump reserves memory at end of RAM for boot memory - dump preservation in case of a crash. + as fadump uses a predefined offset to reserve memory + for boot memory dump preservation in case of a crash. -- After the low memory (boot memory) area has been saved, the firmware will reset PCI and other hardware state. It will diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 1bdbe0b257e0..1ec6ea67159c 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -217,8 +217,8 @@ static inline unsigned long fadump_calculate_reserve_size(void) /* * Check if the size is specified through crashkernel= cmdline - * option. If yes, then use that but ignore base as fadump - * reserves memory at end of RAM. + * option. If yes, then use that but ignore base as fadump reserves + * memory at a predefined offset. */ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &size, &base); From 48a316e350974739235c234430ec0e129f864a43 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Fri, 2 Jun 2017 13:00:27 +0530 Subject: [PATCH 045/158] powerpc/fadump: Set an upper limit for boot memory size By default, 5% of system RAM is reserved for preserving boot memory. Alternatively, a user can specify the amount of memory to reserve. See Documentation/powerpc/firmware-assisted-dump.txt for details. In addition to the memory reserved for preserving boot memory, some more memory is reserved, to save HPTE region, CPU state data and ELF core headers. Memory Reservation during first kernel looks like below:

  Low memory                                         Top of memory
  0      boot memory size                                        |
  |           |                       |<--Reserved dump area -->|
  V           V                       |   Permanent Reservation V
  +-----------+----------/ /----------+---+----+-----------+----+
  |           |                       |CPU|HPTE|   DUMP    |ELF |
  +-----------+----------/ /----------+---+----+-----------+----+
        |                                            ^
        |                                            |
        \                                            /
         -------------------------------------------
          Boot memory content gets transferred to
          reserved area by firmware at the time of
          crash

This implicitly means that the sum of the sizes of boot memory, CPU state data, HPTE region, DUMP preserving area and ELF core headers can't be greater than the total memory size. But currently, a user is allowed to specify any value as boot memory size. So, the above rule is violated when a boot memory size around 50% of the total available memory is specified. As the kernel is not handling this currently, it may lead to undefined behavior. Fix it by setting an upper limit for boot memory size to 25% of the total available memory. Also, instead of using memblock_end_of_DRAM(), which doesn't take the holes, if any, in the memory layout into account, use memblock_phys_mem_size() to calculate the percentage of total available memory.
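A worked example of the clamp introduced here (illustrative numbers; see the fadump.c hunk below):

	/*
	 * On a machine with 64 GiB of RAM, booted with "crashkernel=32G":
	 *   max_size = memblock_phys_mem_size() / MAX_BOOT_MEM_RATIO
	 *            = 64 GiB / 4 = 16 GiB
	 * 32 GiB exceeds that, so fw_dump.reserve_bootvar is clamped to
	 * 16 GiB and "Adjusted boot memory size to 16384MB" is logged.
	 */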
Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/fadump.h | 3 +++ arch/powerpc/kernel/fadump.c | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 60b91084f33c..a3de219073af 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -43,6 +43,9 @@ #define MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \ + (0x1UL << 26)) +/* The upper limit percentage for user specified boot memory size (25%) */ +#define MAX_BOOT_MEM_RATIO 4 + #define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt) /* Firmware provided dump sections */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 1ec6ea67159c..12837d52e84a 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -223,10 +223,24 @@ static inline unsigned long fadump_calculate_reserve_size(void) ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &size, &base); if (ret == 0 && size > 0) { + unsigned long max_size; + if (fw_dump.reserve_bootvar) pr_info("Using 'crashkernel=' parameter for memory reservation.\n"); fw_dump.reserve_bootvar = (unsigned long)size; + + /* + * Adjust if the boot memory size specified is above + * the upper limit. + */ + max_size = memblock_phys_mem_size() / MAX_BOOT_MEM_RATIO; + if (fw_dump.reserve_bootvar > max_size) { + fw_dump.reserve_bootvar = max_size; + pr_info("Adjusted boot memory size to %luMB\n", + (fw_dump.reserve_bootvar >> 20)); + } + return fw_dump.reserve_bootvar; } else if (fw_dump.reserve_bootvar) { /* @@ -237,7 +251,7 @@ static inline unsigned long fadump_calculate_reserve_size(void) } /* divide by 20 to get 5% of value */ - size = memblock_end_of_DRAM() / 20; + size = memblock_phys_mem_size() / 20; /* round it down in multiples of 256 */ size = size & ~0x0FFFFFFFUL; From f718d426d7e42eec6e5d2932f52a51de23bd3b30 Mon Sep 17 00:00:00 2001 From: Matt Brown Date: Wed, 24 May 2017 09:45:59 +1000 Subject: [PATCH 046/158] powerpc/lib/xor_vmx: Ensure no altivec code executes before enable_kernel_altivec() The xor_vmx.c file is used for the RAID5 xor operations. In these functions altivec is enabled to run the operation and then disabled. The code uses enable_kernel_altivec() around the core of the algorithm; however, the whole file is built with -maltivec, so the compiler is within its rights to generate altivec code anywhere. This has been seen at least once in the wild: 0:mon> di $xor_altivec_2 c0000000000b97d0 3c4c01d9 addis r2,r12,473 c0000000000b97d4 3842db30 addi r2,r2,-9424 c0000000000b97d8 7c0802a6 mflr r0 c0000000000b97dc f8010010 std r0,16(r1) c0000000000b97e0 60000000 nop c0000000000b97e4 7c0802a6 mflr r0 c0000000000b97e8 faa1ffa8 std r21,-88(r1) ... c0000000000b981c f821ff41 stdu r1,-192(r1) c0000000000b9820 7f8101ce stvx v28,r1,r0 <-- POP c0000000000b9824 38000030 li r0,48 c0000000000b9828 7fa101ce stvx v29,r1,r0 ... c0000000000b984c 4bf6a06d bl c0000000000238b8 # enable_kernel_altivec This patch splits the non-altivec code into xor_vmx_glue.c, which calls the altivec functions in xor_vmx.c. By compiling xor_vmx_glue.c without -maltivec we can guarantee that altivec instructions will not be executed outside of the enable/disable block.
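To make the split concrete, a minimal sketch of one wrapper as it could look in xor_vmx_glue.c (an assumed shape, reconstructed from the enable/disable calls moved out of xor_vmx.c in the diff below; the actual file may differ):

	/* Built WITHOUT -maltivec, so the compiler cannot schedule VMX
	 * instructions outside the enable/disable window.
	 */
	void xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
			   unsigned long *v2_in)
	{
		preempt_disable();
		enable_kernel_altivec();
		__xor_altivec_2(bytes, v1_in, v2_in);	/* -maltivec code */
		disable_kernel_altivec();
		preempt_enable();
	}
	EXPORT_SYMBOL(xor_altivec_2);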
Signed-off-by: Matt Brown [mpe: Rework change log and include disassembly] Signed-off-by: Michael Ellerman --- arch/powerpc/lib/Makefile | 2 +- arch/powerpc/lib/xor_vmx.c | 53 ++++++---------------------- arch/powerpc/lib/xor_vmx.h | 20 +++++++++++ arch/powerpc/lib/xor_vmx_glue.c | 62 +++++++++++++++++++++++++++++++++ 4 files changed, 94 insertions(+), 43 deletions(-) create mode 100644 arch/powerpc/lib/xor_vmx.h create mode 100644 arch/powerpc/lib/xor_vmx_glue.c diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 2c56f4636c2b..3c3146ba62da 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -37,7 +37,7 @@ obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o -obj-$(CONFIG_ALTIVEC) += xor_vmx.o +obj-$(CONFIG_ALTIVEC) += xor_vmx.o xor_vmx_glue.o CFLAGS_xor_vmx.o += -maltivec $(call cc-option,-mabi=altivec) obj-$(CONFIG_PPC64) += $(obj64-y) diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c index f9de69a04e88..4df240aa5f81 100644 --- a/arch/powerpc/lib/xor_vmx.c +++ b/arch/powerpc/lib/xor_vmx.c @@ -29,10 +29,7 @@ #define vector __attribute__((vector_size(16))) #endif -#include -#include -#include -#include +#include "xor_vmx.h" typedef vector signed char unative_t; @@ -64,16 +61,13 @@ typedef vector signed char unative_t; V1##_3 = vec_xor(V1##_3, V2##_3); \ } while (0) -void xor_altivec_2(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in) +void __xor_altivec_2(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in) { DEFINE(v1); DEFINE(v2); unsigned long lines = bytes / (sizeof(unative_t)) / 4; - preempt_disable(); - enable_kernel_altivec(); - do { LOAD(v1); LOAD(v2); @@ -83,23 +77,16 @@ void xor_altivec_2(unsigned long bytes, unsigned long *v1_in, v1 += 4; v2 += 4; } while (--lines > 0); - - disable_kernel_altivec(); - preempt_enable(); } -EXPORT_SYMBOL(xor_altivec_2); -void xor_altivec_3(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in) +void __xor_altivec_3(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in) { DEFINE(v1); DEFINE(v2); DEFINE(v3); unsigned long lines = bytes / (sizeof(unative_t)) / 4; - preempt_disable(); - enable_kernel_altivec(); - do { LOAD(v1); LOAD(v2); @@ -112,15 +99,11 @@ void xor_altivec_3(unsigned long bytes, unsigned long *v1_in, v2 += 4; v3 += 4; } while (--lines > 0); - - disable_kernel_altivec(); - preempt_enable(); } -EXPORT_SYMBOL(xor_altivec_3); -void xor_altivec_4(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in, - unsigned long *v4_in) +void __xor_altivec_4(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in) { DEFINE(v1); DEFINE(v2); @@ -128,9 +111,6 @@ void xor_altivec_4(unsigned long bytes, unsigned long *v1_in, DEFINE(v4); unsigned long lines = bytes / (sizeof(unative_t)) / 4; - preempt_disable(); - enable_kernel_altivec(); - do { LOAD(v1); LOAD(v2); @@ -146,15 +126,11 @@ void xor_altivec_4(unsigned long bytes, unsigned long *v1_in, v3 += 4; v4 += 4; } while (--lines > 0); - - disable_kernel_altivec(); - preempt_enable(); } -EXPORT_SYMBOL(xor_altivec_4); -void xor_altivec_5(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in, - unsigned long *v4_in, unsigned long *v5_in) +void __xor_altivec_5(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in, unsigned long 
*v5_in) { DEFINE(v1); DEFINE(v2); @@ -163,9 +139,6 @@ void xor_altivec_5(unsigned long bytes, unsigned long *v1_in, DEFINE(v5); unsigned long lines = bytes / (sizeof(unative_t)) / 4; - preempt_disable(); - enable_kernel_altivec(); - do { LOAD(v1); LOAD(v2); @@ -184,8 +157,4 @@ void xor_altivec_5(unsigned long bytes, unsigned long *v1_in, v4 += 4; v5 += 4; } while (--lines > 0); - - disable_kernel_altivec(); - preempt_enable(); } -EXPORT_SYMBOL(xor_altivec_5); diff --git a/arch/powerpc/lib/xor_vmx.h b/arch/powerpc/lib/xor_vmx.h new file mode 100644 index 000000000000..4746708451ae --- /dev/null +++ b/arch/powerpc/lib/xor_vmx.h @@ -0,0 +1,20 @@ +/* + * Simple interface to link xor_vmx.c and xor_vmx_glue.c + * + * Separating these file ensures that no altivec instructions are run + * outside of the enable/disable altivec block. + */ + +void __xor_altivec_2(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in); + +void __xor_altivec_3(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in); + +void __xor_altivec_4(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in); + +void __xor_altivec_5(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in, unsigned long *v5_in); diff --git a/arch/powerpc/lib/xor_vmx_glue.c b/arch/powerpc/lib/xor_vmx_glue.c new file mode 100644 index 000000000000..6521fe5e8cef --- /dev/null +++ b/arch/powerpc/lib/xor_vmx_glue.c @@ -0,0 +1,62 @@ +/* + * Altivec XOR operations + * + * Copyright 2017 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include "xor_vmx.h" + +void xor_altivec_2(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in) +{ + preempt_disable(); + enable_kernel_altivec(); + __xor_altivec_2(bytes, v1_in, v2_in); + disable_kernel_altivec(); + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_2); + +void xor_altivec_3(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in) +{ + preempt_disable(); + enable_kernel_altivec(); + __xor_altivec_3(bytes, v1_in, v2_in, v3_in); + disable_kernel_altivec(); + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_3); + +void xor_altivec_4(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in) +{ + preempt_disable(); + enable_kernel_altivec(); + __xor_altivec_4(bytes, v1_in, v2_in, v3_in, v4_in); + disable_kernel_altivec(); + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_4); + +void xor_altivec_5(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in, unsigned long *v5_in) +{ + preempt_disable(); + enable_kernel_altivec(); + __xor_altivec_5(bytes, v1_in, v2_in, v3_in, v4_in, v5_in); + disable_kernel_altivec(); + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_5); From b802ab46ba12c617fd55b072f1906627e636947b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 5 Jun 2017 16:49:12 +1000 Subject: [PATCH 047/158] powerpc: Fix some spelling mistakes Collation of some spelling fixes from Colin. 
Attemping -> Attempting intialized -> initialized missmanaged -> mismanaged Signed-off-by: Colin Ian King Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 +- arch/powerpc/sysdev/xive/common.c | 2 +- drivers/tty/hvc/hvcs.c | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 7bc0e91f8715..6afd1efd3633 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -554,7 +554,7 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index) { int rc; - pr_debug("Attemping to remove CPU %s, drc index: %x\n", + pr_debug("Attempting to remove CPU %s, drc index: %x\n", dn->name, drc_index); rc = dlpar_offline_cpu(dn); diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 913825086b8d..afc9484ae747 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1417,7 +1417,7 @@ bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset, /* Get ready for interrupts */ xive_setup_cpu(); - pr_info("Interrupt handling intialized with %s backend\n", + pr_info("Interrupt handling initialized with %s backend\n", xive_ops->name); pr_info("Using priority %d for all interrupts\n", max_prio); diff --git a/drivers/tty/hvc/hvcs.c b/drivers/tty/hvc/hvcs.c index 99bb875178d7..423e28ec27fb 100644 --- a/drivers/tty/hvc/hvcs.c +++ b/drivers/tty/hvc/hvcs.c @@ -1242,8 +1242,7 @@ static void hvcs_close(struct tty_struct *tty, struct file *filp) free_irq(irq, hvcsd); return; } else if (hvcsd->port.count < 0) { - printk(KERN_ERR "HVCS: vty-server@%X open_count: %d" - " is missmanaged.\n", + printk(KERN_ERR "HVCS: vty-server@%X open_count: %d is mismanaged.\n", hvcsd->vdev->unit_address, hvcsd->port.count); } From b27ce776852078df9e14a223b375db6507096963 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 4 Jun 2017 22:06:22 +1000 Subject: [PATCH 048/158] selftests/powerpc: context_switch use private futexes with threads This reduces overhead of mutex locking and increases context switch rate significantly (which helps to measure and profile the context switch path). 
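FUTEX_PRIVATE_FLAG is valid here because in thread mode waiter and waker share one mm, so the kernel can key the futex hash on (mm, uaddr) and skip the shared-key lookup. A self-contained sketch of the call-site change (names are illustrative, not the benchmark's exact code):

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static long futex_wait(unsigned long *uaddr, unsigned long val, int processes)
    {
            int op = FUTEX_WAIT;

            if (!processes)                 /* both sides are threads of one process */
                    op |= FUTEX_PRIVATE_FLAG;

            return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
    }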
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- .../powerpc/benchmarks/context_switch.c | 53 ++++++++++++------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/powerpc/benchmarks/context_switch.c b/tools/testing/selftests/powerpc/benchmarks/context_switch.c index 778f5fbfd784..f4241339edd2 100644 --- a/tools/testing/selftests/powerpc/benchmarks/context_switch.c +++ b/tools/testing/selftests/powerpc/benchmarks/context_switch.c @@ -258,9 +258,14 @@ static unsigned long xchg(unsigned long *p, unsigned long val) return __atomic_exchange_n(p, val, __ATOMIC_SEQ_CST); } +static int processes; + static int mutex_lock(unsigned long *m) { int c; + int flags = FUTEX_WAIT; + if (!processes) + flags |= FUTEX_PRIVATE_FLAG; c = cmpxchg(m, 0, 1); if (!c) @@ -270,7 +275,7 @@ static int mutex_lock(unsigned long *m) c = xchg(m, 2); while (c) { - sys_futex(m, FUTEX_WAIT, 2, NULL, NULL, 0); + sys_futex(m, flags, 2, NULL, NULL, 0); c = xchg(m, 2); } @@ -279,12 +284,16 @@ static int mutex_lock(unsigned long *m) static int mutex_unlock(unsigned long *m) { + int flags = FUTEX_WAKE; + if (!processes) + flags |= FUTEX_PRIVATE_FLAG; + if (*m == 2) *m = 0; else if (xchg(m, 0) == 1) return 0; - sys_futex(m, FUTEX_WAKE, 1, NULL, NULL, 0); + sys_futex(m, flags, 1, NULL, NULL, 0); return 0; } @@ -293,27 +302,33 @@ static unsigned long *m1, *m2; static void futex_setup(int cpu1, int cpu2) { - int shmid; - void *shmaddr; + if (!processes) { + static unsigned long _m1, _m2; + m1 = &_m1; + m2 = &_m2; + } else { + int shmid; + void *shmaddr; - shmid = shmget(IPC_PRIVATE, getpagesize(), SHM_R | SHM_W); - if (shmid < 0) { - perror("shmget"); - exit(1); - } + shmid = shmget(IPC_PRIVATE, getpagesize(), SHM_R | SHM_W); + if (shmid < 0) { + perror("shmget"); + exit(1); + } + + shmaddr = shmat(shmid, NULL, 0); + if (shmaddr == (char *)-1) { + perror("shmat"); + shmctl(shmid, IPC_RMID, NULL); + exit(1); + } - shmaddr = shmat(shmid, NULL, 0); - if (shmaddr == (char *)-1) { - perror("shmat"); shmctl(shmid, IPC_RMID, NULL); - exit(1); + + m1 = shmaddr; + m2 = shmaddr + sizeof(*m1); } - shmctl(shmid, IPC_RMID, NULL); - - m1 = shmaddr; - m2 = shmaddr + sizeof(*m1); - *m1 = 0; *m2 = 0; @@ -352,8 +367,6 @@ static struct actions futex_actions = { .thread2 = futex_thread2, }; -static int processes; - static struct option options[] = { { "test", required_argument, 0, 't' }, { "process", no_argument, &processes, 1 }, From c67ec7010132a4d948686eeab8a23acb9fb62afe Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 2 Jun 2017 18:20:22 -0500 Subject: [PATCH 049/158] MAINTAINERS: Update entry for Freescale SoC drivers Add myself as the maintainer for drivers/fsl/soc/ and fix the scope for device tree bindings. 
Signed-off-by: Li Yang Acked-by: Scott Wood Signed-off-by: Michael Ellerman --- MAINTAINERS | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 053c3bdd1fe5..787f82da7456 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5347,11 +5347,11 @@ S: Maintained F: drivers/net/ethernet/freescale/dpaa FREESCALE SOC DRIVERS -M: Scott Wood +M: Li Yang L: linuxppc-dev@lists.ozlabs.org L: linux-arm-kernel@lists.infradead.org S: Maintained -F: Documentation/devicetree/bindings/powerpc/fsl/ +F: Documentation/devicetree/bindings/soc/fsl/ F: drivers/soc/fsl/ F: include/linux/fsl/ @@ -7685,6 +7685,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/scottwood/linux.git S: Maintained F: arch/powerpc/platforms/83xx/ F: arch/powerpc/platforms/85xx/ +F: Documentation/devicetree/bindings/powerpc/fsl/ LINUX FOR POWERPC PA SEMI PWRFICIENT L: linuxppc-dev@lists.ozlabs.org From 64d09f5ecb237974bc824800e5e5012fd71801c7 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 2 Jun 2017 18:20:23 -0500 Subject: [PATCH 050/158] MAINTAINERS: Update my email address from freescale to nxp Signed-off-by: Li Yang Signed-off-by: Michael Ellerman --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 787f82da7456..25eecfef41b7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5282,7 +5282,7 @@ S: Maintained F: drivers/video/fbdev/fsl-diu-fb.* FREESCALE DMA DRIVER -M: Li Yang +M: Li Yang M: Zhang Wei L: linuxppc-dev@lists.ozlabs.org S: Maintained @@ -5364,14 +5364,14 @@ F: include/soc/fsl/*qe*.h F: include/soc/fsl/*ucc*.h FREESCALE USB PERIPHERAL DRIVERS -M: Li Yang +M: Li Yang L: linux-usb@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org S: Maintained F: drivers/usb/gadget/udc/fsl* FREESCALE QUICC ENGINE UCC ETHERNET DRIVER -M: Li Yang +M: Li Yang L: netdev@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org S: Maintained From c5cee6421cd651446b93c231f0ab8ff2530ba25d Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Thu, 25 May 2017 17:28:32 +1000 Subject: [PATCH 051/158] powerpc/mm/hash: Do a local flush if possible when no batch is active Currently in hpte_need_flush() if there is no batch pending we always do a global TLB flush, which is inefficient if the mm has never run on another thread. Instead do the same check that __flush_tlb_pending() does and check if a local flush is sufficient when batch->active is false. Instead of open-coding it we use mm_is_thread_local(). Signed-off-by: Balbir Singh [mpe: Don't use a local, just inline mm_is_thread_local()] Signed-off-by: Michael Ellerman --- arch/powerpc/mm/tlb_hash64.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 4517aa43a8b1..b5b0fb97b9c0 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -93,12 +93,10 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, /* * Check if we have an active batch on this CPU. If not, just - * flush now and return. For now, we don global invalidates - * in that case, might be worth testing the mm cpu mask though - * and decide to use local invalidates instead... + * flush now and return. 
*/ if (!batch->active) { - flush_hash_page(vpn, rpte, psize, ssize, 0); + flush_hash_page(vpn, rpte, psize, ssize, mm_is_thread_local(mm)); put_cpu_var(ppc64_tlb_batch); return; } From de3b87611dd1f3c00f4e42fe298457260ea781e0 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 2 May 2017 15:17:04 +1000 Subject: [PATCH 052/158] powerpc/mm/book(e)(3s)/64: Add page table accounting Introduce a helper pgtable_gfp_flags() which just returns the current gfp flags and adds __GFP_ACCOUNT to account for page table allocation. The generic helper is added to include/asm/pgalloc.h and has two variants - WARNING ugly bits ahead 1. If the header is included from a module, no check for mm == &init_mm is done, since init_mm is not exported 2. For kernel includes, the check is done and required see (3e79ec7 arch: x86: charge page tables to kmemcg) The fundamental assumption is that no module should be doing pgd/pud/pmd and pte alloc's on behalf of init_mm directly. NOTE: This adds an overhead to pmd/pud/pgd allocations similar to x86. The other alternative was to implement pmd_alloc_kernel/pud_alloc_kernel and pgd_alloc_kernel with their offset variants. For 4k page size, pte_alloc_one no longer calls pte_alloc_one_kernel. Signed-off-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgalloc.h | 3 ++- arch/powerpc/include/asm/book3s/64/pgalloc.h | 16 ++++++++++------ arch/powerpc/include/asm/nohash/64/pgalloc.h | 11 +++++++---- arch/powerpc/include/asm/pgalloc.h | 14 ++++++++++++++ arch/powerpc/mm/pgtable_64.c | 20 ++++++++++++++------ 5 files changed, 47 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index d310546e5d9d..a120e7f8d535 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -31,7 +31,8 @@ extern struct kmem_cache *pgtable_cache[]; static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); + return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), + pgtable_gfp_flags(mm, GFP_KERNEL)); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index cd5e7aa8cc34..20b1485ff1e8 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -53,10 +53,11 @@ extern void __tlb_remove_table(void *_table); static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm) { #ifdef CONFIG_PPC_64K_PAGES - return (pgd_t *)__get_free_page(PGALLOC_GFP); + return (pgd_t *)__get_free_page(pgtable_gfp_flags(mm, PGALLOC_GFP)); #else struct page *page; - page = alloc_pages(PGALLOC_GFP | __GFP_REPEAT, 4); + page = alloc_pages(pgtable_gfp_flags(mm, PGALLOC_GFP | __GFP_REPEAT), + 4); if (!page) return NULL; return (pgd_t *) page_address(page); @@ -76,7 +77,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { if (radix_enabled()) return radix__pgd_alloc(mm); - return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); + return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), + pgtable_gfp_flags(mm, GFP_KERNEL)); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) @@ -93,7 +95,8 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), GFP_KERNEL); + return 
kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), + pgtable_gfp_flags(mm, GFP_KERNEL)); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) @@ -119,7 +122,8 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), GFP_KERNEL); + return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), + pgtable_gfp_flags(mm, GFP_KERNEL)); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) @@ -168,7 +172,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, struct page *page; pte_t *pte; - pte = pte_alloc_one_kernel(mm, address); + pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT); if (!pte) return NULL; page = virt_to_page(pte); diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index 897d2e1c8a9b..9721c7867b9c 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -43,7 +43,8 @@ extern struct kmem_cache *pgtable_cache[]; static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); + return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), + pgtable_gfp_flags(mm, GFP_KERNEL)); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) @@ -57,7 +58,8 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), GFP_KERNEL); + return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), + pgtable_gfp_flags(mm, GFP_KERNEL)); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) @@ -96,7 +98,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, struct page *page; pte_t *pte; - pte = pte_alloc_one_kernel(mm, address); + pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT); if (!pte) return NULL; page = virt_to_page(pte); @@ -189,7 +191,8 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), GFP_KERNEL); + return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), + pgtable_gfp_flags(mm, GFP_KERNEL)); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 0413457ba11d..d795c5d5789c 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -3,6 +3,20 @@ #include +#ifndef MODULE +static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp) +{ + if (unlikely(mm == &init_mm)) + return gfp; + return gfp | __GFP_ACCOUNT; +} +#else /* !MODULE */ +static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp) +{ + return gfp | __GFP_ACCOUNT; +} +#endif /* MODULE */ + #ifdef CONFIG_PPC_BOOK3S #include #else diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index db93cf747a03..8d2d6742a465 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -351,12 +351,20 @@ static pte_t *get_from_cache(struct mm_struct *mm) static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) { void *ret = NULL; - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - if (!page) - return NULL; - if (!kernel && !pgtable_page_ctor(page)) { - 
__free_page(page); - return NULL; + struct page *page; + + if (!kernel) { + page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT); + if (!page) + return NULL; + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } + } else { + page = alloc_page(PGALLOC_GFP); + if (!page) + return NULL; } ret = page_address(page); From abd667be1502f92a9c98404ad3f2e52ea13724f2 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 2 May 2017 15:17:05 +1000 Subject: [PATCH 053/158] powerpc/mm/book(e)(3s)/32: Add page table accounting Add support in pte_alloc_one() and pgd_alloc() by passing __GFP_ACCOUNT in the flags Signed-off-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/pgalloc.h | 3 ++- arch/powerpc/mm/pgtable_32.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index 633139291a48..cc369a70f2bb 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -31,7 +31,8 @@ extern struct kmem_cache *pgtable_cache[]; static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); + return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), + pgtable_gfp_flags(mm, GFP_KERNEL)); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index cf32c5c6fe28..8de9db2b4d96 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -60,7 +60,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *ptepage; - gfp_t flags = GFP_KERNEL | __GFP_ZERO; + gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT; ptepage = alloc_pages(flags, 0); if (!ptepage) From d2485644c7836514e337de456b5a6c8a1119fc8b Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 2 May 2017 15:17:06 +1000 Subject: [PATCH 054/158] powerpc/mm/hugetlb: Add support for page accounting Add __GFP_ACCOUNT to __hugepte_alloc() Signed-off-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a4f33de4008e..94e56b10e4cc 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -77,7 +77,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, num_hugepd = 1; } - new = kmem_cache_zalloc(cachep, GFP_KERNEL); + new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL)); BUG_ON(pshift > HUGEPD_SHIFT_MASK); BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); From 4386c096c2ffa1b3232d701e9d7ff82a1378e1c5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 29 May 2017 17:31:56 +0200 Subject: [PATCH 055/158] powerpc/mm: Rename map_page() to map_kernel_page() on 32-bit These two functions implement the same semantics, so unify their naming so we can share code that calls them. The longer name is more descriptive so use it. 
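One payoff, sketched here after the ioremap-style caller in the 32-bit port (illustrative only; the 64-bit flags type differs slightly): loops like this can now be shared rather than duplicated under #ifdef:

    int i, err = 0;

    for (i = 0; i < size && err == 0; i += PAGE_SIZE)
            err = map_kernel_page(v + i, p + i, flags);  /* same name on 32- and 64-bit */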
Signed-off-by: Christophe Leroy Acked-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgtable.h | 2 ++ arch/powerpc/include/asm/nohash/32/pgtable.h | 2 ++ arch/powerpc/mm/8xx_mmu.c | 2 +- arch/powerpc/mm/dma-noncoherent.c | 2 +- arch/powerpc/mm/mem.c | 4 ++-- arch/powerpc/mm/mmu_decl.h | 1 - arch/powerpc/mm/pgtable_32.c | 8 ++++---- 7 files changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 26ed228d4dc6..7fb755880409 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -297,6 +297,8 @@ static inline void __ptep_set_access_flags(struct mm_struct *mm, extern int get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp); +int map_kernel_page(unsigned long va, phys_addr_t pa, int flags); + /* Generic accessors to PTE bits */ static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_RW);} static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); } diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 5134ade2e850..91314268f04f 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -340,6 +340,8 @@ static inline void __ptep_set_access_flags(struct mm_struct *mm, extern int get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp); +int map_kernel_page(unsigned long va, phys_addr_t pa, int flags); + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_POWERPC_NOHASH_32_PGTABLE_H */ diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index 6c5025e81236..f4c6472f2fc4 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -88,7 +88,7 @@ static void mmu_mapin_immr(void) int offset; for (offset = 0; offset < IMMR_SIZE; offset += PAGE_SIZE) - map_page(v + offset, p + offset, f); + map_kernel_page(v + offset, p + offset, f); } /* Address of instructions to patch */ diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index 2dc74e5c6458..382528475433 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -227,7 +227,7 @@ __dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t do { SetPageReserved(page); - map_page(vaddr, page_to_phys(page), + map_kernel_page(vaddr, page_to_phys(page), pgprot_val(pgprot_noncached(PAGE_KERNEL))); page++; vaddr += PAGE_SIZE; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 9ee536ec0739..04f4c9852791 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -313,11 +313,11 @@ void __init paging_init(void) unsigned long end = __fix_to_virt(FIX_HOLE); for (; v < end; v += PAGE_SIZE) - map_page(v, 0, 0); /* XXX gross */ + map_kernel_page(v, 0, 0); /* XXX gross */ #endif #ifdef CONFIG_HIGHMEM - map_page(PKMAP_BASE, 0, 0); /* XXX gross */ + map_kernel_page(PKMAP_BASE, 0, 0); /* XXX gross */ pkmap_page_table = virt_to_kpte(PKMAP_BASE); kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index f988db655e5b..d46128b22150 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -94,7 +94,6 @@ extern void _tlbia(void); #ifdef CONFIG_PPC32 extern void mapin_ram(void); -extern int map_page(unsigned long va, phys_addr_t pa, int flags); extern void setbat(int index, unsigned 
long virt, phys_addr_t phys, unsigned int size, pgprot_t prot); diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 8de9db2b4d96..a9e4bfc025bc 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -189,7 +189,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags, err = 0; for (i = 0; i < size && err == 0; i += PAGE_SIZE) - err = map_page(v+i, p+i, flags); + err = map_kernel_page(v+i, p+i, flags); if (err) { if (slab_is_available()) vunmap((void *)v); @@ -215,7 +215,7 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); -int map_page(unsigned long va, phys_addr_t pa, int flags) +int map_kernel_page(unsigned long va, phys_addr_t pa, int flags) { pmd_t *pd; pte_t *pg; @@ -255,7 +255,7 @@ void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) ktext = ((char *)v >= _stext && (char *)v < etext) || ((char *)v >= _sinittext && (char *)v < _einittext); f = ktext ? pgprot_val(PAGE_KERNEL_TEXT) : pgprot_val(PAGE_KERNEL); - map_page(v, p, f); + map_kernel_page(v, p, f); #ifdef CONFIG_PPC_STD_MMU_32 if (ktext) hash_preload(&init_mm, v, 0, 0x300); @@ -387,6 +387,6 @@ void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) return; } - map_page(address, phys, pgprot_val(flags)); + map_kernel_page(address, phys, pgprot_val(flags)); fixmaps++; } From 58d876fa7181f2f393190c1d32c056b5a9d34aa2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 5 May 2017 08:34:58 +0300 Subject: [PATCH 056/158] cxl: Unlock on error in probe We should unlock if get_cxl_adapter() fails. Fixes: 594ff7d067ca ("cxl: Support to flash a new image on the adapter from a guest") Signed-off-by: Dan Carpenter Acked-by: Frederic Barrat Signed-off-by: Michael Ellerman --- drivers/misc/cxl/flash.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/misc/cxl/flash.c b/drivers/misc/cxl/flash.c index 7c61c70ba3f6..3aa216bf0939 100644 --- a/drivers/misc/cxl/flash.c +++ b/drivers/misc/cxl/flash.c @@ -401,8 +401,10 @@ static int device_open(struct inode *inode, struct file *file) if (down_interruptible(&sem) != 0) return -EPERM; - if (!(adapter = get_cxl_adapter(adapter_num))) - return -ENODEV; + if (!(adapter = get_cxl_adapter(adapter_num))) { + rc = -ENODEV; + goto err_unlock; + } file->private_data = adapter; continue_token = 0; @@ -446,6 +448,8 @@ err1: free_page((unsigned long) le); err: put_device(&adapter->dev); +err_unlock: + up(&sem); return rc; } From 90df4bfb4d9e00a1ab6885900b808bef2b62a21c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 29 May 2017 16:26:44 +1000 Subject: [PATCH 057/158] powerpc/64s: Machine check handle ifetch from foreign real address for POWER9 The i-side 0111b machine check, which is "Instruction Fetch to foreign address space", was missed by 7b9f71f974 ("powerpc/64s: POWER9 machine check handler"). The POWER9 processor core considers host real addresses with a nonzero value in RA(8:12) as foreign address space, accessible only by the copy and paste instructions. The copy and paste instruction pair can be used to invoke the Nest accelerators via the Virtual Accelerator Switchboard (VAS). It is an error for any regular load/store or ifetch to go to a foreign addresses. When relocation is on, this causes an MMU exception. When relocation is off, a machine check exception. It is possible to trigger this machine check by branching to a foreign address with MSR[IR]=0. 
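For reference, "foreign" is decided purely from the real address bits; a rough sketch of the test the hardware applies (IBM bit numbering, bit 0 being the most significant; this helper is illustrative, not kernel code):

    /* RA(8:12) are bits 51..55 counting from the least significant bit
     * of the 64-bit real address. Nonzero means foreign on POWER9. */
    static inline int real_addr_is_foreign(unsigned long ra)
    {
            return (ra >> 51) & 0x1f;
    }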
Fixes: 7b9f71f974a1 ("powerpc/64s: POWER9 machine check handler") Reported-by: Mahesh Salgaonkar Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mce.h | 15 ++++++++------- arch/powerpc/kernel/mce.c | 1 + arch/powerpc/kernel/mce_power.c | 3 +++ 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 81eff8631434..190d69a7f701 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -90,13 +90,14 @@ enum MCE_UserErrorType { enum MCE_RaErrorType { MCE_RA_ERROR_INDETERMINATE = 0, MCE_RA_ERROR_IFETCH = 1, - MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH = 2, - MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN = 3, - MCE_RA_ERROR_LOAD = 4, - MCE_RA_ERROR_STORE = 5, - MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE = 6, - MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN = 7, - MCE_RA_ERROR_LOAD_STORE_FOREIGN = 8, + MCE_RA_ERROR_IFETCH_FOREIGN = 2, + MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH = 3, + MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN = 4, + MCE_RA_ERROR_LOAD = 5, + MCE_RA_ERROR_STORE = 6, + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE = 7, + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN = 8, + MCE_RA_ERROR_LOAD_STORE_FOREIGN = 9, }; enum MCE_LinkErrorType { diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 5f9eada3519b..92f185875694 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -268,6 +268,7 @@ void machine_check_print_event_info(struct machine_check_event *evt, static const char *mc_ra_types[] = { "Indeterminate", "Instruction fetch (bad)", + "Instruction fetch (foreign)", "Page table walk ifetch (bad)", "Page table walk ifetch (foreign)", "Load (bad)", diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index f913139bb0c2..d24e689e893f 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -236,6 +236,9 @@ static const struct mce_ierror_table mce_p9_ierror_table[] = { { 0x00000000081c0000, 0x0000000000180000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x00000000001c0000, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH_FOREIGN, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0x00000000081c0000, 0x0000000008000000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, From 0edc2ca9ccc9df99f7a94b6407ae2a0ff27d86b2 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 15 Jun 2017 16:20:46 +1000 Subject: [PATCH 058/158] Revert "powerpc: Handle simultaneous interrupts at once" This reverts commit 45cb08f4791ce6a15c54598b4cb73db4b4b8294f. 
For some reason this is causing IRQ problems on Freescale Book3E machines, eg on my p5020ds: irq 25: nobody cared (try booting with the "irqpoll" option) CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.12.0-rc3-gcc-6.3.1-00037-g45cb08f4791c #624 Call Trace: [c0000000fffdbb10] [c00000000049962c] .dump_stack+0xa8/0xe8 (unreliable) [c0000000fffdbba0] [c0000000000babf4] .__report_bad_irq+0x54/0x140 [c0000000fffdbc40] [c0000000000bb11c] .note_interrupt+0x324/0x380 [c0000000fffdbd00] [c0000000000b7110] .handle_irq_event_percpu+0x68/0x88 [c0000000fffdbd90] [c0000000000b718c] .handle_irq_event+0x5c/0xa8 [c0000000fffdbe10] [c0000000000bc01c] .handle_fasteoi_irq+0xe4/0x298 [c0000000fffdbe90] [c0000000000b59c4] .generic_handle_irq+0x50/0x74 [c0000000fffdbf10] [c0000000000075d8] .__do_irq+0x74/0x1f0 [c0000000fffdbf90] [c0000000000189f8] .call_do_irq+0x14/0x24 [c0000000f7173060] [c0000000000077e4] .do_IRQ+0x90/0x120 [c0000000f7173100] [c00000000001d93c] exc_0x500_common+0xfc/0x100 --- interrupt: 501 at .prepare_to_wait_event+0xc/0x14c LR = .fsl_elbc_run_command+0xc8/0x23c [c0000000f71734d0] [c00000000065f418] .nand_reset+0xb8/0x168 [c0000000f7173560] [c00000000065fec4] .nand_scan_ident+0x2b0/0x1638 [c0000000f7173650] [c000000000666cd8] .fsl_elbc_nand_probe+0x34c/0x5f0 ata2: SATA link up 1.5 Gbps (SStatus 113 SControl 300) [c0000000f7173750] [c0000000005a3c60] .platform_drv_probe+0x64/0xb0 [c0000000f71737d0] [c0000000005a12e0] .really_probe+0x290/0x334 [c0000000f7173870] [c0000000005a14a0] .__driver_attach+0x11c/0x120 [c0000000f7173900] [c00000000059e6a0] .bus_for_each_dev+0x98/0xfc [c0000000f71739a0] [c0000000005a0b3c] .driver_attach+0x34/0x4c [c0000000f7173a20] [c0000000005a04b0] .bus_add_driver+0x1ac/0x2e0 [c0000000f7173ac0] [c0000000005a2170] .driver_register+0x94/0x160 [c0000000f7173b40] [c0000000005a3be0] .__platform_driver_register+0x60/0x7c [c0000000f7173bc0] [c000000000d6aab4] .fsl_elbc_nand_driver_init+0x24/0x38 [c0000000f7173c30] [c000000000001934] .do_one_initcall+0x68/0x1b8 [c0000000f7173d00] [c000000000d210f8] .kernel_init_freeable+0x260/0x338 [c0000000f7173db0] [c0000000000021b0] .kernel_init+0x20/0xe70 [c0000000f7173e30] [c0000000000009bc] .ret_from_kernel_thread+0x58/0x9c handlers: [] .fsl_lbc_ctrl_irq Disabling IRQ #25 Ben also had concerns with the implementation being potentially slow on some PICs, so revert it for now. Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/irq.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index ab2ed9afd3c2..5c291df30fe3 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -481,11 +481,7 @@ void __do_irq(struct pt_regs *regs) if (unlikely(!irq)) __this_cpu_inc(irq_stat.spurious_irqs); else - do { - generic_handle_irq(irq); - - irq = ppc_md.get_irq(); - } while (irq); + generic_handle_irq(irq); trace_irq_exit(regs); From 9abcc981de9775659a0f6e4a52a3448ea72e59da Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 6 Jun 2017 15:48:57 +1000 Subject: [PATCH 059/158] powerpc/mm/radix: Only add X for pages overlapping kernel text Currently we map the whole linear mapping with PAGE_KERNEL_X. Instead we should check if the page overlaps the kernel text and only then add PAGE_KERNEL_X. Note that we still use 1G pages if they're available, so this will typically still result in a 1G executable page at KERNELBASE. So this fix is primarily useful for catching stray branches to high linear mapping addresses. 
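The decision is a plain interval-overlap test against the kernel text; schematically (a simplified sketch, the real helper lives in asm/sections.h and may differ in detail):

    pgprot_t prot;

    if (vaddr < (unsigned long)_etext &&
        (unsigned long)_stext < vaddr + mapping_size)
            prot = PAGE_KERNEL_X;   /* this chunk overlaps kernel text */
    else
            prot = PAGE_KERNEL;     /* rest of the linear map is non-executable */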
Without this patch, we can execute at 1G in xmon using: 0:mon> m c000000040000000 c000000040000000 00 l c000000040000000 00000000 01006038 c000000040000004 00000000 2000804e c000000040000008 00000000 x 0:mon> di c000000040000000 c000000040000000 38600001 li r3,1 c000000040000004 4e800020 blr 0:mon> p c000000040000000 return value is 0x1 After we get a 400 as expected: 0:mon> p c000000040000000 *** 400 exception occurred Fixes: 2bfd65e45e87 ("powerpc/mm/radix: Add radix callbacks for early init routines") Cc: stable@vger.kernel.org # v4.7+ Signed-off-by: Michael Ellerman Reviewed-by: Aneesh Kumar K.V Acked-by: Balbir Singh --- arch/powerpc/mm/pgtable-radix.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index c28165d8970b..6c062f92b9e4 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -121,7 +122,8 @@ static inline void __meminit print_mapping(unsigned long start, static int __meminit create_physical_mapping(unsigned long start, unsigned long end) { - unsigned long addr, mapping_size = 0; + unsigned long vaddr, addr, mapping_size = 0; + pgprot_t prot; start = _ALIGN_UP(start, PAGE_SIZE); for (addr = start; addr < end; addr += mapping_size) { @@ -145,8 +147,14 @@ static int __meminit create_physical_mapping(unsigned long start, start = addr; } - rc = radix__map_kernel_page((unsigned long)__va(addr), addr, - PAGE_KERNEL_X, mapping_size); + vaddr = (unsigned long)__va(addr); + + if (overlaps_kernel_text(vaddr, vaddr + mapping_size)) + prot = PAGE_KERNEL_X; + else + prot = PAGE_KERNEL; + + rc = radix__map_kernel_page(vaddr, addr, prot, mapping_size); if (rc) return rc; } From acd7d8cef01537062e318143d700357d5a92bd6b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 9 Jun 2017 01:35:04 +1000 Subject: [PATCH 060/158] powerpc/64s: Optimize hypercall/syscall entry After bc3551257a ("powerpc/64: Allow for relocation-on interrupts from guest to host"), a getppid() system call goes from 307 cycles to 358 cycles (+17%) on POWER8. This is due significantly to the scratch SPR used by the hypercall check. It turns out there are a some volatile registers common to both system call and hypercall (in particular, r12, cr0, ctr), which can be used to avoid the SPR and some other overheads. This brings getppid to 320 cycles (+4%). Testing hcall entry performance by running "sc 1" in guest userspace before this patch is 854 cycles, afterwards is 826. Also a small win there. POWER9 syscall is improved by about the same amount, hcall not tested. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 134 +++++++++++++++++++-------- 1 file changed, 97 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index ae418b85c17c..2f700a15bfa3 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -821,46 +821,80 @@ EXC_VIRT(trap_0b, 0x4b00, 0x100, 0xb00) TRAMP_KVM(PACA_EXGEN, 0xb00) EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) +/* + * system call / hypercall (0xc00, 0x4c00) + * + * The system call exception is invoked with "sc 0" and does not alter HV bit. + * There is support for kernel code to invoke system calls but there are no + * in-tree users. + * + * The hypercall is invoked with "sc 1" and sets HV=1. 
+ * + * In HPT, sc 1 always goes to 0xc00 real mode. In RADIX, sc 1 can go to + * 0x4c00 virtual mode. + * + * Call convention: + * + * syscall register convention is in Documentation/powerpc/syscall64-abi.txt + * + * For hypercalls, the register convention is as follows: + * r0 volatile + * r1-2 nonvolatile + * r3 volatile parameter and return value for status + * r4-r10 volatile input and output value + * r11 volatile hypercall number and output value + * r12 volatile + * r13-r31 nonvolatile + * LR nonvolatile + * CTR volatile + * XER volatile + * CR0-1 CR5-7 volatile + * CR2-4 nonvolatile + * Other registers nonvolatile + * + * The intersection of volatile registers that don't contain possible + * inputs is: r12, cr0, xer, ctr. We may use these as scratch regs + * upon entry without saving. + */ #ifdef CONFIG_KVM_BOOK3S_64_HANDLER - /* - * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems - * that support it) before changing to HMT_MEDIUM. That allows the KVM - * code to save that value into the guest state (it is the guest's PPR - * value). Otherwise just change to HMT_MEDIUM as userspace has - * already saved the PPR. - */ + /* + * There is a little bit of juggling to get syscall and hcall + * working well. Save r10 in ctr to be restored in case it is a + * hcall. + * + * Userspace syscalls have already saved the PPR, hcalls must save + * it before setting HMT_MEDIUM. + */ #define SYSCALL_KVMTEST \ - SET_SCRATCH0(r13); \ + mr r12,r13; \ GET_PACA(r13); \ - std r9,PACA_EXGEN+EX_R9(r13); \ - OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); \ + mtctr r10; \ + KVMTEST_PR(0xc00); /* uses r10, branch to do_kvm_0xc00_system_call */ \ HMT_MEDIUM; \ - std r10,PACA_EXGEN+EX_R10(r13); \ - OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR); \ - mfcr r9; \ - KVMTEST_PR(0xc00); \ - GET_SCRATCH0(r13) + mr r9,r12; \ #else #define SYSCALL_KVMTEST \ - HMT_MEDIUM + HMT_MEDIUM; \ + mr r9,r13; \ + GET_PACA(r13); #endif #define LOAD_SYSCALL_HANDLER(reg) \ __LOAD_HANDLER(reg, system_call_common) -/* Syscall routine is used twice, in reloc-off and reloc-on paths */ -#define SYSCALL_PSERIES_1 \ +#define SYSCALL_FASTENDIAN_TEST \ BEGIN_FTR_SECTION \ cmpdi r0,0x1ebe ; \ beq- 1f ; \ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ - mr r9,r13 ; \ - GET_PACA(r13) ; \ - mfspr r11,SPRN_SRR0 ; \ -0: -#define SYSCALL_PSERIES_2_RFID \ +/* + * After SYSCALL_KVMTEST, we reach here with PACA in r13, r13 in r9, + * and HMT_MEDIUM. + */ +#define SYSCALL_REAL \ + mfspr r11,SPRN_SRR0 ; \ mfspr r12,SPRN_SRR1 ; \ LOAD_SYSCALL_HANDLER(r10) ; \ mtspr SPRN_SRR0,r10 ; \ @@ -869,11 +903,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ rfid ; \ b . ; /* prevent speculative execution */ -#define SYSCALL_PSERIES_3 \ +#define SYSCALL_FASTENDIAN \ /* Fast LE/BE switch system call */ \ 1: mfspr r12,SPRN_SRR1 ; \ xori r12,r12,MSR_LE ; \ mtspr SPRN_SRR1,r12 ; \ + mr r13,r9 ; \ rfid ; /* return to userspace */ \ b . ; /* prevent speculative execution */ @@ -882,16 +917,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ * We can't branch directly so we do it via the CTR which * is volatile across system calls. 
*/ -#define SYSCALL_PSERIES_2_DIRECT \ - LOAD_SYSCALL_HANDLER(r12) ; \ - mtctr r12 ; \ +#define SYSCALL_VIRT \ + LOAD_SYSCALL_HANDLER(r10) ; \ + mtctr r10 ; \ + mfspr r11,SPRN_SRR0 ; \ mfspr r12,SPRN_SRR1 ; \ li r10,MSR_RI ; \ mtmsrd r10,1 ; \ bctr ; #else /* We can branch directly */ -#define SYSCALL_PSERIES_2_DIRECT \ +#define SYSCALL_VIRT \ + mfspr r11,SPRN_SRR0 ; \ mfspr r12,SPRN_SRR1 ; \ li r10,MSR_RI ; \ mtmsrd r10,1 ; /* Set RI (EE=0) */ \ @@ -899,20 +936,43 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ #endif EXC_REAL_BEGIN(system_call, 0xc00, 0x100) - SYSCALL_KVMTEST - SYSCALL_PSERIES_1 - SYSCALL_PSERIES_2_RFID - SYSCALL_PSERIES_3 + SYSCALL_KVMTEST /* loads PACA into r13, and saves r13 to r9 */ + SYSCALL_FASTENDIAN_TEST + SYSCALL_REAL + SYSCALL_FASTENDIAN EXC_REAL_END(system_call, 0xc00, 0x100) EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100) - SYSCALL_KVMTEST - SYSCALL_PSERIES_1 - SYSCALL_PSERIES_2_DIRECT - SYSCALL_PSERIES_3 + SYSCALL_KVMTEST /* loads PACA into r13, and saves r13 to r9 */ + SYSCALL_FASTENDIAN_TEST + SYSCALL_VIRT + SYSCALL_FASTENDIAN EXC_VIRT_END(system_call, 0x4c00, 0x100) -TRAMP_KVM(PACA_EXGEN, 0xc00) +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER + /* + * This is a hcall, so register convention is as above, with these + * differences: + * r13 = PACA + * r12 = orig r13 + * ctr = orig r10 + */ +TRAMP_KVM_BEGIN(do_kvm_0xc00) + /* + * Save the PPR (on systems that support it) before changing to + * HMT_MEDIUM. That allows the KVM code to save that value into the + * guest state (it is the guest's PPR value). + */ + OPT_GET_SPR(r0, SPRN_PPR, CPU_FTR_HAS_PPR) + HMT_MEDIUM + OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r0, CPU_FTR_HAS_PPR) + mfctr r10 + SET_SCRATCH0(r12) + std r9,PACA_EXGEN+EX_R9(r13) + mfcr r9 + std r10,PACA_EXGEN+EX_R10(r13) + KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00) +#endif EXC_REAL(single_step, 0xd00, 0x100) From bc4f65e4cf9d6cc43e0e9ba0b8648cf9201cd55f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 9 Jun 2017 01:35:05 +1000 Subject: [PATCH 061/158] powerpc/64: Avoid restore_math call if possible in syscall exit The syscall exit code that branches to restore_math is quite heavy on Book3S, consisting of 2 mtmsr instructions. Threads that don't use both FP and vector can get caught here if the kernel ever uses FP or vector. Lazy-FP/vec context switching also trips this case. So check for lazy FP and vector before switching RI for restore_math. Move most of this case out of line. For threads that do want to restore math registers, the MSR switches are still suboptimal. Future direction may be to use a soft-RI bit to avoid MSR switches in kernel (similar to soft-EE), but for now at least the no-restore POWER9 context switch rate increases by about 5% due to sched_yield(2) return performance. I haven't constructed a test to measure the syscall cost. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 62 +++++++++++++++++++++++----------- arch/powerpc/kernel/process.c | 4 +++ 2 files changed, 47 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index bfbad08a1207..6f70ea821a07 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -210,27 +210,17 @@ system_call: /* label this so stack traces look sane */ andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) bne- syscall_exit_work - andi. 
r0,r8,MSR_FP - beq 2f + /* If MSR_FP and MSR_VEC are set in user msr, then no need to restore */ + li r7,MSR_FP #ifdef CONFIG_ALTIVEC - andis. r0,r8,MSR_VEC@h - bne 3f + oris r7,r7,MSR_VEC@h #endif -2: addi r3,r1,STACK_FRAME_OVERHEAD -#ifdef CONFIG_PPC_BOOK3S - li r10,MSR_RI - mtmsrd r10,1 /* Restore RI */ -#endif - bl restore_math -#ifdef CONFIG_PPC_BOOK3S - li r11,0 - mtmsrd r11,1 -#endif - ld r8,_MSR(r1) - ld r3,RESULT(r1) - li r11,-MAX_ERRNO + and r0,r8,r7 + cmpd r0,r7 + bne syscall_restore_math +.Lsyscall_restore_math_cont: -3: cmpld r3,r11 + cmpld r3,r11 ld r5,_CCR(r1) bge- syscall_error .Lsyscall_error_cont: @@ -263,7 +253,41 @@ syscall_error: neg r3,r3 std r5,_CCR(r1) b .Lsyscall_error_cont - + +syscall_restore_math: + /* + * Some initial tests from restore_math to avoid the heavyweight + * C code entry and MSR manipulations. + */ + LOAD_REG_IMMEDIATE(r0, MSR_TS_MASK) + and. r0,r0,r8 + bne 1f + + ld r7,PACACURRENT(r13) + lbz r0,THREAD+THREAD_LOAD_FP(r7) +#ifdef CONFIG_ALTIVEC + lbz r6,THREAD+THREAD_LOAD_VEC(r7) + add r0,r0,r6 +#endif + cmpdi r0,0 + beq .Lsyscall_restore_math_cont + +1: addi r3,r1,STACK_FRAME_OVERHEAD +#ifdef CONFIG_PPC_BOOK3S + li r10,MSR_RI + mtmsrd r10,1 /* Restore RI */ +#endif + bl restore_math +#ifdef CONFIG_PPC_BOOK3S + li r11,0 + mtmsrd r11,1 +#endif + /* Restore volatiles, reload MSR from updated one */ + ld r8,_MSR(r1) + ld r3,RESULT(r1) + li r11,-MAX_ERRNO + b .Lsyscall_restore_math_cont + /* Traced system call support */ syscall_dotrace: bl save_nvgprs diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index baae104b16c7..5cbb8b1faf7e 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -511,6 +511,10 @@ void restore_math(struct pt_regs *regs) { unsigned long msr; + /* + * Syscall exit makes a similar initial check before branching + * to restore_math. Keep them in synch. + */ if (!msr_tm_active(regs->msr) && !current->thread.load_fp && !loadvec(current->thread)) return; From e4c0fc5f72bca11432297168338aef46c12793a4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 9 Jun 2017 01:36:06 +1000 Subject: [PATCH 062/158] powerpc/64s: Leave interrupts hard enabled in context switch for radix Commit 4387e9ff25 ("[POWERPC] Fix PMU + soft interrupt disable bug") hard disabled interrupts over the low level context switch, because the SLB management can't cope with a PMU interrupt accesing the stack in that window. Radix based kernel mapping does not use the SLB so it does not require interrupts hard disabled here. This is worth 1-2% in context switch performance on POWER9. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 8 ++++++++ arch/powerpc/kernel/process.c | 14 ++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 6f70ea821a07..91f9fdc2d027 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -607,6 +607,14 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) top of the kernel stack. */ addi r7,r7,THREAD_SIZE-SWITCH_FRAME_SIZE + /* + * PMU interrupts in radix may come in here. They will use r1, not + * PACAKSAVE, so this stack switch will not cause a problem. They + * will store to the process stack, which may then be migrated to + * another CPU. However the rq lock release on this CPU paired with + * the rq lock acquire on the new CPU before the stack becomes + * active on the new CPU, will order those stores. 
+ */ mr r1,r8 /* start using new stack pointer */ std r7,PACAKSAVE(r13) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 5cbb8b1faf7e..45faa9a32a01 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1199,12 +1199,14 @@ struct task_struct *__switch_to(struct task_struct *prev, __switch_to_tm(prev, new); - /* - * We can't take a PMU exception inside _switch() since there is a - * window where the kernel stack SLB and the kernel stack are out - * of sync. Hard disable here. - */ - hard_irq_disable(); + if (!radix_enabled()) { + /* + * We can't take a PMU exception inside _switch() since there + * is a window where the kernel stack SLB and the kernel stack + * are out of sync. Hard disable here. + */ + hard_irq_disable(); + } /* * Call restore_sprs() before calling _switch(). If we move it after From 837e72f78a72ef43a0c5e179f3addadb2a225f80 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 9 Jun 2017 01:36:07 +1000 Subject: [PATCH 063/158] powerpc/64: Drop reservation-clearing ldarx in context switch There is no need to explicitly break the reservation in _switch, because we are guaranteed that the context switch path will include a larx/stcx. Comment the guarantee and remove the reservation clear from _switch. This is worth 1-2% in context switch performance. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 91f9fdc2d027..273a35926534 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -521,15 +521,10 @@ _GLOBAL(_switch) #endif /* CONFIG_SMP */ /* - * If we optimise away the clear of the reservation in system - * calls because we know the CPU tracks the address of the - * reservation, then we need to clear it here to cover the - * case that the kernel context switch path has no larx - * instructions. + * The kernel context switch path must contain a spin_lock, + * which contains larx/stcx, which will clear any reservation + * of the task being switched. */ -BEGIN_FTR_SECTION - ldarx r6,0,r1 -END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS) BEGIN_FTR_SECTION /* From 9145effd626d155484f73db24ab3e142ecda31db Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 9 Jun 2017 01:36:08 +1000 Subject: [PATCH 064/158] powerpc/64: Drop explicit hwsync in context switch The sync (aka. hwsync, aka. heavyweight sync) in the context switch code to prevent MMIO access being reordered from the point of view of a single process if it gets migrated to a different CPU is not required because there is an hwsync performed earlier in the context switch path. Comment this so it's clear enough if anything changes on the scheduler or the powerpc sides. Remove the hwsync from _switch. This improves context switch performance by 2-3% on POWER8. Signed-off-by: Nicholas Piggin Acked-by: Peter Zijlstra (Intel) Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/barrier.h | 5 +++++ arch/powerpc/kernel/entry_64.S | 23 +++++++++++++++++------ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index c0deafc212b8..25d42bd3f114 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -74,6 +74,11 @@ do { \ ___p1; \ }) +/* + * This must resolve to hwsync on SMP for the context switch path. 
+ * See _switch, and core scheduler context switch memory ordering + * comments. + */ #define smp_mb__before_spinlock() smp_mb() #include diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 273a35926534..fb143859cc68 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -512,13 +512,24 @@ _GLOBAL(_switch) std r23,_CCR(r1) std r1,KSP(r3) /* Set old stack pointer */ -#ifdef CONFIG_SMP - /* We need a sync somewhere here to make sure that if the - * previous task gets rescheduled on another CPU, it sees all - * stores it has performed on this one. + /* + * On SMP kernels, care must be taken because a task may be + * scheduled off CPUx and on to CPUy. Memory ordering must be + * considered. + * + * Cacheable stores on CPUx will be visible when the task is + * scheduled on CPUy by virtue of the core scheduler barriers + * (see "Notes on Program-Order guarantees on SMP systems." in + * kernel/sched/core.c). + * + * Uncacheable stores in the case of involuntary preemption must + * be taken care of. The smp_mb__before_spin_lock() in __schedule() + * is implemented as hwsync on powerpc, which orders MMIO too. So + * long as there is an hwsync in the context switch path, it will + * be executed on the source CPU after the task has performed + * all MMIO ops on that CPU, and on the destination CPU before the + * task performs any MMIO ops there. */ - sync -#endif /* CONFIG_SMP */ /* * The kernel context switch path must contain a spin_lock, From 07d2a628bc0008f90754ac7982289f6cb0f46cf8 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 9 Jun 2017 01:36:09 +1000 Subject: [PATCH 065/158] powerpc/64s: Avoid cpabort in context switch when possible The ISA v3.0B copy-paste facility only requires cpabort when switching to a process that has foreign real addresses mapped (direct access to accelerators), to clear a potential copy buffer filled by a previous thread. There is no accelerator driver implemented yet, so cpabort can be removed. It can be be re-added when a driver is implemented. POWER9 DD1 requires the copy buffer to always be cleared on context switch, but if accelerators are not in use, then an unpaired copy from a dummy region is sufficient to clear data out of the copy buffer. This increases context switch performance by about 5% on POWER9. 
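For context, the ISA v3.0B "copy" instruction loads a 128-byte block into a per-thread copy buffer and a later "paste." stores that buffer; only the load half is issued here. A rough sketch of the clearing trick (mirroring the hunk below; dummy_copy_buffer is zero-initialised and cache-line aligned):

    /* Unpaired copy: refill this thread's copy buffer from a harmless
     * dummy line, so a paste in the next context cannot observe data
     * left behind by the previous task. Much cheaper than cp_abort. */
    asm volatile(PPC_COPY(%0, %1)
                 : : "r"(dummy_copy_buffer), "r"(0));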
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/ppc-opcode.h | 8 ++++---- arch/powerpc/kernel/entry_64.S | 9 --------- arch/powerpc/kernel/process.c | 27 ++++++++++++++++++++++++++- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 3a8d278e7421..3b6bbf5a8683 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -189,8 +189,7 @@ /* sorted alphabetically */ #define PPC_INST_BHRBE 0x7c00025c #define PPC_INST_CLRBHRB 0x7c00035c -#define PPC_INST_COPY 0x7c00060c -#define PPC_INST_COPY_FIRST 0x7c20060c +#define PPC_INST_COPY 0x7c20060c #define PPC_INST_CP_ABORT 0x7c00068c #define PPC_INST_DCBA 0x7c0005ec #define PPC_INST_DCBA_MASK 0xfc0007fe @@ -223,8 +222,7 @@ #define PPC_INST_MSGSNDP 0x7c00011c #define PPC_INST_MTTMR 0x7c0003dc #define PPC_INST_NOP 0x60000000 -#define PPC_INST_PASTE 0x7c00070c -#define PPC_INST_PASTE_LAST 0x7c20070d +#define PPC_INST_PASTE 0x7c20070d #define PPC_INST_POPCNTB 0x7c0000f4 #define PPC_INST_POPCNTB_MASK 0xfc0007fe #define PPC_INST_POPCNTD 0x7c0003f4 @@ -392,6 +390,8 @@ /* Deal with instructions that older assemblers aren't aware of */ #define PPC_CP_ABORT stringify_in_c(.long PPC_INST_CP_ABORT) +#define PPC_COPY(a, b) stringify_in_c(.long PPC_INST_COPY | \ + ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_DCBAL(a, b) stringify_in_c(.long PPC_INST_DCBAL | \ __PPC_RA(a) | __PPC_RB(b)) #define PPC_DCBZL(a, b) stringify_in_c(.long PPC_INST_DCBZL | \ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index fb143859cc68..da9486e2fd89 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -536,15 +536,6 @@ _GLOBAL(_switch) * which contains larx/stcx, which will clear any reservation * of the task being switched. */ - -BEGIN_FTR_SECTION -/* - * A cp_abort (copy paste abort) here ensures that when context switching, a - * copy from one process can't leak into the paste of another. - */ - PPC_CP_ABORT -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - #ifdef CONFIG_PPC_BOOK3S /* Cancel all explict user streams as they will have no use after context * switch and will stop the HW from creating streams itself diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 45faa9a32a01..6273b5d5baec 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1137,6 +1137,11 @@ static inline void restore_sprs(struct thread_struct *old_thread, #endif } +#ifdef CONFIG_PPC_BOOK3S_64 +#define CP_SIZE 128 +static const u8 dummy_copy_buffer[CP_SIZE] __attribute__((aligned(CP_SIZE))); +#endif + struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *new) { @@ -1226,8 +1231,28 @@ struct task_struct *__switch_to(struct task_struct *prev, batch->active = 1; } - if (current_thread_info()->task->thread.regs) + if (current_thread_info()->task->thread.regs) { restore_math(current_thread_info()->task->thread.regs); + + /* + * The copy-paste buffer can only store into foreign real + * addresses, so unprivileged processes can not see the + * data or use it in any way unless they have foreign real + * mappings. We don't have a VAS driver that allocates those + * yet, so no cpabort is required. + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + /* + * DD1 allows paste into normal system memory, so we + * do an unpaired copy here to clear the buffer and + * prevent a covert channel being set up. 
+ * + * cpabort is not used because it is quite expensive. + */ + asm volatile(PPC_COPY(%0, %1) + : : "r"(dummy_copy_buffer), "r"(0)); + } + } #endif /* CONFIG_PPC_STD_MMU_64 */ return last; From 42bed042556261b56aa92e05f4f388f0f7ac1210 Mon Sep 17 00:00:00 2001 From: Murilo Opsfelder Araujo Date: Mon, 29 May 2017 10:26:28 -0300 Subject: [PATCH 066/158] drivers/watchdog/Kconfig: Update CONFIG_WATCHDOG_RTAS dependencies drivers/watchdog/wdrtas.c uses symbols defined in arch/powerpc/kernel/rtas.c, which are exported iff CONFIG_PPC_RTAS is selected. Building wdrtas.c without setting CONFIG_PPC_RTAS throws the following errors: ERROR: ".rtas_token" [drivers/watchdog/wdrtas.ko] undefined! ERROR: "rtas_data_buf" [drivers/watchdog/wdrtas.ko] undefined! ERROR: "rtas_data_buf_lock" [drivers/watchdog/wdrtas.ko] undefined! ERROR: ".rtas_get_sensor" [drivers/watchdog/wdrtas.ko] undefined! ERROR: ".rtas_call" [drivers/watchdog/wdrtas.ko] undefined! This was identified during a randconfig build where CONFIG_WATCHDOG_RTAS=m and CONFIG_PPC_RTAS was not set. Logs are here: http://kisskb.ellerman.id.au/kisskb/buildresult/12982152/ This patch fixes the issue by updating CONFIG_WATCHDOG_RTAS to depend on just CONFIG_PPC_RTAS, removing COMPILE_TEST entirely. Signed-off-by: Murilo Opsfelder Araujo Reviewed-by: Guenter Roeck Signed-off-by: Michael Ellerman --- drivers/watchdog/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 8b9049dac094..e6e31a16f68f 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -1688,7 +1688,7 @@ config MEN_A21_WDT config WATCHDOG_RTAS tristate "RTAS watchdog" - depends on PPC_RTAS || (PPC64 && COMPILE_TEST) + depends on PPC_RTAS help This driver adds watchdog support for the RTAS watchdog. From 2201f994a5742c03e660623c385fd6897dd1fa2f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Jun 2017 23:05:45 +1000 Subject: [PATCH 067/158] powerpc/64s/idle: Move soft interrupt mask logic into C code This simplifies the asm and fixes irq-off tracing over sleep instructions. Also move powersave_nap check for POWER8 into C code, and move PSSCR register value calculation for POWER9 into C. Reviewed-by: Gautham R. 
Shenoy Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/hw_irq.h | 3 + arch/powerpc/include/asm/machdep.h | 1 + arch/powerpc/include/asm/processor.h | 10 +-- arch/powerpc/kernel/idle_book3s.S | 82 +++++------------------- arch/powerpc/kernel/irq.c | 33 +++++++++- arch/powerpc/platforms/powernv/idle.c | 71 ++++++++++++++++++-- arch/powerpc/platforms/powernv/smp.c | 2 - arch/powerpc/platforms/powernv/subcore.c | 3 +- drivers/cpuidle/cpuidle-powernv.c | 12 ++-- 9 files changed, 128 insertions(+), 89 deletions(-) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index eba60416536e..f06112cf8734 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -129,6 +129,9 @@ static inline bool arch_irq_disabled_regs(struct pt_regs *regs) } extern bool prep_irq_for_idle(void); +extern bool prep_irq_for_idle_irqsoff(void); + +#define fini_irq_for_idle_irqsoff() trace_hardirqs_off(); extern void force_external_irq_replay(void); diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index f90b22c722e1..cd2fc1cc1cc7 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -226,6 +226,7 @@ struct machdep_calls { extern void e500_idle(void); extern void power4_idle(void); extern void power7_idle(void); +extern void power9_idle(void); extern void ppc6xx_idle(void); extern void book3e_idle(void); diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index a2123f291ab0..c49165a7439c 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -481,11 +481,11 @@ extern unsigned long cpuidle_disable; enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; extern int powersave_nap; /* set if nap mode can be used in idle loop */ -extern unsigned long power7_nap(int check_irq); -extern unsigned long power7_sleep(void); -extern unsigned long power7_winkle(void); -extern unsigned long power9_idle_stop(unsigned long stop_psscr_val, - unsigned long stop_psscr_mask); +extern unsigned long power7_idle_insn(unsigned long type); /* PNV_THREAD_NAP/etc*/ +extern void power7_idle_type(unsigned long type); +extern unsigned long power9_idle_stop(unsigned long psscr_val); +extern void power9_idle_type(unsigned long stop_psscr_val, + unsigned long stop_psscr_mask); extern void flush_instruction_cache(void); extern void hard_reset_now(void); diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 98a6d07ecb5c..35cf5bb7daed 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -109,13 +109,9 @@ core_idle_lock_held: /* * Pass requested state in r3: * r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8 - * - Requested STOP state in POWER9 + * - Requested PSSCR value in POWER9 * - * To check IRQ_HAPPENED in r4 - * 0 - don't check - * 1 - check - * - * Address to 'rfid' to in r5 + * Address of idle handler to 'rfid' to in r4 */ pnv_powersave_common: /* Use r3 to pass state nap/sleep/winkle */ @@ -131,30 +127,7 @@ pnv_powersave_common: std r0,_LINK(r1) std r0,_NIP(r1) - /* Hard disable interrupts */ - mfmsr r9 - rldicl r9,r9,48,1 - rotldi r9,r9,16 - mtmsrd r9,1 /* hard-disable interrupts */ - - /* Check if something happened while soft-disabled */ - lbz r0,PACAIRQHAPPENED(r13) - andi. 
r0,r0,~PACA_IRQ_HARD_DIS@l - beq 1f - cmpwi cr0,r4,0 - beq 1f - addi r1,r1,INT_FRAME_SIZE - ld r0,16(r1) - li r3,0 /* Return 0 (no nap) */ - mtlr r0 - blr - -1: /* We mark irqs hard disabled as this is the state we'll - * be in when returning and we need to tell arch_local_irq_restore() - * about it - */ - li r0,PACA_IRQ_HARD_DIS - stb r0,PACAIRQHAPPENED(r13) + mfmsr r9 /* We haven't lost state ... yet */ li r0,0 @@ -163,8 +136,8 @@ pnv_powersave_common: /* Continue saving state */ SAVE_GPR(2, r1) SAVE_NVGPRS(r1) - mfcr r4 - std r4,_CCR(r1) + mfcr r5 + std r5,_CCR(r1) std r9,_MSR(r1) std r1,PACAR1(r13) @@ -178,7 +151,7 @@ pnv_powersave_common: li r6, MSR_RI andc r6, r9, r6 mtmsrd r6, 1 /* clear RI before setting SRR0/1 */ - mtspr SPRN_SRR0, r5 + mtspr SPRN_SRR0, r4 mtspr SPRN_SRR1, r7 rfid @@ -322,35 +295,14 @@ lwarx_loop_stop: IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP) -_GLOBAL(power7_idle) +/* + * Entered with MSR[EE]=0 and no soft-masked interrupts pending. + * r3 contains desired idle state (PNV_THREAD_NAP/SLEEP/WINKLE). + */ +_GLOBAL(power7_idle_insn) /* Now check if user or arch enabled NAP mode */ - LOAD_REG_ADDRBASE(r3,powersave_nap) - lwz r4,ADDROFF(powersave_nap)(r3) - cmpwi 0,r4,0 - beqlr - li r3, 1 - /* fall through */ - -_GLOBAL(power7_nap) - mr r4,r3 - li r3,PNV_THREAD_NAP - LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode) + LOAD_REG_ADDR(r4, pnv_enter_arch207_idle_mode) b pnv_powersave_common - /* No return */ - -_GLOBAL(power7_sleep) - li r3,PNV_THREAD_SLEEP - li r4,1 - LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode) - b pnv_powersave_common - /* No return */ - -_GLOBAL(power7_winkle) - li r3,PNV_THREAD_WINKLE - li r4,1 - LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode) - b pnv_powersave_common - /* No return */ #define CHECK_HMI_INTERRUPT \ mfspr r0,SPRN_SRR1; \ @@ -372,17 +324,13 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ 20: nop; /* - * r3 - The PSSCR value corresponding to the stop state. - * r4 - The PSSCR mask corrresonding to the stop state. + * Entered with MSR[EE]=0 and no soft-masked interrupts pending. + * r3 contains desired PSSCR register value. */ _GLOBAL(power9_idle_stop) - mfspr r5,SPRN_PSSCR - andc r5,r5,r4 - or r3,r3,r5 std r3, PACA_REQ_PSSCR(r13) mtspr SPRN_PSSCR,r3 - LOAD_REG_ADDR(r5,power_enter_stop) - li r4,1 + LOAD_REG_ADDR(r4,power_enter_stop) b pnv_powersave_common /* No return */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 5c291df30fe3..58dcac88bc79 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -322,7 +322,8 @@ bool prep_irq_for_idle(void) * First we need to hard disable to ensure no interrupt * occurs before we effectively enter the low power state */ - hard_irq_disable(); + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; /* * If anything happened while we were soft-disabled, @@ -347,6 +348,36 @@ bool prep_irq_for_idle(void) return true; } +/* + * This is for idle sequences that return with IRQs off, but the + * idle state itself wakes on interrupt. Tell the irq tracer that + * IRQs are enabled for the duration of idle so it does not get long + * off times. Must be paired with fini_irq_for_idle_irqsoff. 
+ */ +bool prep_irq_for_idle_irqsoff(void) +{ + WARN_ON(!irqs_disabled()); + + /* + * First we need to hard disable to ensure no interrupt + * occurs before we effectively enter the low power state + */ + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + + /* + * If anything happened while we were soft-disabled, + * we return now and do not enter the low power state. + */ + if (lazy_irq_pending()) + return false; + + /* Tell lockdep we are about to re-enable */ + trace_hardirqs_on(); + + return true; +} + /* * Force a replay of the external interrupt handler on this CPU. */ diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 46946a587004..f875879ff1eb 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "powernv.h" #include "subcore.h" @@ -283,12 +284,68 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600, show_fastsleep_workaround_applyonce, store_fastsleep_workaround_applyonce); +static unsigned long __power7_idle_type(unsigned long type) +{ + unsigned long srr1; + + if (!prep_irq_for_idle_irqsoff()) + return 0; + + ppc64_runlatch_off(); + srr1 = power7_idle_insn(type); + ppc64_runlatch_on(); + + fini_irq_for_idle_irqsoff(); + + return srr1; +} + +void power7_idle_type(unsigned long type) +{ + __power7_idle_type(type); +} + +void power7_idle(void) +{ + if (!powersave_nap) + return; + + power7_idle_type(PNV_THREAD_NAP); +} + +static unsigned long __power9_idle_type(unsigned long stop_psscr_val, + unsigned long stop_psscr_mask) +{ + unsigned long psscr; + unsigned long srr1; + + if (!prep_irq_for_idle_irqsoff()) + return 0; + + psscr = mfspr(SPRN_PSSCR); + psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; + + ppc64_runlatch_off(); + srr1 = power9_idle_stop(psscr); + ppc64_runlatch_on(); + + fini_irq_for_idle_irqsoff(); + + return srr1; +} + +void power9_idle_type(unsigned long stop_psscr_val, + unsigned long stop_psscr_mask) +{ + __power9_idle_type(stop_psscr_val, stop_psscr_mask); +} + /* * Used for ppc_md.power_save which needs a function with no parameters */ -static void power9_idle(void) +void power9_idle(void) { - power9_idle_stop(pnv_default_stop_val, pnv_default_stop_mask); + power9_idle_type(pnv_default_stop_val, pnv_default_stop_mask); } #ifdef CONFIG_HOTPLUG_CPU @@ -303,16 +360,17 @@ unsigned long pnv_cpu_offline(unsigned int cpu) u32 idle_states = pnv_get_supported_cpuidle_states(); if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) { - srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val, + srr1 = __power9_idle_type(pnv_deepest_stop_psscr_val, pnv_deepest_stop_psscr_mask); } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { - srr1 = power7_winkle(); + srr1 = __power7_idle_type(PNV_THREAD_WINKLE); } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { - srr1 = power7_sleep(); + srr1 = __power7_idle_type(PNV_THREAD_SLEEP); } else if (idle_states & OPAL_PM_NAP_ENABLED) { - srr1 = power7_nap(1); + srr1 = __power7_idle_type(PNV_THREAD_NAP); } else { + ppc64_runlatch_off(); /* This is the fallback method. 
We emulate snooze */ while (!generic_check_cpu_restart(cpu)) { HMT_low(); @@ -320,6 +378,7 @@ unsigned long pnv_cpu_offline(unsigned int cpu) } srr1 = 0; HMT_medium(); + ppc64_runlatch_on(); } return srr1; diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 4aff754b6f2c..f8752795decf 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -182,9 +182,7 @@ static void pnv_smp_cpu_kill_self(void) */ kvmppc_set_host_ipi(cpu, 0); - ppc64_runlatch_off(); srr1 = pnv_cpu_offline(cpu); - ppc64_runlatch_on(); /* * If the SRR1 value indicates that we woke up due to diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c index 0babef11136f..d975d78188a9 100644 --- a/arch/powerpc/platforms/powernv/subcore.c +++ b/arch/powerpc/platforms/powernv/subcore.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -182,7 +183,7 @@ static void unsplit_core(void) cpu = smp_processor_id(); if (cpu_thread_in_core(cpu) != 0) { while (mfspr(SPRN_HID0) & mask) - power7_nap(0); + power7_idle_insn(PNV_THREAD_NAP); per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT; return; diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 45eaf06462ae..79152676f62b 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -73,9 +73,8 @@ static int nap_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - ppc64_runlatch_off(); - power7_idle(); - ppc64_runlatch_on(); + power7_idle_type(PNV_THREAD_NAP); + return index; } @@ -98,7 +97,8 @@ static int fastsleep_loop(struct cpuidle_device *dev, new_lpcr &= ~LPCR_PECE1; mtspr(SPRN_LPCR, new_lpcr); - power7_sleep(); + + power7_idle_type(PNV_THREAD_SLEEP); mtspr(SPRN_LPCR, old_lpcr); @@ -110,10 +110,8 @@ static int stop_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - ppc64_runlatch_off(); - power9_idle_stop(stop_psscr_table[index].val, + power9_idle_type(stop_psscr_table[index].val, stop_psscr_table[index].mask); - ppc64_runlatch_on(); return index; } From 2525db04d1cc53e1951143d1829aa75a78cc7f76 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Jun 2017 23:05:46 +1000 Subject: [PATCH 068/158] powerpc/powernv: Simplify lazy IRQ handling in CPU offline Rather than concern ourselves with any soft-mask logic in the CPU hotplug handler, just hard disable interrupts. This ensures there are no lazy-irqs pending, which means we can call directly to idle instruction in order to sleep. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/idle.c | 23 +++++++++++++-------- arch/powerpc/platforms/powernv/smp.c | 29 +++++++++++++-------------- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index f875879ff1eb..f188d84d9c59 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -352,25 +352,31 @@ void power9_idle(void) /* * pnv_cpu_offline: A function that puts the CPU into the deepest * available platform idle state on a CPU-Offline. + * interrupts hard disabled and no lazy irq pending. 
*/ unsigned long pnv_cpu_offline(unsigned int cpu) { unsigned long srr1; - u32 idle_states = pnv_get_supported_cpuidle_states(); + ppc64_runlatch_off(); + if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) { - srr1 = __power9_idle_type(pnv_deepest_stop_psscr_val, - pnv_deepest_stop_psscr_mask); + unsigned long psscr; + + psscr = mfspr(SPRN_PSSCR); + psscr = (psscr & ~pnv_deepest_stop_psscr_mask) | + pnv_deepest_stop_psscr_val; + srr1 = power9_idle_stop(psscr); + } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { - srr1 = __power7_idle_type(PNV_THREAD_WINKLE); + srr1 = power7_idle_insn(PNV_THREAD_WINKLE); } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { - srr1 = __power7_idle_type(PNV_THREAD_SLEEP); + srr1 = power7_idle_insn(PNV_THREAD_SLEEP); } else if (idle_states & OPAL_PM_NAP_ENABLED) { - srr1 = __power7_idle_type(PNV_THREAD_NAP); + srr1 = power7_idle_insn(PNV_THREAD_NAP); } else { - ppc64_runlatch_off(); /* This is the fallback method. We emulate snooze */ while (!generic_check_cpu_restart(cpu)) { HMT_low(); @@ -378,9 +384,10 @@ unsigned long pnv_cpu_offline(unsigned int cpu) } srr1 = 0; HMT_medium(); - ppc64_runlatch_on(); } + ppc64_runlatch_on(); + return srr1; } #endif diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index f8752795decf..c04c87adad94 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -144,7 +144,14 @@ static void pnv_smp_cpu_kill_self(void) unsigned long srr1, wmask; /* Standard hot unplug procedure */ - local_irq_disable(); + /* + * This hard disables local interurpts, ensuring we have no lazy + * irqs pending. + */ + WARN_ON(irqs_disabled()); + hard_irq_disable(); + WARN_ON(lazy_irq_pending()); + idle_task_exit(); current->active_mm = NULL; /* for sanity */ cpu = smp_processor_id(); @@ -162,16 +169,6 @@ static void pnv_smp_cpu_kill_self(void) */ mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1); - /* - * Hard-disable interrupts, and then clear irq_happened flags - * that we can safely ignore while off-line, since they - * are for things for which we do no processing when off-line - * (or in the case of HMI, all the processing we need to do - * is done in lower-level real-mode code). - */ - hard_irq_disable(); - local_paca->irq_happened &= ~(PACA_IRQ_DEC | PACA_IRQ_HMI); - while (!generic_check_cpu_restart(cpu)) { /* * Clear IPI flag, since we don't handle IPIs while @@ -184,6 +181,8 @@ static void pnv_smp_cpu_kill_self(void) srr1 = pnv_cpu_offline(cpu); + WARN_ON(lazy_irq_pending()); + /* * If the SRR1 value indicates that we woke up due to * an external interrupt, then clear the interrupt. @@ -196,8 +195,7 @@ static void pnv_smp_cpu_kill_self(void) * contains 0. 
*/ if (((srr1 & wmask) == SRR1_WAKEEE) || - ((srr1 & wmask) == SRR1_WAKEHVI) || - (local_paca->irq_happened & PACA_IRQ_EE)) { + ((srr1 & wmask) == SRR1_WAKEHVI)) { if (cpu_has_feature(CPU_FTR_ARCH_300)) { if (xive_enabled()) xive_flush_interrupt(); @@ -209,14 +207,15 @@ unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); asm volatile(PPC_MSGCLR(%0) : : "r" (msg)); } - local_paca->irq_happened &= ~(PACA_IRQ_EE | PACA_IRQ_DBELL); smp_mb(); if (cpu_core_split_required()) continue; if (srr1 && !generic_check_cpu_restart(cpu)) - DBG("CPU%d Unexpected exit while offline !\n", cpu); + DBG("CPU%d Unexpected exit while offline srr1=%lx!\n", + cpu, srr1); + } /* Re-enable decrementer interrupts */ From 771d4304d07f080b6ce751e12f3579cb012a1b22 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Jun 2017 23:05:47 +1000 Subject: [PATCH 069/158] powerpc/64s/idle: Process interrupts from system reset wakeup When the CPU wakes from a low power state, it begins at the system reset interrupt with the exception that caused the wakeup encoded in SRR1. Today, powernv idle wakeup ignores the wakeup reason (except a special case for HMI), and the regular interrupt corresponding to the exception will fire after the idle wakeup exits. Change this to replay the interrupt from the idle wakeup before interrupts are hard-enabled. Running the context_switch selftests benchmark on POWER8, with polling idle disabled (e.g., always nap, giving cross-CPU IPIs), gives the following results:

                               original    wakeup direct
Different threads, same core:  315k/s      264k/s
Different cores:               235k/s      242k/s

There is a slowdown in the doorbell IPI (same core) case because system reset wakeup does not clear the message and the doorbell interrupt fires again needlessly. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/hw_irq.h | 1 + arch/powerpc/kernel/irq.c | 29 +++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/idle.c | 10 +++++++-- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index f06112cf8734..c1dd1929342d 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -130,6 +130,7 @@ static inline bool arch_irq_disabled_regs(struct pt_regs *regs) extern bool prep_irq_for_idle(void); extern bool prep_irq_for_idle_irqsoff(void); +extern void irq_set_pending_from_srr1(unsigned long srr1); #define fini_irq_for_idle_irqsoff() trace_hardirqs_off(); diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 58dcac88bc79..0bcec745a672 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -348,6 +348,7 @@ bool prep_irq_for_idle(void) return true; } +#ifdef CONFIG_PPC_BOOK3S /* * This is for idle sequences that return with IRQs off, but the * idle state itself wakes on interrupt. Tell the irq tracer that @@ -378,6 +379,34 @@ bool prep_irq_for_idle_irqsoff(void) return true; } +/* + * Take the SRR1 wakeup reason, index into this table to find the + * appropriate irq_happened bit. + */ +static const u8 srr1_to_lazyirq[0x10] = { + 0, 0, 0, + PACA_IRQ_DBELL, + 0, + PACA_IRQ_DBELL, + PACA_IRQ_DEC, + 0, + PACA_IRQ_EE, + PACA_IRQ_EE, + PACA_IRQ_HMI, + 0, 0, 0, 0, 0 }; + +void irq_set_pending_from_srr1(unsigned long srr1) +{ + unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18; + + /* + * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0, + * so this can be called unconditionally with srr1 wake reason.
+ */ + local_paca->irq_happened |= srr1_to_lazyirq[idx]; +} +#endif /* CONFIG_PPC_BOOK3S */ + /* * Force a replay of the external interrupt handler on this CPU. */ diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index f188d84d9c59..1028df82cd2f 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -302,7 +302,10 @@ static unsigned long __power7_idle_type(unsigned long type) void power7_idle_type(unsigned long type) { - __power7_idle_type(type); + unsigned long srr1; + + srr1 = __power7_idle_type(type); + irq_set_pending_from_srr1(srr1); } void power7_idle(void) @@ -337,7 +340,10 @@ static unsigned long __power9_idle_type(unsigned long stop_psscr_val, void power9_idle_type(unsigned long stop_psscr_val, unsigned long stop_psscr_mask) { - __power9_idle_type(stop_psscr_val, stop_psscr_mask); + unsigned long srr1; + + srr1 = __power9_idle_type(stop_psscr_val, stop_psscr_mask); + irq_set_pending_from_srr1(srr1); } /* From a9af97aa0a12c30178dd7ad9af8887d5b9c4647b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Jun 2017 23:05:48 +1000 Subject: [PATCH 070/158] powerpc/64s: msgclr when handling doorbell exceptions from system reset msgsnd doorbell exceptions are cleared when the doorbell interrupt is taken. However, if a doorbell exception causes a system reset interrupt wake from a power saving state, the message is not cleared. Processing the doorbell from the system reset interrupt requires msgclr to avoid taking the exception again. Testing this plus the previous wakeup-direct patch gives:

                               original    wakeup direct    msgclr
Different threads, same core:  315k/s      264k/s           345k/s
Different cores:               235k/s      242k/s           242k/s

Net speedup is +10% for same core, and +3% for different core. Reviewed-by: Gautham R.
Shenoy Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/dbell.h | 13 +++++++++++++ arch/powerpc/include/asm/ppc-opcode.h | 3 +++ arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kernel/exceptions-64s.S | 23 +++++++++++++++++++++-- 4 files changed, 38 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h index f70cbfe0ec04..9f2ae0d25e15 100644 --- a/arch/powerpc/include/asm/dbell.h +++ b/arch/powerpc/include/asm/dbell.h @@ -56,6 +56,19 @@ static inline void ppc_msgsync(void) : : "i" (CPU_FTR_HVMODE|CPU_FTR_ARCH_300)); } +static inline void _ppc_msgclr(u32 msg) +{ + __asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGCLR(%1), PPC_MSGCLRP(%1), %0) + : : "i" (CPU_FTR_HVMODE), "r" (msg)); +} + +static inline void ppc_msgclr(enum ppc_dbell type) +{ + u32 msg = PPC_DBELL_TYPE(type); + + _ppc_msgclr(msg); +} + #else /* CONFIG_PPC_BOOK3S */ #define PPC_DBELL_MSGTYPE PPC_DBELL diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 3b6bbf5a8683..4e2cf719c9b2 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -220,6 +220,7 @@ #define PPC_INST_MSGCLR 0x7c0001dc #define PPC_INST_MSGSYNC 0x7c0006ec #define PPC_INST_MSGSNDP 0x7c00011c +#define PPC_INST_MSGCLRP 0x7c00015c #define PPC_INST_MTTMR 0x7c0003dc #define PPC_INST_NOP 0x60000000 #define PPC_INST_PASTE 0x7c20070d @@ -409,6 +410,8 @@ ___PPC_RB(b)) #define PPC_MSGSNDP(b) stringify_in_c(.long PPC_INST_MSGSNDP | \ ___PPC_RB(b)) +#define PPC_MSGCLRP(b) stringify_in_c(.long PPC_INST_MSGCLRP | \ + ___PPC_RB(b)) #define PPC_POPCNTB(a, s) stringify_in_c(.long PPC_INST_POPCNTB | \ __PPC_RA(a) | __PPC_RS(s)) #define PPC_POPCNTD(a, s) stringify_in_c(.long PPC_INST_POPCNTD | \ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index e15c178ba079..9624851ca276 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -746,6 +746,7 @@ int main(void) #endif DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); + DEFINE(PPC_DBELL_MSGTYPE, PPC_DBELL_MSGTYPE); #ifdef CONFIG_PPC_8xx DEFINE(VIRT_IMMR_BASE, (u64)__fix_to_virt(FIX_IMMR_BASE)); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 2f700a15bfa3..1752beefee69 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1611,6 +1611,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) bl kernel_bad_stack b 1b +/* + * When doorbell is triggered from system reset wakeup, the message is + * not cleared, so it would fire again when EE is enabled. + * + * When coming from local_irq_enable, there may be the same problem if + * we were hard disabled. + * + * Execute msgclr to clear pending exceptions before handling it. + */ +h_doorbell_common_msgclr: + LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) + PPC_MSGCLR(3) + b h_doorbell_common + +doorbell_super_common_msgclr: + LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) + PPC_MSGCLRP(3) + b doorbell_super_common + /* * Called from arch_local_irq_enable when an interrupt needs * to be resent. 
r3 contains 0x500, 0x900, 0xa00 or 0xe80 to indicate @@ -1636,13 +1655,13 @@ _GLOBAL(__replay_interrupt) beq hardware_interrupt_common BEGIN_FTR_SECTION cmpwi r3,0xe80 - beq h_doorbell_common + beq h_doorbell_common_msgclr cmpwi r3,0xea0 beq h_virt_irq_common cmpwi r3,0xe60 beq hmi_exception_common FTR_SECTION_ELSE cmpwi r3,0xa00 - beq doorbell_super_common + beq doorbell_super_common_msgclr ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) blr From b48bbb82e2b83537c500417d60218ad44446e572 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Jun 2017 23:05:49 +1000 Subject: [PATCH 071/158] powerpc/64s: Don't unbalance the return branch predictor in __replay_interrupt() The __replay_interrupt() code is branched to with bl, but the caller is returned to directly with rfid from the interrupt. Instead, rfid to a stub that returns to the caller with blr, which should keep the return branch predictor balanced. Reviewed-by: Gautham R. Shenoy Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 1752beefee69..cad3b4b82813 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1639,6 +1639,10 @@ doorbell_super_common_msgclr: * Note: While MSR:EE is off, we need to make sure that _MSR * in the generated frame has EE set to 1 or the exception * handler will not properly re-enable them. + * + * Note that we don't specify LR as the NIP (return address) for + * the interrupt because that would unbalance the return branch + * predictor. */ _GLOBAL(__replay_interrupt) /* We are going to jump to the exception common code which @@ -1646,7 +1650,7 @@ _GLOBAL(__replay_interrupt) * we don't give a damn about, so we don't bother storing them. */ mfmsr r12 - mflr r11 + LOAD_REG_ADDR(r11, .L__replay_interrupt_return) mfcr r9 ori r12,r12,MSR_EE cmpwi r3,0x900 @@ -1664,4 +1668,6 @@ FTR_SECTION_ELSE cmpwi r3,0xa00 beq doorbell_super_common_msgclr ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) +.L__replay_interrupt_return: blr + From b51351e264009e890936af83b8d800b32034273d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Jun 2017 23:05:50 +1000 Subject: [PATCH 072/158] powerpc/64s/idle: Branch to handler with virtual mode offset Have the system reset idle wakeup handlers branched to in real mode with the 0xc... kernel address applied. This allows simplifications of avoiding rfid when switching to virtual mode in the wakeup handler. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 13 +++++++++++++ arch/powerpc/kernel/exceptions-64s.S | 6 ++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 183d73b6ed99..33473cbc0986 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -236,6 +236,19 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define kvmppc_interrupt kvmppc_interrupt_pr #endif +/* + * Branch to label using its 0xC000 address. This results in instruction + * address suitable for MSR[IR]=0 or 1, which allows relocation to be turned + * on using mtmsr rather than rfid. + * + * This could set the 0xc bits for !RELOCATABLE as an immediate, rather than + * load KBASE for a slight optimisation. 
+ */ +#define BRANCH_TO_C000(reg, label) \ + __LOAD_HANDLER(reg, label); \ + mtctr reg; \ + bctr + #ifdef CONFIG_RELOCATABLE #define BRANCH_TO_COMMON(reg, label) \ __LOAD_HANDLER(reg, label); \ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index cad3b4b82813..7807719ca855 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -99,7 +99,9 @@ EXC_VIRT_NONE(0x4000, 0x100) #ifdef CONFIG_PPC_P7_NAP /* * If running native on arch 2.06 or later, check if we are waking up - * from nap/sleep/winkle, and branch to idle handler. + * from nap/sleep/winkle, and branch to idle handler. The idle wakeup + * handler initially runs in real mode, but we branch to the 0xc000... + * address so we can turn on relocation with mtmsr. */ #define IDLETEST(n) \ BEGIN_FTR_SECTION ; \ @@ -107,7 +109,7 @@ EXC_VIRT_NONE(0x4000, 0x100) rlwinm. r10,r10,47-31,30,31 ; \ beq- 1f ; \ cmpwi cr3,r10,2 ; \ - BRANCH_TO_COMMON(r10, system_reset_idle_common) ; \ + BRANCH_TO_C000(r10, system_reset_idle_common) ; \ 1: \ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #else From 9d29250136f60438fc0839871bae0a0e9cbbd47e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Jun 2017 23:05:51 +1000 Subject: [PATCH 073/158] powerpc/64s/idle: Avoid SRR usage in idle sleep/wake paths Idle code now always runs at the 0xc... effective address whether in real or virtual mode. This means rfid can be ditched, along with a lot of SRR manipulations. In the wakeup path, carry SRR1 around in r12. Use mtmsrd to change MSR states as required. This also balances the return prediction for the idle call, by doing blr rather than rfid to return to the idle caller. On POWER9, 2-process context switch on different cores, with snooze disabled, increases performance by 2%. Signed-off-by: Nicholas Piggin [mpe: Incorporate v2 fixes from Nick] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 1 + arch/powerpc/kernel/idle_book3s.S | 57 ++++++++++++------------- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 12 +++++- 3 files changed, 38 insertions(+), 32 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7807719ca855..64365907cddc 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -130,6 +130,7 @@ EXC_VIRT_NONE(0x4100, 0x100) #ifdef CONFIG_PPC_P7_NAP EXC_COMMON_BEGIN(system_reset_idle_common) + mfspr r12,SPRN_SRR1 b pnv_powersave_wakeup #endif diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 35cf5bb7daed..ebe80b5d5ce4 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -111,7 +111,7 @@ core_idle_lock_held: * r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8 * - Requested PSSCR value in POWER9 * - * Address of idle handler to 'rfid' to in r4 + * Address of idle handler to branch to in realmode in r4 */ pnv_powersave_common: /* Use r3 to pass state nap/sleep/winkle */ @@ -121,14 +121,14 @@ pnv_powersave_common: * need to save PC, some CR bits and the NV GPRs, * but for now an interrupt frame will do. */ + mtctr r4 + mflr r0 std r0,16(r1) stdu r1,-INT_FRAME_SIZE(r1) std r0,_LINK(r1) std r0,_NIP(r1) - mfmsr r9 - /* We haven't lost state ... yet */ li r0,0 stb r0,PACA_NAPSTATELOST(r13) @@ -138,7 +138,6 @@ pnv_powersave_common: SAVE_NVGPRS(r1) mfcr r5 std r5,_CCR(r1) - std r9,_MSR(r1) std r1,PACAR1(r13) /* @@ -148,12 +147,8 @@ pnv_powersave_common: * the MMU context to the guest. 
*/ LOAD_REG_IMMEDIATE(r7, MSR_IDLE) - li r6, MSR_RI - andc r6, r9, r6 - mtmsrd r6, 1 /* clear RI before setting SRR0/1 */ - mtspr SPRN_SRR0, r4 - mtspr SPRN_SRR1, r7 - rfid + mtmsrd r7,0 + bctr .globl pnv_enter_arch207_idle_mode pnv_enter_arch207_idle_mode: @@ -305,11 +300,10 @@ _GLOBAL(power7_idle_insn) b pnv_powersave_common #define CHECK_HMI_INTERRUPT \ - mfspr r0,SPRN_SRR1; \ BEGIN_FTR_SECTION_NESTED(66); \ - rlwinm r0,r0,45-31,0xf; /* extract wake reason field (P8) */ \ + rlwinm r0,r12,45-31,0xf; /* extract wake reason field (P8) */ \ FTR_SECTION_ELSE_NESTED(66); \ - rlwinm r0,r0,45-31,0xe; /* P7 wake reason field is 3 bits */ \ + rlwinm r0,r12,45-31,0xe; /* P7 wake reason field is 3 bits */ \ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ cmpwi r0,0xa; /* Hypervisor maintenance ? */ \ bne 20f; \ @@ -388,17 +382,17 @@ pnv_powersave_wakeup_mce: /* * Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake - * reason into SRR1, which allows reuse of the system reset wakeup + * reason into r12, which allows reuse of the system reset wakeup * code without being mistaken for another type of wakeup. */ - oris r3,r3,SRR1_WAKEMCE_RESVD@h - mtspr SPRN_SRR1,r3 + oris r12,r3,SRR1_WAKEMCE_RESVD@h b pnv_powersave_wakeup /* * Called from reset vector for powersave wakeups. * cr3 - set to gt if waking up with partial/complete hypervisor state loss + * r12 - SRR1 */ .global pnv_powersave_wakeup pnv_powersave_wakeup: @@ -416,6 +410,8 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) li r0,PNV_THREAD_RUNNING stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */ + mr r3,r12 + #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE li r0,KVM_HWTHREAD_IN_KERNEL stb r0,HSTATE_HWTHREAD_STATE(r13) @@ -429,7 +425,6 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) #endif /* Return SRR1 from power7_nap() */ - mfspr r3,SPRN_SRR1 blt cr3,pnv_wakeup_noloss b pnv_wakeup_loss @@ -529,9 +524,9 @@ pnv_wakeup_tb_loss: * is required to return back to reset vector after hypervisor state * restore is complete. */ + mr r19,r12 mr r18,r4 mflr r17 - mfspr r16,SPRN_SRR1 BEGIN_FTR_SECTION CHECK_HMI_INTERRUPT END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) @@ -781,7 +776,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) hypervisor_state_restored: - mtspr SPRN_SRR1,r16 + mr r12,r19 mtlr r17 blr /* return to pnv_powersave_wakeup */ @@ -794,6 +789,7 @@ fastsleep_workaround_at_exit: /* * R3 here contains the value that will be returned to the caller * of power7_nap. + * R12 contains SRR1 for CHECK_HMI_INTERRUPT. */ .global pnv_wakeup_loss pnv_wakeup_loss: @@ -803,32 +799,33 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) REST_NVGPRS(r1) REST_GPR(2, r1) + ld r4,PACAKMSR(r13) + ld r5,_LINK(r1) ld r6,_CCR(r1) - ld r4,_MSR(r1) - ld r5,_NIP(r1) addi r1,r1,INT_FRAME_SIZE + mtlr r5 mtcr r6 - mtspr SPRN_SRR1,r4 - mtspr SPRN_SRR0,r5 - rfid + mtmsrd r4 + blr /* * R3 here contains the value that will be returned to the caller * of power7_nap. + * R12 contains SRR1 for CHECK_HMI_INTERRUPT. 
*/ pnv_wakeup_noloss: lbz r0,PACA_NAPSTATELOST(r13) cmpwi r0,0 bne pnv_wakeup_loss + ld r1,PACAR1(r13) BEGIN_FTR_SECTION CHECK_HMI_INTERRUPT END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - ld r1,PACAR1(r13) - ld r6,_CCR(r1) - ld r4,_MSR(r1) + ld r4,PACAKMSR(r13) ld r5,_NIP(r1) + ld r6,_CCR(r1) addi r1,r1,INT_FRAME_SIZE + mtlr r5 mtcr r6 - mtspr SPRN_SRR1,r4 - mtspr SPRN_SRR0,r5 - rfid + mtmsrd r4 + blr diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index bdb3f76ceb6b..ecb69c4ee943 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -329,15 +329,21 @@ kvm_novcpu_exit: * We come in here when wakened from nap mode. * Relocation is off and most register values are lost. * r13 points to the PACA. + * r3 contains the SRR1 wakeup value, SRR1 is trashed. */ .globl kvm_start_guest kvm_start_guest: - /* Set runlatch bit the minute you wake up from nap */ mfspr r0, SPRN_CTRLF ori r0, r0, 1 mtspr SPRN_CTRLT, r0 + /* + * Could avoid this and pass it through in r3. For now, + * code expects it to be in SRR1. + */ + mtspr SPRN_SRR1,r3 + ld r2,PACATOC(r13) li r0,KVM_HWTHREAD_IN_KVM @@ -456,13 +462,15 @@ kvm_no_guest: /* * We jump to pnv_wakeup_loss, which will return to the caller * of power7_nap in the powernv cpu offline loop. The value we - * put in r3 becomes the return value for power7_nap. + * put in r3 becomes the return value for power7_nap. pnv_wakeup_loss + * requires SRR1 in r12. */ li r3, LPCR_PECE0 mfspr r4, SPRN_LPCR rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 mtspr SPRN_LPCR, r4 li r3, 0 + mfspr r12,SPRN_SRR1 b pnv_wakeup_loss 53: HMT_LOW From 95acdc07124f329ef3088a9bc68af905804b2e6b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Jun 2017 23:05:52 +1000 Subject: [PATCH 074/158] powerpc/64s/idle: Predict HMI wakeup as unlikely In a busy system, idle wakeups can be expected from IPIs and device interrupts. Reviewed-by: Gautham R. Shenoy Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/idle_book3s.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index ebe80b5d5ce4..1ea14b96f126 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -306,7 +306,7 @@ FTR_SECTION_ELSE_NESTED(66); \ rlwinm r0,r12,45-31,0xe; /* P7 wake reason field is 3 bits */ \ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ cmpwi r0,0xa; /* Hypervisor maintenance ? */ \ - bne 20f; \ + bne+ 20f; \ /* Invoke opal call to handle hmi */ \ ld r2,PACATOC(r13); \ ld r1,PACAR1(r13); \ From 40d24343a8926b6998a13d04aa54e04961d1f5ed Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Jun 2017 23:05:57 +1000 Subject: [PATCH 075/158] powerpc/64s/idle: Run latch switch is done with MSR[EE]=0 In the idle sleep/wake code we know that MSR[EE] is clear, so we can avoid 2 x mfmsr and 2 x mtmsr by calling the double-underscore versions of the run latch routines which assume interrupts are already disabled. 
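For illustration, the wrappers being bypassed look roughly like this (a paraphrased sketch of the runlatch.h macros of this era, not an exact quote):

	#define ppc64_runlatch_off()					\
		do {							\
			if (cpu_has_feature(CPU_FTR_CTRL) &&		\
			    test_thread_local_flags(_TLF_RUNLATCH)) {	\
				unsigned long msr = mfmsr();		\
				__hard_irq_disable();	/* mtmsr */	\
				__ppc64_runlatch_off();	/* CTRL SPR only */ \
				if (msr & MSR_EE)			\
					__hard_irq_enable(); /* mtmsr */ \
			}						\
		} while (0)

With MSR[EE] known to be clear on these paths, the mfmsr/mtmsr bracketing is pure overhead, so the inner __ppc64_runlatch_off()/__ppc64_runlatch_on() calls can be made directly.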
Acked-by: Vaidyanathan Srinivasan Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/idle.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 1028df82cd2f..2abee070373f 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -291,9 +291,9 @@ static unsigned long __power7_idle_type(unsigned long type) if (!prep_irq_for_idle_irqsoff()) return 0; - ppc64_runlatch_off(); + __ppc64_runlatch_off(); srr1 = power7_idle_insn(type); - ppc64_runlatch_on(); + __ppc64_runlatch_on(); fini_irq_for_idle_irqsoff(); @@ -328,9 +328,9 @@ static unsigned long __power9_idle_type(unsigned long stop_psscr_val, psscr = mfspr(SPRN_PSSCR); psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; - ppc64_runlatch_off(); + __ppc64_runlatch_off(); srr1 = power9_idle_stop(psscr); - ppc64_runlatch_on(); + __ppc64_runlatch_on(); fini_irq_for_idle_irqsoff(); @@ -365,7 +365,7 @@ unsigned long pnv_cpu_offline(unsigned int cpu) unsigned long srr1; u32 idle_states = pnv_get_supported_cpuidle_states(); - ppc64_runlatch_off(); + __ppc64_runlatch_off(); if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) { unsigned long psscr; @@ -392,7 +392,7 @@ unsigned long pnv_cpu_offline(unsigned int cpu) HMT_medium(); } - ppc64_runlatch_on(); + __ppc64_runlatch_on(); return srr1; } From d59afffdf04c66c09085160706297ec55833533c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 21 May 2017 23:15:42 +1000 Subject: [PATCH 076/158] powerpc/64s: Preserve r3 in slb_allocate_realmode() One fewer registers clobbered by this function means the SLB miss handler can save one fewer. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slb_low.S | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 1519617aab36..9869b44a04dc 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -70,6 +70,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA) * Create an SLB entry for the given EA (user or kernel). * r3 = faulting address, r13 = PACA * r9, r10, r11 are clobbered by this function + * r3 is preserved. * No other registers are examined or changed. */ _GLOBAL(slb_allocate_realmode) @@ -235,6 +236,9 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) * dont have any LRU information to help us choose a slot. */ + mr r9,r3 + + /* slb_finish_load_1T continues here. r9=EA with non-ESID bits clear */ 7: ld r10,PACASTABRR(r13) addi r10,r10,1 /* This gets soft patched on boot. */ @@ -249,10 +253,10 @@ slb_compare_rr_to_size: std r10,PACASTABRR(r13) 3: - rldimi r3,r10,0,36 /* r3= EA[0:35] | entry */ - oris r10,r3,SLB_ESID_V@h /* r3 |= SLB_ESID_V */ + rldimi r9,r10,0,36 /* r9 = EA[0:35] | entry */ + oris r10,r9,SLB_ESID_V@h /* r10 = r9 | SLB_ESID_V */ - /* r3 = ESID data, r11 = VSID data */ + /* r9 = ESID data, r11 = VSID data */ /* * No need for an isync before or after this slbmte. 
The exception @@ -265,21 +269,21 @@ slb_compare_rr_to_size: bgelr cr7 /* Update the slb cache */ - lhz r3,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */ - cmpldi r3,SLB_CACHE_ENTRIES + lhz r9,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */ + cmpldi r9,SLB_CACHE_ENTRIES bge 1f /* still room in the slb cache */ - sldi r11,r3,2 /* r11 = offset * sizeof(u32) */ + sldi r11,r9,2 /* r11 = offset * sizeof(u32) */ srdi r10,r10,28 /* get the 36 bits of the ESID */ add r11,r11,r13 /* r11 = (u32 *)paca + offset */ stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ - addi r3,r3,1 /* offset++ */ + addi r9,r9,1 /* offset++ */ b 2f 1: /* offset >= SLB_CACHE_ENTRIES */ - li r3,SLB_CACHE_ENTRIES+1 + li r9,SLB_CACHE_ENTRIES+1 2: - sth r3,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */ + sth r9,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */ crclr 4*cr0+eq /* set result to "success" */ blr @@ -301,7 +305,7 @@ slb_compare_rr_to_size: rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ /* r3 = EA, r11 = VSID data */ - clrrdi r3,r3,SID_SHIFT_1T /* clear out non-ESID bits */ + clrrdi r9,r3,SID_SHIFT_1T /* clear out non-ESID bits */ b 7b From 7c28f04828dc8321cb234b2ad57266b9f902add0 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 21 May 2017 23:15:43 +1000 Subject: [PATCH 077/158] powerpc/64s: Avoid saving faulting address into EX_DAR in SLB miss The EX_DAR save area is only used in exceptional cases. With r3 no longer clobbered by slb_allocate_realmode, saving faulting address to EX_DAR can be deferred to those cases. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 64365907cddc..fe3bc52aadf8 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -631,7 +631,6 @@ EXC_COMMON_BEGIN(slb_miss_realmode) stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ - std r3,PACA_EXSLB+EX_DAR(r13) crset 4*cr0+eq #ifdef CONFIG_PPC_STD_MMU_64 @@ -641,11 +640,10 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) #endif ld r10,PACA_EXSLB+EX_LR(r13) - ld r3,PACA_EXSLB+EX_R3(r13) lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ mtlr r10 - beq 8f /* if bad address, make full stack frame */ + beq- 8f /* if bad address, make full stack frame */ andi. r10,r12,MSR_RI /* check for unrecoverable exception */ beq- 2f @@ -660,6 +658,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) .machine pop RESTORE_PPR_PACA(PACA_EXSLB, r9) + ld r3,PACA_EXSLB+EX_R3(r13) ld r9,PACA_EXSLB+EX_R9(r13) ld r10,PACA_EXSLB+EX_R10(r13) ld r11,PACA_EXSLB+EX_R11(r13) @@ -668,7 +667,9 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) rfid b . /* prevent speculative execution */ -2: mfspr r11,SPRN_SRR0 +2: std r3,PACA_EXSLB+EX_DAR(r13) + ld r3,PACA_EXSLB+EX_R3(r13) + mfspr r11,SPRN_SRR0 LOAD_HANDLER(r10,unrecov_slb) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) @@ -676,7 +677,9 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) rfid b . 
-8: mfspr r11,SPRN_SRR0 +8: std r3,PACA_EXSLB+EX_DAR(r13) + ld r3,PACA_EXSLB+EX_R3(r13) + mfspr r11,SPRN_SRR0 LOAD_HANDLER(r10,bad_addr_slb) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) From fe5482c04312791bb19202e47f8a7751d476251e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 21 May 2017 23:15:44 +1000 Subject: [PATCH 078/158] powerpc/64s: SLB miss already has CTR saved for relocatable kernel The EXCEPTION_PROLOG_1 used by SLB miss already saves CTR when the kernel is built with CONFIG_RELOCATABLE. So it does not have to be saved and reloaded when branching to slb_miss_realmode. It can be restored from the PACA as usual. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index fe3bc52aadf8..059b3a356250 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -522,7 +522,6 @@ EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) * because the distance from here to there depends on where * the kernel ends up being put. */ - mfctr r11 LOAD_HANDLER(r10, slb_miss_realmode) mtctr r10 bctr @@ -545,7 +544,6 @@ EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) * because the distance from here to there depends on where * the kernel ends up being put. */ - mfctr r11 LOAD_HANDLER(r10, slb_miss_realmode) mtctr r10 bctr @@ -585,7 +583,6 @@ EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) #ifndef CONFIG_RELOCATABLE b slb_miss_realmode #else - mfctr r11 LOAD_HANDLER(r10, slb_miss_realmode) mtctr r10 bctr @@ -603,7 +600,6 @@ EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) #ifndef CONFIG_RELOCATABLE b slb_miss_realmode #else - mfctr r11 LOAD_HANDLER(r10, slb_miss_realmode) mtctr r10 bctr @@ -625,10 +621,6 @@ EXC_COMMON_BEGIN(slb_miss_realmode) * procedure. */ mflr r10 -#ifdef CONFIG_RELOCATABLE - mtctr r11 -#endif - stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ @@ -657,6 +649,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ .machine pop + RESTORE_CTR(r9, PACA_EXSLB) RESTORE_PPR_PACA(PACA_EXSLB, r9) ld r3,PACA_EXSLB+EX_R3(r13) ld r9,PACA_EXSLB+EX_R9(r13) From 4d7cd3b956713d3dfbc3028ad1251b3f6b416a53 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 21 May 2017 23:15:45 +1000 Subject: [PATCH 079/158] powerpc/64s: Avoid r3 save/restore in SLB miss handler The SLB miss handler uses r3 for the faulting address but r12 is mostly able to be freed up to save r3 in. It just requires SRR1 be reloaded again on error. It would be more conventional to use r12 for SRR1 (and use r11 to save r3), but slb_allocate_realmode clobbers r11 and not r12. 
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 41 ++++++++++++++++++---------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 059b3a356250..ed8628c6f0f4 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -510,9 +510,9 @@ EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_DAR - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crset 4*cr6+eq #ifndef CONFIG_RELOCATABLE b slb_miss_realmode @@ -532,9 +532,9 @@ EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_DAR - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crset 4*cr6+eq #ifndef CONFIG_RELOCATABLE b slb_miss_realmode @@ -576,9 +576,9 @@ EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crclr 4*cr6+eq #ifndef CONFIG_RELOCATABLE b slb_miss_realmode @@ -593,9 +593,9 @@ EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crclr 4*cr6+eq #ifndef CONFIG_RELOCATABLE b slb_miss_realmode @@ -612,10 +612,10 @@ TRAMP_KVM(PACA_EXSLB, 0x480) EXC_COMMON_BEGIN(slb_miss_realmode) /* * r13 points to the PACA, r9 contains the saved CR, - * r12 contain the saved SRR1, SRR0 is still ready for return + * r12 contains the saved r3, + * r11 contain the saved SRR1, SRR0 is still ready for return * r3 has the faulting address * r9 - r13 are saved in paca->exslb. - * r3 is saved in paca->slb_r3 * cr6.eq is set for a D-SLB miss, clear for a I-SLB miss * We assume we aren't going to take any exceptions during this * procedure. @@ -624,6 +624,15 @@ EXC_COMMON_BEGIN(slb_miss_realmode) stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ + /* + * Test MSR_RI before calling slb_allocate_realmode, because the + * MSR in r11 gets clobbered. However we still want to allocate + * SLB in case MSR_RI=0, to minimise the risk of getting stuck in + * recursive SLB faults. So use cr5 for this, which is preserved. + */ + andi. r11,r11,MSR_RI /* check for unrecoverable exception */ + cmpdi cr5,r11,MSR_RI + crset 4*cr0+eq #ifdef CONFIG_PPC_STD_MMU_64 BEGIN_MMU_FTR_SECTION @@ -637,21 +646,21 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) beq- 8f /* if bad address, make full stack frame */ - andi. r10,r12,MSR_RI /* check for unrecoverable exception */ - beq- 2f + bne- cr5,2f /* if unrecoverable exception, oops */ /* All done -- return from exception. 
*/ .machine push .machine "power4" mtcrf 0x80,r9 + mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ mtcrf 0x02,r9 /* I/D indication is in cr6 */ mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ .machine pop RESTORE_CTR(r9, PACA_EXSLB) RESTORE_PPR_PACA(PACA_EXSLB, r9) - ld r3,PACA_EXSLB+EX_R3(r13) + mr r3,r12 ld r9,PACA_EXSLB+EX_R9(r13) ld r10,PACA_EXSLB+EX_R10(r13) ld r11,PACA_EXSLB+EX_R11(r13) @@ -661,8 +670,9 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) b . /* prevent speculative execution */ 2: std r3,PACA_EXSLB+EX_DAR(r13) - ld r3,PACA_EXSLB+EX_R3(r13) + mr r3,r12 mfspr r11,SPRN_SRR0 + mfspr r12,SPRN_SRR1 LOAD_HANDLER(r10,unrecov_slb) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) @@ -671,8 +681,9 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) b . 8: std r3,PACA_EXSLB+EX_DAR(r13) - ld r3,PACA_EXSLB+EX_R3(r13) + mr r3,r12 mfspr r11,SPRN_SRR0 + mfspr r12,SPRN_SRR1 LOAD_HANDLER(r10,bad_addr_slb) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) From 8c3885141537966065e3d2b9be03e574ae381c79 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 21 May 2017 23:15:46 +1000 Subject: [PATCH 080/158] powerpc/64s: Add EX_SIZE definition for paca exception save areas Rather than open-coding it 4 times. Signed-off-by: Nicholas Piggin [mpe: Move __ASSEMBLY__ guards into head-64.h where they're really needed] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 3 +++ arch/powerpc/include/asm/head-64.h | 3 +++ arch/powerpc/include/asm/paca.h | 12 ++++++++---- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 33473cbc0986..9f7f9be6bf7a 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -36,6 +36,7 @@ */ #include +/* PACA save area offsets (exgen, exmc, etc) */ #define EX_R9 0 #define EX_R10 8 #define EX_R11 16 @@ -51,6 +52,8 @@ #define EX_PPR 88 /* SMT thread status register (priority) */ #define EX_CTR 96 +#define EX_SIZE 13 /* size in u64 units */ + #ifdef CONFIG_RELOCATABLE #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ mfspr r11,SPRN_##h##SRR0; /* save SRR0 */ \ diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h index 7ab95798f170..d81eac5b509f 100644 --- a/arch/powerpc/include/asm/head-64.h +++ b/arch/powerpc/include/asm/head-64.h @@ -3,6 +3,7 @@ #include +#ifdef __ASSEMBLY__ /* * We can't do CPP stringification and concatenation directly into the section * name for some reason, so these macros can do it for us.
@@ -415,4 +416,6 @@ name: EXC_COMMON_BEGIN(name); \ STD_EXCEPTION_COMMON(realvec + 0x2, name, hdlr); \ +#endif /* __ASSEMBLY__ */ + #endif /* _ASM_POWERPC_HEAD_64_H */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 77f60a0f1405..dc88a31cc79a 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -21,7 +21,11 @@ #include #include #include +#ifdef CONFIG_PPC_BOOK3E #include +#else +#include +#endif #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #include #endif @@ -98,8 +102,8 @@ struct paca_struct { * Now, starting in cacheline 2, the exception save areas */ /* used for most interrupts/exceptions */ - u64 exgen[13] __attribute__((aligned(0x80))); - u64 exslb[13]; /* used for SLB/segment table misses + u64 exgen[EX_SIZE] __attribute__((aligned(0x80))); + u64 exslb[EX_SIZE]; /* used for SLB/segment table misses * on the linear mapping */ /* SLB related definitions */ u16 vmalloc_sllp; @@ -183,8 +187,8 @@ struct paca_struct { #ifdef CONFIG_PPC_STD_MMU_64 /* Non-maskable exceptions that are not performance critical */ - u64 exnmi[13]; /* used for system reset (nmi) */ - u64 exmc[13]; /* used for machine checks */ + u64 exnmi[EX_SIZE]; /* used for system reset (nmi) */ + u64 exmc[EX_SIZE]; /* used for machine checks */ #endif #ifdef CONFIG_PPC_BOOK3S_64 /* Exclusive stacks for system reset and machine check exception. */ From 36670fcf01aa22c7de2e96b4a6fb5fbd4dfe4a33 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 21 May 2017 23:15:47 +1000 Subject: [PATCH 081/158] powerpc/64s/paca: EX_SRR0 is unused, remove it Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 9f7f9be6bf7a..f3c6272f430f 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -42,17 +42,16 @@ #define EX_R11 16 #define EX_R12 24 #define EX_R13 32 -#define EX_SRR0 40 -#define EX_DAR 48 -#define EX_DSISR 56 -#define EX_CCR 60 -#define EX_R3 64 -#define EX_LR 72 -#define EX_CFAR 80 -#define EX_PPR 88 /* SMT thread status register (priority) */ -#define EX_CTR 96 +#define EX_DAR 40 +#define EX_DSISR 48 +#define EX_CCR 52 +#define EX_R3 56 +#define EX_LR 64 +#define EX_CFAR 72 +#define EX_PPR 80 /* SMT thread status register (priority) */ +#define EX_CTR 88 -#define EX_SIZE 13 /* size in u64 units */ +#define EX_SIZE 12 /* size in u64 units */ #ifdef CONFIG_RELOCATABLE #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ From dbeea1d6b4bd9fff10e125e5516156fb52ddeae8 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 21 May 2017 23:15:48 +1000 Subject: [PATCH 082/158] powerpc/64s/paca: EX_LR can be merged with EX_DAR EX_LR is used only for a small section of the SLB miss handler. Merge it with EX_DAR. 
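Purely as an illustration (not part of the patch), the layout this produces can be sketched as a C struct with a union expressing the alias. The struct and field names below are hypothetical; only the offsets mirror the EX_* defines in the diff:

	/* Hypothetical C view of a paca exception save area after this
	 * patch. EX_LR aliases EX_DAR (offset 40) because the SLB miss
	 * path that saves LR never needs DAR at the same time.
	 */
	struct ex_save_area {
		u64 r9, r10, r11, r12, r13;	/* EX_R9..EX_R13: 0-32 */
		union {
			u64 dar;		/* EX_DAR  = 40 */
			u64 lr;			/* EX_LR   = EX_DAR */
		};
		u32 dsisr;			/* EX_DSISR = 48 */
		u32 ccr;			/* EX_CCR   = 52 */
		u64 r3;				/* EX_R3    = 56 */
		u64 cfar;			/* EX_CFAR  = 64 */
		u64 ppr;			/* EX_PPR   = 72 */
		u64 ctr;			/* EX_CTR   = 80 */
	};	/* sizeof = 88 = EX_SIZE * 8 with EX_SIZE = 11 */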
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index f3c6272f430f..90f4b771df02 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -46,12 +46,19 @@ #define EX_DSISR 48 #define EX_CCR 52 #define EX_R3 56 -#define EX_LR 64 -#define EX_CFAR 72 -#define EX_PPR 80 /* SMT thread status register (priority) */ -#define EX_CTR 88 +#define EX_CFAR 64 +#define EX_PPR 72 +#define EX_CTR 80 -#define EX_SIZE 12 /* size in u64 units */ +#define EX_SIZE 11 /* size in u64 units */ + +/* + * EX_LR is only used in EXSLB and where it does not overlap with EX_DAR + * EX_CCR similarly with DSISR, but being 4 byte registers there is a hole + * in the save area so it's not necessary to overlap them. Could be used + * for future savings though if another 4 byte register was to be saved. + */ +#define EX_LR EX_DAR #ifdef CONFIG_RELOCATABLE #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ From 635942ae53cc5d4049c2c4a5bd2ad819e3a47ee5 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 21 May 2017 23:15:49 +1000 Subject: [PATCH 083/158] powerpc/64s/paca: EX_R3 can be merged with EX_DAR EX_R3 is used only for a small section of the bad stack handler. Merge it with EX_DAR. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 90f4b771df02..f42a49a274a6 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -45,12 +45,11 @@ #define EX_DAR 40 #define EX_DSISR 48 #define EX_CCR 52 -#define EX_R3 56 -#define EX_CFAR 64 -#define EX_PPR 72 -#define EX_CTR 80 +#define EX_CFAR 56 +#define EX_PPR 64 +#define EX_CTR 72 -#define EX_SIZE 11 /* size in u64 units */ +#define EX_SIZE 10 /* size in u64 units */ /* * EX_LR is only used in EXSLB and where it does not overlap with EX_DAR @@ -60,6 +59,13 @@ */ #define EX_LR EX_DAR +/* + * EX_R3 is only used by the bad_stack handler. bad_stack reloads and + * saves DAR from SPRN_DAR, and EX_DAR is not used. So EX_R3 can overlap + * with EX_DAR. 
+ */ +#define EX_R3 EX_DAR + #ifdef CONFIG_RELOCATABLE #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ mfspr r11,SPRN_##h##SRR0; /* save SRR0 */ \ From 8568f1e02624e6ce34644369e6ca137d10e32a88 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 21 May 2017 23:15:50 +1000 Subject: [PATCH 084/158] powerpc/64s/paca: EX_CTR is not used with RELOCATABLE=n, remove it Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index f42a49a274a6..9a318973af05 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -47,9 +47,12 @@ #define EX_CCR 52 #define EX_CFAR 56 #define EX_PPR 64 +#if defined(CONFIG_RELOCATABLE) #define EX_CTR 72 - #define EX_SIZE 10 /* size in u64 units */ +#else +#define EX_SIZE 9 /* size in u64 units */ +#endif /* * EX_LR is only used in EXSLB and where it does not overlap with EX_DAR From b102063b47d59752e113c5588422279c75eadd4d Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 19 Jun 2017 21:47:11 +1000 Subject: [PATCH 085/158] powerpc/64s: Use BRANCH_TO_COMMON() for slb_miss_realmode All the callers of slb_miss_realmode currently open code the #ifndef CONFIG_RELOCATABLE check and the branch via CTR in the RELOCATABLE case. We have a macro to do this, BRANCH_TO_COMMON(), so use it. Signed-off-by: Michael Ellerman Reviewed-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 42 +++------------------------- 1 file changed, 4 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index ed8628c6f0f4..7bdfddbe0328 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -514,18 +514,7 @@ EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) mfspr r3,SPRN_DAR mfspr r11,SPRN_SRR1 crset 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - /* - * We can't just use a direct branch to slb_miss_realmode - * because the distance from here to there depends on where - * the kernel ends up being put. - */ - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_realmode) EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) @@ -536,18 +525,7 @@ EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) mfspr r3,SPRN_DAR mfspr r11,SPRN_SRR1 crset 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - /* - * We can't just use a direct branch to slb_miss_realmode - * because the distance from here to there depends on where - * the kernel ends up being put. 
- */ - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_realmode) EXC_VIRT_END(data_access_slb, 0x4380, 0x80) TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) @@ -580,13 +558,7 @@ EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ mfspr r11,SPRN_SRR1 crclr 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_realmode) EXC_REAL_END(instruction_access_slb, 0x480, 0x80) EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) @@ -597,13 +569,7 @@ EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ mfspr r11,SPRN_SRR1 crclr 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_realmode) EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) TRAMP_KVM(PACA_EXSLB, 0x480) From 442b6e8e03e44dc3a33142ab3b8d0b3395053274 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 19 Jun 2017 21:52:03 +1000 Subject: [PATCH 086/158] powerpc/64s: Rename slb_miss_realmode() to slb_miss_common() slb_miss_realmode() doesn't always run in real mode, despite what the name implies. So rename it to avoid confusing people. Signed-off-by: Michael Ellerman Reviewed-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7bdfddbe0328..6ad755e0cb29 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -514,7 +514,7 @@ EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) mfspr r3,SPRN_DAR mfspr r11,SPRN_SRR1 crset 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_realmode) + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) @@ -525,7 +525,7 @@ EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) mfspr r3,SPRN_DAR mfspr r11,SPRN_SRR1 crset 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_realmode) + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_VIRT_END(data_access_slb, 0x4380, 0x80) TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) @@ -558,7 +558,7 @@ EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ mfspr r11,SPRN_SRR1 crclr 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_realmode) + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_REAL_END(instruction_access_slb, 0x480, 0x80) EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) @@ -569,13 +569,16 @@ EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ mfspr r11,SPRN_SRR1 crclr 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_realmode) + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) TRAMP_KVM(PACA_EXSLB, 0x480) -/* This handler is used by both 0x380 and 0x480 slb miss interrupts */ -EXC_COMMON_BEGIN(slb_miss_realmode) +/* + * This handler is used by the 0x380 and 0x480 SLB miss interrupts, as well as
+ */ +EXC_COMMON_BEGIN(slb_miss_common) /* * r13 points to the PACA, r9 contains the saved CR, * r12 contains the saved r3, From fd88b945c18866962645d09dfc401e91e13b1909 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 19 Jun 2017 21:57:33 +1000 Subject: [PATCH 087/158] powerpc/64s: Rename slb_allocate_realmode() to slb_allocate() As for slb_miss_realmode(), rename slb_allocate_realmode() to avoid confusion over whether it runs in real or virtual mode - it runs in both. Signed-off-by: Michael Ellerman Reviewed-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 2 +- arch/powerpc/mm/slb.c | 10 +--------- arch/powerpc/mm/slb_low.S | 6 +++--- 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 6ad755e0cb29..07b79c2c70f8 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -605,7 +605,7 @@ EXC_COMMON_BEGIN(slb_miss_common) crset 4*cr0+eq #ifdef CONFIG_PPC_STD_MMU_64 BEGIN_MMU_FTR_SECTION - bl slb_allocate_realmode + bl slb_allocate END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) #endif diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 654a0d7ba0e7..13cfe413b40d 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -33,15 +33,7 @@ enum slb_index { KSTACK_INDEX = 2, /* Kernel stack map */ }; -extern void slb_allocate_realmode(unsigned long ea); - -static void slb_allocate(unsigned long ea) -{ - /* Currently, we do real mode for all SLBs including user, but - * that will change if we bring back dynamic VSIDs - */ - slb_allocate_realmode(ea); -} +extern void slb_allocate(unsigned long ea); #define slb_esid_mask(ssize) \ (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 9869b44a04dc..bde378559d01 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -65,7 +65,7 @@ MMU_FTR_SECTION_ELSE \ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA) -/* void slb_allocate_realmode(unsigned long ea); +/* void slb_allocate(unsigned long ea); * * Create an SLB entry for the given EA (user or kernel). * r3 = faulting address, r13 = PACA @@ -73,7 +73,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA) * r3 is preserved. * No other registers are examined or changed. */ -_GLOBAL(slb_allocate_realmode) +_GLOBAL(slb_allocate) /* * check for bad kernel/user address * (ea & ~REGION_MASK) >= PGTABLE_RANGE @@ -309,7 +309,7 @@ slb_compare_rr_to_size: b 7b -_ASM_NOKPROBE_SYMBOL(slb_allocate_realmode) +_ASM_NOKPROBE_SYMBOL(slb_allocate) _ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_linear) _ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_io) _ASM_NOKPROBE_SYMBOL(slb_compare_rr_to_size) From 6b847d795cf4ab3e574f4fcf7193fe245908a195 Mon Sep 17 00:00:00 2001 From: Santosh Sivaraj Date: Tue, 20 Jun 2017 13:14:47 +0530 Subject: [PATCH 088/158] powerpc/time: Fix tracing in time.c Since trace_clock is in a different file and already marked with notrace, enable tracing in time.c by removing it from the disabled list in Makefile. Also annotate clocksource read functions and sched_clock with notrace. Testing: Timer and ftrace selftests run with different trace clocks. Acked-by: Naveen N. 
Rao Signed-off-by: Santosh Sivaraj Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/Makefile | 2 -- arch/powerpc/kernel/time.c | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index e132902e1f14..0845eebc5af3 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -25,8 +25,6 @@ CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) -# timers used by tracing -CFLAGS_REMOVE_time.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) endif obj-y := cputable.o ptrace.o syscalls.o \ diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 60714b8c9a2f..476a527b220d 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -675,7 +675,7 @@ EXPORT_SYMBOL_GPL(tb_to_ns); * the high 64 bits of a * b, i.e. (a * b) >> 64, where a and b * are 64-bit unsigned numbers. */ -unsigned long long sched_clock(void) +notrace unsigned long long sched_clock(void) { if (__USE_RTC()) return get_rtc(); @@ -831,12 +831,12 @@ void read_persistent_clock(struct timespec *ts) } /* clocksource code */ -static u64 rtc_read(struct clocksource *cs) +static notrace u64 rtc_read(struct clocksource *cs) { return (u64)get_rtc(); } -static u64 timebase_read(struct clocksource *cs) +static notrace u64 timebase_read(struct clocksource *cs) { return (u64)get_tb(); } From d4cfb11387ee29ba4626546c676fd25c7abbbbb2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 27 May 2017 18:04:52 +1000 Subject: [PATCH 089/158] powerpc: Convert VDSO update function to use new update_vsyscall interface This converts the powerpc VDSO time update function to use the new interface introduced in commit 576094b7f0aa ("time: Introduce new GENERIC_TIME_VSYSCALL", 2012-09-11). Where the old interface gave us the time as of the last update in seconds and whole nanoseconds, with the new interface we get the nanoseconds part effectively in a binary fixed-point format with tk->tkr_mono.shift bits to the right of the binary point. With the old interface, the fractional nanoseconds got truncated, meaning that the value returned by the VDSO clock_gettime function would have about 1ns of jitter in it compared to the value computed by the generic timekeeping code in the kernel. The powerpc VDSO time functions (clock_gettime and gettimeofday) already work in units of 2^-32 seconds, or 0.23283 ns, because that makes it simple to split the result into seconds and fractional seconds, and represent the fractional seconds in either microseconds or nanoseconds. This is good enough accuracy for now, so this patch avoids changing how the VDSO works or the interface in the VDSO data page. This patch converts the powerpc update_vsyscall_old to be called update_vsyscall and use the new interface. We convert the fractional second to units of 2^-32 seconds without truncating to whole nanoseconds. (There is still a conversion to whole nanoseconds for any legacy users of the vdso_data/systemcfg stamp_xtime field.) In addition, this improves the accuracy of the computation of tb_to_xs for those systems with high-frequency timebase clocks (>= 268.5 MHz) by doing the right shift in two parts, one before the multiplication and one after, rather than doing the right shift before the multiplication. 
(We can't do all of the right shift after the multiplication unless we use 128-bit arithmetic.) Signed-off-by: Paul Mackerras Acked-by: John Stultz Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 2 +- arch/powerpc/kernel/time.c | 68 +++++++++++++++++++++++++++++--------- 2 files changed, 53 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b50d46d214f6..7ee79e0ac88e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -155,7 +155,7 @@ config PPC select GENERIC_SMP_IDLE_THREAD select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER - select GENERIC_TIME_VSYSCALL_OLD + select GENERIC_TIME_VSYSCALL select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KGDB diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 476a527b220d..0cc0dad905d5 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -841,28 +841,66 @@ static notrace u64 timebase_read(struct clocksource *cs) return (u64)get_tb(); } -void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm, - struct clocksource *clock, u32 mult, u64 cycle_last) + +void update_vsyscall(struct timekeeper *tk) { + struct timespec xt; + struct clocksource *clock = tk->tkr_mono.clock; + u32 mult = tk->tkr_mono.mult; + u32 shift = tk->tkr_mono.shift; + u64 cycle_last = tk->tkr_mono.cycle_last; u64 new_tb_to_xs, new_stamp_xsec; - u32 frac_sec; + u64 frac_sec; if (clock != &clocksource_timebase) return; + xt.tv_sec = tk->xtime_sec; + xt.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); + /* Make userspace gettimeofday spin until we're done. */ ++vdso_data->tb_update_count; smp_mb(); - /* 19342813113834067 ~= 2^(20+64) / 1e9 */ - new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift); - new_stamp_xsec = (u64) wall_time->tv_nsec * XSEC_PER_SEC; - do_div(new_stamp_xsec, 1000000000); - new_stamp_xsec += (u64) wall_time->tv_sec * XSEC_PER_SEC; + /* + * This computes ((2^20 / 1e9) * mult) >> shift as a + * 0.64 fixed-point fraction. + * The computation in the else clause below won't overflow + * (as long as the timebase frequency is >= 1.049 MHz) + * but loses precision because we lose the low bits of the constant + * in the shift. Note that 19342813113834067 ~= 2^(20+64) / 1e9. + * For a shift of 24 the error is about 0.5e-9, or about 0.5ns + * over a second. (Shift values are usually 22, 23 or 24.) + * For high frequency clocks such as the 512MHz timebase clock + * on POWER[6789], the mult value is small (e.g. 32768000) + * and so we can shift the constant by 16 initially + * (295147905179 ~= 2^(20+64-16) / 1e9) and then do the + * remaining shifts after the multiplication, which gives a + * more accurate result (e.g. with mult = 32768000, shift = 24, + * the error is only about 1.2e-12, or 0.7ns over 10 minutes). + */ + if (mult <= 62500000 && clock->shift >= 16) + new_tb_to_xs = ((u64) mult * 295147905179ULL) >> (clock->shift - 16); + else + new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift); - BUG_ON(wall_time->tv_nsec >= NSEC_PER_SEC); - /* this is tv_nsec / 1e9 as a 0.32 fraction */ - frac_sec = ((u64) wall_time->tv_nsec * 18446744073ULL) >> 32; + /* + * Compute the fractional second in units of 2^-32 seconds. + * The fractional second is tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift + * in nanoseconds, so multiplying that by 2^32 / 1e9 gives + * it in units of 2^-32 seconds. 
+ * We assume shift <= 32 because clocks_calc_mult_shift() + * generates shift values in the range 0 - 32. + */ + frac_sec = tk->tkr_mono.xtime_nsec << (32 - shift); + do_div(frac_sec, NSEC_PER_SEC); + + /* + * Work out new stamp_xsec value for any legacy users of systemcfg. + * stamp_xsec is in units of 2^-20 seconds. + */ + new_stamp_xsec = frac_sec >> 12; + new_stamp_xsec += tk->xtime_sec * XSEC_PER_SEC; /* * tb_update_count is used to allow the userspace gettimeofday code @@ -872,15 +910,13 @@ void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm, * the two values of tb_update_count match and are even then the * tb_to_xs and stamp_xsec values are consistent. If not, then it * loops back and reads them again until this criteria is met. - * We expect the caller to have done the first increment of - * vdso_data->tb_update_count already. */ vdso_data->tb_orig_stamp = cycle_last; vdso_data->stamp_xsec = new_stamp_xsec; vdso_data->tb_to_xs = new_tb_to_xs; - vdso_data->wtom_clock_sec = wtm->tv_sec; - vdso_data->wtom_clock_nsec = wtm->tv_nsec; - vdso_data->stamp_xtime = *wall_time; + vdso_data->wtom_clock_sec = tk->wall_to_monotonic.tv_sec; + vdso_data->wtom_clock_nsec = tk->wall_to_monotonic.tv_nsec; + vdso_data->stamp_xtime = xt; vdso_data->stamp_sec_fraction = frac_sec; smp_wmb(); ++(vdso_data->tb_update_count); From 0428491cba9277db42d66eb245d74255bd3dbfe7 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 11 Apr 2017 15:23:25 +1000 Subject: [PATCH 090/158] powerpc/mm: Trace tlbie(l) instructions Add a trace point for tlbie(l) (Translation Lookaside Buffer Invalidate Entry (Local)) instructions. The tlbie instruction has changed over the years, so not all versions accept the same operands. Use the ISA v3 field operands because they are the most verbose; we may change them in future.
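The pattern applied throughout the diff below pairs each inline tlbie(l) with a trace call taking the same operands. A condensed sketch from the tlb-radix.c hunks (rb, rs, ric, prs and r are the local variables used there):

	/* Illustration only: PPC_TLBIE_5() and trace_tlbie() are given
	 * the same ISA v3.0 field operands, so the trace records what
	 * was actually executed; local=1 would denote a tlbiel.
	 */
	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs)
		     : "memory");
	trace_tlbie(0 /* lpid */, 0 /* local */, rb, rs, ric, prs, r);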
Example output: qemu-system-ppc-5371 [016] 1412.369519: tlbie: tlbie with lpid 0, local 1, rb=67bd8900174c11c1, rs=0, ric=0 prs=0 r=0 Signed-off-by: Balbir Singh [mpe: Add some missing trace_tlbie()s, reword change log] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/trace.h | 33 +++++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv_rm_mmu.c | 11 ++++++++-- arch/powerpc/mm/hash_native_64.c | 3 +++ arch/powerpc/mm/hash_utils_64.c | 2 ++ arch/powerpc/mm/pgtable-radix.c | 5 +++++ arch/powerpc/mm/pgtable_64.c | 8 +++++-- arch/powerpc/mm/tlb-radix.c | 9 ++++++++ 7 files changed, 67 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h index c05cef6ee06c..18f168aebae3 100644 --- a/arch/powerpc/include/asm/trace.h +++ b/arch/powerpc/include/asm/trace.h @@ -168,6 +168,39 @@ TRACE_EVENT(hash_fault, __entry->addr, __entry->access, __entry->trap) ); + +TRACE_EVENT(tlbie, + + TP_PROTO(unsigned long lpid, unsigned long local, unsigned long rb, + unsigned long rs, unsigned long ric, unsigned long prs, + unsigned long r), + TP_ARGS(lpid, local, rb, rs, ric, prs, r), + TP_STRUCT__entry( + __field(unsigned long, lpid) + __field(unsigned long, local) + __field(unsigned long, rb) + __field(unsigned long, rs) + __field(unsigned long, ric) + __field(unsigned long, prs) + __field(unsigned long, r) + ), + + TP_fast_assign( + __entry->lpid = lpid; + __entry->local = local; + __entry->rb = rb; + __entry->rs = rs; + __entry->ric = ric; + __entry->prs = prs; + __entry->r = r; + ), + + TP_printk("lpid=%ld, local=%ld, rb=0x%lx, rs=0x%lx, ric=0x%lx, " + "prs=0x%lx, r=0x%lx", __entry->lpid, __entry->local, + __entry->rb, __entry->rs, __entry->ric, __entry->prs, + __entry->r) +); + #endif /* _TRACE_POWERPC_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index ce6f2121fffe..584c74c8119f 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -443,17 +444,23 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, cpu_relax(); if (need_sync) asm volatile("ptesync" : : : "memory"); - for (i = 0; i < npages; ++i) + for (i = 0; i < npages; ++i) { asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : : "r" (rbvalues[i]), "r" (kvm->arch.lpid)); + trace_tlbie(kvm->arch.lpid, 0, rbvalues[i], + kvm->arch.lpid, 0, 0, 0); + } asm volatile("eieio; tlbsync; ptesync" : : : "memory"); kvm->arch.tlbie_lock = 0; } else { if (need_sync) asm volatile("ptesync" : : : "memory"); - for (i = 0; i < npages; ++i) + for (i = 0; i < npages; ++i) { asm volatile(PPC_TLBIEL(%0,%1,0,0,0) : : "r" (rbvalues[i]), "r" (0)); + trace_tlbie(kvm->arch.lpid, 1, rbvalues[i], + 0, 0, 0, 0); + } asm volatile("ptesync" : : : "memory"); } } diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 65bb8f33b399..bdaac28193f7 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -98,6 +99,7 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) : "memory"); break; } + trace_tlbie(0, 0, va, 0, 0, 0, 0); } static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) @@ -147,6 +149,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) : "memory"); break; } + trace_tlbie(0, 1, va, 0, 0, 0, 0); } diff --git 
a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index f2095ce9d4b0..7a20669c19e7 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -810,6 +810,8 @@ static void update_hid_for_hash(void) asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(0), "i"(0), "i"(2), "r"(0) : "memory"); asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory"); + trace_tlbie(0, 0, rb, 0, 2, 0, 0); + /* * now switch the HID */ diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 6c062f92b9e4..419199d68928 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -198,6 +199,7 @@ static void __init radix_init_pgtable(void) asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1); } static void __init radix_init_partition_table(void) @@ -324,6 +326,9 @@ static void update_hid_for_radix(void) asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(1), "i"(1), "i"(2), "r"(0) : "memory"); asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory"); + trace_tlbie(0, 0, rb, 0, 2, 0, 1); + trace_tlbie(0, 0, rb, 0, 2, 1, 1); + /* * now switch the HID */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 8d2d6742a465..1feb36e58a45 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -477,12 +478,15 @@ void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, * use of this partition ID was, not the new use. 
*/ asm volatile("ptesync" : : : "memory"); - if (old & PATB_HR) + if (old & PATB_HR) { asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - else + trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1); + } else { asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); + } asm volatile("eieio; tlbsync; ptesync" : : : "memory"); } EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 02e71402fdd3..744e0164ecf5 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -16,6 +16,7 @@ #include #include +#include #define RIC_FLUSH_TLB 0 @@ -35,6 +36,7 @@ static inline void __tlbiel_pid(unsigned long pid, int set, asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 1, rb, rs, ric, prs, r); } /* @@ -87,6 +89,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); asm volatile("eieio; tlbsync; ptesync": : :"memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); } static inline void _tlbiel_va(unsigned long va, unsigned long pid, @@ -104,6 +107,7 @@ static inline void _tlbiel_va(unsigned long va, unsigned long pid, asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); asm volatile("ptesync": : :"memory"); + trace_tlbie(0, 1, rb, rs, ric, prs, r); } static inline void _tlbie_va(unsigned long va, unsigned long pid, @@ -121,6 +125,7 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid, asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); asm volatile("eieio; tlbsync; ptesync": : :"memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); } /* @@ -377,6 +382,7 @@ void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa, asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); asm volatile("eieio; tlbsync; ptesync": : :"memory"); + trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } EXPORT_SYMBOL(radix__flush_tlb_lpid_va); @@ -394,6 +400,7 @@ void radix__flush_tlb_lpid(unsigned long lpid) asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); asm volatile("eieio; tlbsync; ptesync": : :"memory"); + trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } EXPORT_SYMBOL(radix__flush_tlb_lpid); @@ -420,12 +427,14 @@ void radix__flush_tlb_all(void) */ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); /* * now flush host entires by passing PRS = 0 and LPID == 0 */ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory"); asm volatile("eieio; tlbsync; ptesync": : :"memory"); + trace_tlbie(0, 0, rb, 0, ric, prs, r); } void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm, From 64ebb9a208c6e66316329a6d9101815d1ee06fa9 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Thu, 15 Jun 2017 11:53:16 +1000 Subject: [PATCH 091/158] powerpc: Fix /proc/cpuinfo revision for POWER9 DD2 The P9 PVR bits 12-15 don't indicate a revision but instead different chip configurations. 
From BookIV we have:

    Bits    Configuration
     0 :    Scale out 12 cores
     1 :    Scale out 24 cores
     2 :    Scale up 12 cores
     3 :    Scale up 24 cores

DD1 doesn't use this but DD2 does. Linux will mostly use the "Scale out 24 core" configuration (ie. SMT4 not SMT8) which results in a PVR of 0x004e1200. The revision in /proc/cpuinfo is hence reported incorrectly as "18.0". This patch fixes this to mask off only the relevant bits for the major revision (ie. bits 8-11) for POWER9. Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 71dcda91755d..096dee58b9c1 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -335,6 +335,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) maj = ((pvr >> 8) & 0xFF) - 1; min = pvr & 0xFF; break; + case 0x004e: /* POWER9 bits 12-15 give chip type */ + maj = (pvr >> 8) & 0x0F; + min = pvr & 0xFF; + break; default: maj = (pvr >> 8) & 0xFF; min = pvr & 0xFF; From aa9a95163638bd9acb3e1f61f48cd5a000e79f03 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Mon, 8 May 2017 16:23:31 +1000 Subject: [PATCH 092/158] powerpc: Fix asm offsets to point to actual FP and VMX regs The asm code assumes the FP regs are at the start of fp_state. While this is true now, it may not always be the case and there is nothing enforcing it. This fixes the asm-offsets to point to the actual FP registers inside the fp_state. Similarly for VMX. Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/asm-offsets.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 9624851ca276..a7b5af32b89e 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -100,12 +100,12 @@ int main(void) OFFSET(THREAD_NORMSAVES, thread_struct, normsave[0]); #endif OFFSET(THREAD_FPEXC_MODE, thread_struct, fpexc_mode); - OFFSET(THREAD_FPSTATE, thread_struct, fp_state); + OFFSET(THREAD_FPSTATE, thread_struct, fp_state.fpr); OFFSET(THREAD_FPSAVEAREA, thread_struct, fp_save_area); OFFSET(FPSTATE_FPSCR, thread_fp_state, fpscr); OFFSET(THREAD_LOAD_FP, thread_struct, load_fp); #ifdef CONFIG_ALTIVEC - OFFSET(THREAD_VRSTATE, thread_struct, vr_state); + OFFSET(THREAD_VRSTATE, thread_struct, vr_state.vr); OFFSET(THREAD_VRSAVEAREA, thread_struct, vr_save_area); OFFSET(THREAD_VRSAVE, thread_struct, vrsave); OFFSET(THREAD_USED_VR, thread_struct, used_vr); @@ -145,9 +145,9 @@ int main(void) OFFSET(THREAD_TM_PPR, thread_struct, tm_ppr); OFFSET(THREAD_TM_DSCR, thread_struct, tm_dscr); OFFSET(PT_CKPT_REGS, thread_struct, ckpt_regs); - OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state); + OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state.vr); OFFSET(THREAD_CKVRSAVE, thread_struct, ckvrsave); - OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state); + OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state.fpr); /* Local pt_regs on stack for Transactional Memory funcs. */ DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16); From 2bafb7ffa3e0908ad2e69b94c436a0326ef2e7e1 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Mon, 8 May 2017 16:18:31 +1000 Subject: [PATCH 093/158] powerpc/tm: Fix comment Update to real function name.
Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/tm.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 3a2d04134da9..c4ba37822ba0 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -313,8 +313,8 @@ dont_backup_fp: blr - /* void tm_recheckpoint(struct thread_struct *thread, - * unsigned long orig_msr) + /* void __tm_recheckpoint(struct thread_struct *thread, + * unsigned long orig_msr) * - Restore the checkpointed register state saved by tm_reclaim * when we switch_to a process. * From 31bbd45af313c3b1cdaa98e5a2de65194cf7d948 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Wed, 14 Jun 2017 14:19:58 +1000 Subject: [PATCH 094/158] powerpc/powernv/pci: Reduce spam when dumping PEST Dumping the PE State Tables (PEST) can be highly verbose if a number of PEs are affected, especially in the case where the whole PHB is frozen and 512 lines get printed. Check for duplicates when dumping the PEST to reduce useless output. For example: PE[0f8] A/B: 9700002600000000 80000080d00000f8 PE[0f9] A/B: 8000000000000000 0000000000000000 PE[..0fe] A/B: as above PE[0ff] A/B: 8440002b00000000 0000000000000000 instead of: PE[0f8] A/B: 9700002600000000 80000080d00000f8 PE[0f9] A/B: 8000000000000000 0000000000000000 PE[0fa] A/B: 8000000000000000 0000000000000000 PE[0fb] A/B: 8000000000000000 0000000000000000 PE[0fc] A/B: 8000000000000000 0000000000000000 PE[0fd] A/B: 8000000000000000 0000000000000000 PE[0fe] A/B: 8000000000000000 0000000000000000 PE[0ff] A/B: 8440002b00000000 0000000000000000 and you can imagine how much worse it can get for 512 PEs. Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci.c | 51 +++++++++++++++++----------- arch/powerpc/platforms/powernv/pci.h | 3 ++ 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 935ccb249a8a..40071ad0bc42 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -227,11 +227,39 @@ void pnv_teardown_msi_irqs(struct pci_dev *pdev) } #endif /* CONFIG_PCI_MSI */ +/* Nicely print the contents of the PE State Tables (PEST). 
*/ +static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size) +{ + __be64 prevA = ULONG_MAX, prevB = ULONG_MAX; + bool dup = false; + int i; + + for (i = 0; i < pest_size; i++) { + __be64 peA = be64_to_cpu(pestA[i]); + __be64 peB = be64_to_cpu(pestB[i]); + + if (peA != prevA || peB != prevB) { + if (dup) { + pr_info("PE[..%03x] A/B: as above\n", i-1); + dup = false; + } + prevA = peA; + prevB = peB; + if (peA & PNV_IODA_STOPPED_STATE || + peB & PNV_IODA_STOPPED_STATE) + pr_info("PE[%03x] A/B: %016llx %016llx\n", + i, peA, peB); + } else if (!dup && (peA & PNV_IODA_STOPPED_STATE || + peB & PNV_IODA_STOPPED_STATE)) { + dup = true; + } + } +} + static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose, struct OpalIoPhbErrorCommon *common) { struct OpalIoP7IOCPhbErrorData *data; - int i; data = (struct OpalIoP7IOCPhbErrorData *)common; pr_info("P7IOC PHB#%x Diag-data (Version: %d)\n", @@ -308,22 +336,13 @@ static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose, be64_to_cpu(data->dma1ErrorLog0), be64_to_cpu(data->dma1ErrorLog1)); - for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) { - if ((be64_to_cpu(data->pestA[i]) >> 63) == 0 && - (be64_to_cpu(data->pestB[i]) >> 63) == 0) - continue; - - pr_info("PE[%3d] A/B: %016llx %016llx\n", - i, be64_to_cpu(data->pestA[i]), - be64_to_cpu(data->pestB[i])); - } + pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_P7IOC_NUM_PEST_REGS); } static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose, struct OpalIoPhbErrorCommon *common) { struct OpalIoPhb3ErrorData *data; - int i; data = (struct OpalIoPhb3ErrorData*)common; pr_info("PHB3 PHB#%x Diag-data (Version: %d)\n", @@ -404,15 +423,7 @@ static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose, be64_to_cpu(data->dma1ErrorLog0), be64_to_cpu(data->dma1ErrorLog1)); - for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) { - if ((be64_to_cpu(data->pestA[i]) >> 63) == 0 && - (be64_to_cpu(data->pestB[i]) >> 63) == 0) - continue; - - pr_info("PE[%3d] A/B: %016llx %016llx\n", - i, be64_to_cpu(data->pestA[i]), - be64_to_cpu(data->pestB[i])); - } + pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_PHB3_NUM_PEST_REGS); } void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 18c8a2fa03b8..6abc77dd9261 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -33,6 +33,9 @@ enum pnv_phb_model { #define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */ #define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */ +/* Indicates operations are frozen for a PE: MMIO in PESTA & DMA in PESTB. */ +#define PNV_IODA_STOPPED_STATE 0x8000000000000000 + /* Data associated with a PE, including IOMMU tracking etc.. */ struct pnv_phb; struct pnv_ioda_pe { From 5cb1f8fdddb7475f38ea9fba48da17c002eec90b Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Wed, 14 Jun 2017 14:19:59 +1000 Subject: [PATCH 095/158] powerpc/powernv/pci: Dynamically allocate PHB diag data Diagnostic data for PHBs is currently collected into a fixed-size buffer. This is simple, but either wastes memory (though only a few kilobytes) or in the case of PHB4 isn't enough to fit the whole data blob. For machines that don't describe the diagnostic data size in the device tree, use the hardcoded buffer size as before. For those that do, only allocate exactly what's needed.
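Condensed from the pnv_pci_init_ioda_phb() hunk in the diff below, the sizing rule amounts to:

	/* Condensed sketch of the allocation added below: take the size
	 * from the device tree when present, otherwise fall back to the
	 * old fixed size.
	 */
	prop32 = of_get_property(np, "ibm,phb-diag-data-size", NULL);
	if (prop32)
		phb->diag_data_size = be32_to_cpup(prop32);
	else
		phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE;
	phb->diag_data = memblock_virt_alloc(phb->diag_data_size, 0);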
In the special case of P7IOC (which has two types of diag data), the larger should be specified in the device tree. Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/eeh-powernv.c | 16 +++++++++++----- arch/powerpc/platforms/powernv/pci-ioda.c | 15 ++++++++++++--- arch/powerpc/platforms/powernv/pci.c | 6 +++--- arch/powerpc/platforms/powernv/pci.h | 10 +++------- 4 files changed, 29 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index d12ea7b9fd47..3f48f6df1cf3 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -48,6 +48,7 @@ static int pnv_eeh_init(void) { struct pci_controller *hose; struct pnv_phb *phb; + int max_diag_size = PNV_PCI_DIAG_BUF_SIZE; if (!firmware_has_feature(FW_FEATURE_OPAL)) { pr_warn("%s: OPAL is required !\n", @@ -69,6 +70,9 @@ static int pnv_eeh_init(void) if (phb->model == PNV_PHB_MODEL_P7IOC) eeh_add_flag(EEH_ENABLE_IO_FOR_LOG); + if (phb->diag_data_size > max_diag_size) + max_diag_size = phb->diag_data_size; + /* * PE#0 should be regarded as valid by EEH core * if it's not the reserved one. Currently, we @@ -82,6 +86,8 @@ static int pnv_eeh_init(void) break; } + eeh_set_pe_aux_size(max_diag_size); + return 0; } @@ -540,7 +546,7 @@ static void pnv_eeh_get_phb_diag(struct eeh_pe *pe) s64 rc; rc = opal_pci_get_phb_diag_data2(phb->opal_id, pe->data, - PNV_PCI_DIAG_BUF_SIZE); + phb->diag_data_size); if (rc != OPAL_SUCCESS) pr_warn("%s: Failure %lld getting PHB#%x diag-data\n", __func__, rc, pe->phb->global_number); @@ -1314,7 +1320,8 @@ static void pnv_eeh_dump_hub_diag_common(struct OpalIoP7IOCErrorData *data) static void pnv_eeh_get_and_dump_hub_diag(struct pci_controller *hose) { struct pnv_phb *phb = hose->private_data; - struct OpalIoP7IOCErrorData *data = &phb->diag.hub_diag; + struct OpalIoP7IOCErrorData *data = + (struct OpalIoP7IOCErrorData*)phb->diag_data; long rc; rc = opal_pci_get_hub_diag_data(phb->hub_id, data, sizeof(*data)); @@ -1549,10 +1556,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) /* Dump PHB diag-data */ rc = opal_pci_get_phb_diag_data2(phb->opal_id, - phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE); + phb->diag_data, phb->diag_data_size); if (rc == OPAL_SUCCESS) pnv_pci_dump_phb_diag_data(hose, - phb->diag.blob); + phb->diag_data); /* Try best to clear it */ opal_pci_eeh_freeze_clear(phb->opal_id, @@ -1795,7 +1802,6 @@ static int __init eeh_powernv_init(void) { int ret = -EINVAL; - eeh_set_pe_aux_size(PNV_PCI_DIAG_BUF_SIZE); ret = eeh_ops_register(&pnv_eeh_ops); if (!ret) pr_info("EEH: PowerNV platform initialized\n"); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 283caf1070c9..96d0156f48db 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3123,13 +3123,13 @@ static int pnv_pci_diag_data_set(void *data, u64 val) phb = hose->private_data; /* Retrieve the diag data from firmware */ - ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, - PNV_PCI_DIAG_BUF_SIZE); + ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data, + phb->diag_data_size); if (ret != OPAL_SUCCESS) return -EIO; /* Print the diag data to the kernel log */ - pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob); + pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data); return 0; } @@ -3725,6 +3725,15 @@ static void __init pnv_pci_init_ioda_phb(struct 
device_node *np, else phb->model = PNV_PHB_MODEL_UNKNOWN; + /* Initialize diagnostic data buffer */ + prop32 = of_get_property(np, "ibm,phb-diag-data-size", NULL); + if (prop32) + phb->diag_data_size = be32_to_cpup(prop32); + else + phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE; + + phb->diag_data = memblock_virt_alloc(phb->diag_data_size, 0); + /* Parse 32-bit and IO ranges (if any) */ pci_process_bridge_OF_ranges(hose, np, !hose->global_number); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 40071ad0bc42..209ad47a3383 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -456,8 +456,8 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) spin_lock_irqsave(&phb->lock, flags); /* Fetch PHB diag-data */ - rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, - PNV_PCI_DIAG_BUF_SIZE); + rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data, + phb->diag_data_size); has_diag = (rc == OPAL_SUCCESS); /* If PHB supports compound PE, to handle it */ @@ -485,7 +485,7 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) * with the normal errors generated when probing empty slots */ if (has_diag && ret) - pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob); + pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data); spin_unlock_irqrestore(&phb->lock, flags); } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 6abc77dd9261..f16bc403ec03 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -172,13 +172,9 @@ struct pnv_phb { unsigned int pe_rmap[0x10000]; } ioda; - /* PHB and hub status structure */ - union { - unsigned char blob[PNV_PCI_DIAG_BUF_SIZE]; - struct OpalIoP7IOCPhbErrorData p7ioc; - struct OpalIoPhb3ErrorData phb3; - struct OpalIoP7IOCErrorData hub_diag; - } diag; + /* PHB and hub diagnostics */ + unsigned int diag_data_size; + u8 *diag_data; /* Nvlink2 data */ struct npu { From a4b48ba9047dfdf99cf2316d21baac2791b3b14c Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Wed, 14 Jun 2017 14:20:00 +1000 Subject: [PATCH 096/158] powerpc/powernv/pci: Add support for PHB4 diagnostics As with P7IOC and PHB3, add kernel-side support for decoding and printing diagnostic data for PHB4. 
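Structurally the change is one more case in the ioType dispatch, plus the decoder it selects; a condensed sketch of the dispatch as it looks after this patch (the full PHB4 decoder follows in the diff):

	/* Sketch of pnv_pci_dump_phb_diag_data(): the ioType field in
	 * the common header selects the per-generation decoder.
	 */
	switch (be32_to_cpu(common->ioType)) {
	case OPAL_PHB_ERROR_DATA_TYPE_P7IOC:
		pnv_pci_dump_p7ioc_diag_data(hose, common);
		break;
	case OPAL_PHB_ERROR_DATA_TYPE_PHB3:
		pnv_pci_dump_phb3_diag_data(hose, common);
		break;
	case OPAL_PHB_ERROR_DATA_TYPE_PHB4:
		pnv_pci_dump_phb4_diag_data(hose, common);
		break;
	default:
		pr_warn("%s: Unrecognized ioType %d\n",
			__func__, be32_to_cpu(common->ioType));
	}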
Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal-api.h | 75 ++++++++++++++++++- arch/powerpc/platforms/powernv/pci.c | 105 +++++++++++++++++++++++++++ 2 files changed, 178 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index cb3e6242a78c..0b543f0f54f5 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -667,12 +667,14 @@ enum { enum { OPAL_PHB_ERROR_DATA_TYPE_P7IOC = 1, - OPAL_PHB_ERROR_DATA_TYPE_PHB3 = 2 + OPAL_PHB_ERROR_DATA_TYPE_PHB3 = 2, + OPAL_PHB_ERROR_DATA_TYPE_PHB4 = 3 }; enum { OPAL_P7IOC_NUM_PEST_REGS = 128, - OPAL_PHB3_NUM_PEST_REGS = 256 + OPAL_PHB3_NUM_PEST_REGS = 256, + OPAL_PHB4_NUM_PEST_REGS = 512 }; struct OpalIoPhbErrorCommon { @@ -802,6 +804,75 @@ struct OpalIoPhb3ErrorData { __be64 pestB[OPAL_PHB3_NUM_PEST_REGS]; }; +struct OpalIoPhb4ErrorData { + struct OpalIoPhbErrorCommon common; + + __be32 brdgCtl; + + /* PHB4 cfg regs */ + __be32 deviceStatus; + __be32 slotStatus; + __be32 linkStatus; + __be32 devCmdStatus; + __be32 devSecStatus; + + /* cfg AER regs */ + __be32 rootErrorStatus; + __be32 uncorrErrorStatus; + __be32 corrErrorStatus; + __be32 tlpHdr1; + __be32 tlpHdr2; + __be32 tlpHdr3; + __be32 tlpHdr4; + __be32 sourceId; + + /* PHB4 ETU Error Regs */ + __be64 nFir; /* 000 */ + __be64 nFirMask; /* 003 */ + __be64 nFirWOF; /* 008 */ + __be64 phbPlssr; /* 120 */ + __be64 phbCsr; /* 110 */ + __be64 lemFir; /* C00 */ + __be64 lemErrorMask; /* C18 */ + __be64 lemWOF; /* C40 */ + __be64 phbErrorStatus; /* C80 */ + __be64 phbFirstErrorStatus; /* C88 */ + __be64 phbErrorLog0; /* CC0 */ + __be64 phbErrorLog1; /* CC8 */ + __be64 phbTxeErrorStatus; /* D00 */ + __be64 phbTxeFirstErrorStatus; /* D08 */ + __be64 phbTxeErrorLog0; /* D40 */ + __be64 phbTxeErrorLog1; /* D48 */ + __be64 phbRxeArbErrorStatus; /* D80 */ + __be64 phbRxeArbFirstErrorStatus; /* D88 */ + __be64 phbRxeArbErrorLog0; /* DC0 */ + __be64 phbRxeArbErrorLog1; /* DC8 */ + __be64 phbRxeMrgErrorStatus; /* E00 */ + __be64 phbRxeMrgFirstErrorStatus; /* E08 */ + __be64 phbRxeMrgErrorLog0; /* E40 */ + __be64 phbRxeMrgErrorLog1; /* E48 */ + __be64 phbRxeTceErrorStatus; /* E80 */ + __be64 phbRxeTceFirstErrorStatus; /* E88 */ + __be64 phbRxeTceErrorLog0; /* EC0 */ + __be64 phbRxeTceErrorLog1; /* EC8 */ + + /* PHB4 REGB Error Regs */ + __be64 phbPblErrorStatus; /* 1900 */ + __be64 phbPblFirstErrorStatus; /* 1908 */ + __be64 phbPblErrorLog0; /* 1940 */ + __be64 phbPblErrorLog1; /* 1948 */ + __be64 phbPcieDlpErrorLog1; /* 1AA0 */ + __be64 phbPcieDlpErrorLog2; /* 1AA8 */ + __be64 phbPcieDlpErrorStatus; /* 1AB0 */ + __be64 phbRegbErrorStatus; /* 1C00 */ + __be64 phbRegbFirstErrorStatus; /* 1C08 */ + __be64 phbRegbErrorLog0; /* 1C40 */ + __be64 phbRegbErrorLog1; /* 1C48 */ + + __be64 pestA[OPAL_PHB4_NUM_PEST_REGS]; + __be64 pestB[OPAL_PHB4_NUM_PEST_REGS]; +}; + enum { OPAL_REINIT_CPUS_HILE_BE = (1 << 0), OPAL_REINIT_CPUS_HILE_LE = (1 << 1), diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 209ad47a3383..7905d179d036 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -426,6 +426,108 @@ static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose, pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_PHB3_NUM_PEST_REGS); } +static void pnv_pci_dump_phb4_diag_data(struct pci_controller *hose, + struct OpalIoPhbErrorCommon *common) +{ + struct OpalIoPhb4ErrorData *data; + + data = 
(struct OpalIoPhb4ErrorData*)common; + pr_info("PHB4 PHB#%d Diag-data (Version: %d)\n", + hose->global_number, be32_to_cpu(common->version)); + if (data->brdgCtl) + pr_info("brdgCtl: %08x\n", + be32_to_cpu(data->brdgCtl)); + if (data->deviceStatus || data->slotStatus || + data->linkStatus || data->devCmdStatus || + data->devSecStatus) + pr_info("RootSts: %08x %08x %08x %08x %08x\n", + be32_to_cpu(data->deviceStatus), + be32_to_cpu(data->slotStatus), + be32_to_cpu(data->linkStatus), + be32_to_cpu(data->devCmdStatus), + be32_to_cpu(data->devSecStatus)); + if (data->rootErrorStatus || data->uncorrErrorStatus || + data->corrErrorStatus) + pr_info("RootErrSts: %08x %08x %08x\n", + be32_to_cpu(data->rootErrorStatus), + be32_to_cpu(data->uncorrErrorStatus), + be32_to_cpu(data->corrErrorStatus)); + if (data->tlpHdr1 || data->tlpHdr2 || + data->tlpHdr3 || data->tlpHdr4) + pr_info("RootErrLog: %08x %08x %08x %08x\n", + be32_to_cpu(data->tlpHdr1), + be32_to_cpu(data->tlpHdr2), + be32_to_cpu(data->tlpHdr3), + be32_to_cpu(data->tlpHdr4)); + if (data->sourceId) + pr_info("sourceId: %08x\n", be32_to_cpu(data->sourceId)); + if (data->nFir) + pr_info("nFir: %016llx %016llx %016llx\n", + be64_to_cpu(data->nFir), + be64_to_cpu(data->nFirMask), + be64_to_cpu(data->nFirWOF)); + if (data->phbPlssr || data->phbCsr) + pr_info("PhbSts: %016llx %016llx\n", + be64_to_cpu(data->phbPlssr), + be64_to_cpu(data->phbCsr)); + if (data->lemFir) + pr_info("Lem: %016llx %016llx %016llx\n", + be64_to_cpu(data->lemFir), + be64_to_cpu(data->lemErrorMask), + be64_to_cpu(data->lemWOF)); + if (data->phbErrorStatus) + pr_info("PhbErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbErrorStatus), + be64_to_cpu(data->phbFirstErrorStatus), + be64_to_cpu(data->phbErrorLog0), + be64_to_cpu(data->phbErrorLog1)); + if (data->phbTxeErrorStatus) + pr_info("PhbTxeErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbTxeErrorStatus), + be64_to_cpu(data->phbTxeFirstErrorStatus), + be64_to_cpu(data->phbTxeErrorLog0), + be64_to_cpu(data->phbTxeErrorLog1)); + if (data->phbRxeArbErrorStatus) + pr_info("RxeArbErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbRxeArbErrorStatus), + be64_to_cpu(data->phbRxeArbFirstErrorStatus), + be64_to_cpu(data->phbRxeArbErrorLog0), + be64_to_cpu(data->phbRxeArbErrorLog1)); + if (data->phbRxeMrgErrorStatus) + pr_info("RxeMrgErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbRxeMrgErrorStatus), + be64_to_cpu(data->phbRxeMrgFirstErrorStatus), + be64_to_cpu(data->phbRxeMrgErrorLog0), + be64_to_cpu(data->phbRxeMrgErrorLog1)); + if (data->phbRxeTceErrorStatus) + pr_info("RxeTceErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbRxeTceErrorStatus), + be64_to_cpu(data->phbRxeTceFirstErrorStatus), + be64_to_cpu(data->phbRxeTceErrorLog0), + be64_to_cpu(data->phbRxeTceErrorLog1)); + + if (data->phbPblErrorStatus) + pr_info("PblErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbPblErrorStatus), + be64_to_cpu(data->phbPblFirstErrorStatus), + be64_to_cpu(data->phbPblErrorLog0), + be64_to_cpu(data->phbPblErrorLog1)); + if (data->phbPcieDlpErrorStatus) + pr_info("PcieDlp: %016llx %016llx %016llx\n", + be64_to_cpu(data->phbPcieDlpErrorLog1), + be64_to_cpu(data->phbPcieDlpErrorLog2), + be64_to_cpu(data->phbPcieDlpErrorStatus)); + if (data->phbRegbErrorStatus) + pr_info("RegbErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbRegbErrorStatus), + be64_to_cpu(data->phbRegbFirstErrorStatus), + be64_to_cpu(data->phbRegbErrorLog0), + 
be64_to_cpu(data->phbRegbErrorLog1)); + + + pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_PHB4_NUM_PEST_REGS); +} + void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, unsigned char *log_buff) { @@ -442,6 +544,9 @@ void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, case OPAL_PHB_ERROR_DATA_TYPE_PHB3: pnv_pci_dump_phb3_diag_data(hose, common); break; + case OPAL_PHB_ERROR_DATA_TYPE_PHB4: + pnv_pci_dump_phb4_diag_data(hose, common); + break; default: pr_warn("%s: Unrecognized ioType %d\n", __func__, be32_to_cpu(common->ioType)); From a0f98629f19943fb8387a7d5fd01e60178223c00 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Wed, 21 Jun 2017 17:18:03 +1000 Subject: [PATCH 097/158] powerpc/powernv/pci: Add helper to check if a PE has a single vendor Add a helper that determines whether all the devices contained in a given PE are from the same vendor. This can be useful in determining if it's okay to make PE-wide changes that may be suitable for some devices but not for others. This is used later in the series. Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 25 +++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 96d0156f48db..a6424127ca8d 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1718,6 +1718,31 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev */ } +static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe) +{ + unsigned short vendor = 0; + struct pci_dev *pdev; + + if (pe->device_count == 1) + return true; + + /* pe->pdev should be set if it's a single device, pe->pbus if not */ + if (!pe->pbus) + return true; + + list_for_each_entry(pdev, &pe->pbus->devices, bus_list) { + if (!vendor) { + vendor = pdev->vendor; + continue; + } + + if (pdev->vendor != vendor) + return false; + } + + return true; +} + static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); From 8e3f1b1d8255105f31556aacf8aeb6071b00d469 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Wed, 21 Jun 2017 17:18:04 +1000 Subject: [PATCH 098/158] powerpc/powernv/pci: Enable 64-bit devices to access >4GB DMA space On PHB3/POWER8 systems, devices can select between two different sections of address space, TVE#0 and TVE#1. TVE#0 is intended for 32-bit devices that aren't capable of addressing more than 4GB. Selecting TVE#1 instead, with the capability of addressing over 4GB, is performed by setting bit 59 of a PCI address. However, some devices aren't capable of addressing at least 59 bits, but still want more than 4GB of DMA space. In order to enable this, reconfigure TVE#0 to be suitable for 64-bit devices by mapping system memory at a 4GB offset within the window and leaving the first 4GB unmapped. This bypass mode is only enabled if a device requests 4GB or more of DMA address space, if the system has PHB3 (POWER8 systems), and if the device does not share a PE with any devices from different vendors.
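The crux is the TVE#0 window arithmetic; the sketch below is condensed from pnv_pci_ioda_dma_64bit_bypass() in the diff (allocation and error handling omitted):

	/* Size one window covering all memory at a 4GB offset, mapped
	 * with 256MB TCEs. The first 4GB of DMA space stays unmapped,
	 * so the 32-bit region is skipped entirely.
	 */
	u64 tce_order   = 28;			/* 256MB TCEs */
	u64 window_size = roundup_pow_of_two(memory_hotplug_max() +
					     (1ULL << 32));
	u64 tce_count   = window_size >> tce_order;
	u64 table_size  = tce_count << 3;	/* 8 bytes per TCE */
	u64 addr;

	for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order))
		tces[(addr + (1ULL << 32)) >> tce_order] =
			cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);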
Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 93 ++++++++++++++++++++++- 1 file changed, 91 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index a6424127ca8d..437613588df1 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1743,6 +1743,75 @@ static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe) return true; } +/* + * Reconfigure TVE#0 to be usable as 64-bit DMA space. + * + * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses. + * Devices can only access more than that if bit 59 of the PCI address is set + * by hardware, which indicates TVE#1 should be used instead of TVE#0. + * Many PCI devices are not capable of addressing that many bits, and as a + * result are limited to the 4GB of virtual memory made available to 32-bit + * devices in TVE#0. + * + * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit + * devices by configuring the virtual memory past the first 4GB inaccessible + * by 64-bit DMAs. This should only be used by devices that want more than + * 4GB, and only on PEs that have no 32-bit devices. + * + * Currently this will only work on PHB3 (POWER8). + */ +static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe) +{ + u64 window_size, table_size, tce_count, addr; + struct page *table_pages; + u64 tce_order = 28; /* 256MB TCEs */ + __be64 *tces; + s64 rc; + + /* + * Window size needs to be a power of two, but needs to account for + * shifting memory by the 4GB offset required to skip 32bit space. + */ + window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32)); + tce_count = window_size >> tce_order; + table_size = tce_count << 3; + + if (table_size < PAGE_SIZE) + table_size = PAGE_SIZE; + + table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL, + get_order(table_size)); + if (!table_pages) + goto err; + + tces = page_address(table_pages); + if (!tces) + goto err; + + memset(tces, 0, table_size); + + for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) { + tces[(addr + (1ULL << 32)) >> tce_order] = + cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE); + } + + rc = opal_pci_map_pe_dma_window(pe->phb->opal_id, + pe->pe_number, + /* reconfigure window 0 */ + (pe->pe_number << 1) + 0, + 1, + __pa(tces), + table_size, + 1 << tce_order); + if (rc == OPAL_SUCCESS) { + pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n"); + return 0; + } +err: + pe_err(pe, "Error configuring 64-bit DMA bypass\n"); + return -EIO; +} + static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); @@ -1751,6 +1820,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) struct pnv_ioda_pe *pe; uint64_t top; bool bypass = false; + s64 rc; if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) return -ENODEV;; @@ -1765,8 +1835,27 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n"); set_dma_ops(&pdev->dev, &dma_direct_ops); } else { - dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); - set_dma_ops(&pdev->dev, &dma_iommu_ops); + /* + * If the device can't set the TCE bypass bit but still wants + * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to + * bypass the 32-bit region and be usable for 64-bit DMAs. 
+ * The device needs to be able to address all of this space. + */ + if (dma_mask >> 32 && + dma_mask > (memory_hotplug_max() + (1ULL << 32)) && + pnv_pci_ioda_pe_single_vendor(pe) && + phb->model == PNV_PHB_MODEL_PHB3) { + /* Configure the bypass mode */ + rc = pnv_pci_ioda_dma_64bit_bypass(pe); + if (rc) + return rc; + /* 4GB offset bypasses 32-bit space */ + set_dma_offset(&pdev->dev, (1ULL << 32)); + set_dma_ops(&pdev->dev, &dma_direct_ops); + } else { + dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); + set_dma_ops(&pdev->dev, &dma_iommu_ops); + } } *pdev->dev.dma_mask = dma_mask; From 74e27c6af56fe6898c3c8c451595746a992f0f0f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sun, 25 Jun 2017 15:08:46 -0500 Subject: [PATCH 099/158] powerpc: Only do ERAT invalidate on radix context switch on P9 DD1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From: Michael Neuling On P9 (Nimbus) DD2 and later, in radix mode, the move to the PID register will implicitly invalidate the user space ERAT entries and leave the kernel ones alone. Thus the only thing needed is an isync() to synchronize this with subsequent uaccess's Signed-off-by: Michael Neuling Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/mm/mmu_context_book3s64.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index c6dca2ae78ef..3ef0a89503bf 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -235,10 +235,15 @@ void destroy_context(struct mm_struct *mm) #ifdef CONFIG_PPC_RADIX_MMU void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) { - asm volatile("isync": : :"memory"); - mtspr(SPRN_PID, next->context.id); - asm volatile("isync \n" - PPC_SLBIA(0x7) - : : :"memory"); + + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + isync(); + mtspr(SPRN_PID, next->context.id); + isync(); + asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); + } else { + mtspr(SPRN_PID, next->context.id); + isync(); + } } #endif From ba6d334ac230065243a92bb7cb3fd6a5f6a7f8ac Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sat, 24 Jun 2017 12:29:01 -0500 Subject: [PATCH 100/158] powerpc/64s: Invalidate ERAT on powersave wakeup for POWER9 On POWER9 the ERAT may be incorrect on wakeup from some stop states that lose state. This causes random segvs and illegal instructions when these stop states are enabled. This patch invalidates the ERAT on wakeup on POWER9 to prevent this from causing a problem. Signed-off-by: Michael Neuling Signed-off-by: Benjamin Herrenschmidt Reviewed-by: Nicholas Piggin [mpe: Merge comment change with upstream changes] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 8 +++++--- arch/powerpc/kernel/idle_book3s.S | 7 +++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 07b79c2c70f8..02a82777fd5b 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -99,9 +99,11 @@ EXC_VIRT_NONE(0x4000, 0x100) #ifdef CONFIG_PPC_P7_NAP /* * If running native on arch 2.06 or later, check if we are waking up - * from nap/sleep/winkle, and branch to idle handler. The idle wakeup - * handler initially runs in real mode, but we branch to the 0xc000... - * address so we can turn on relocation with mtmsr. 
+ * from nap/sleep/winkle, and branch to idle handler. This tests SRR1 + * bits 46:47. A non-0 value indicates that we are coming from a power + * saving state. The idle wakeup handler initially runs in real mode, + * but we branch to the 0xc000... address so we can turn on relocation + * with mtmsr. */ #define IDLETEST(n) \ BEGIN_FTR_SECTION ; \ diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 1ea14b96f126..f6518c768d2a 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -435,6 +435,13 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) * cr3 - set to gt if waking up with partial/complete hypervisor state loss */ pnv_restore_hyp_resource_arch300: + /* + * Workaround for POWER9, if we lost resources, the ERAT + * might have been mixed up and needs flushing. + */ + blt cr3,1f + PPC_INVALIDATE_ERAT +1: /* * POWER ISA 3. Use PSSCR to determine if we * are waking up from deep idle state From 89d8bb163868178d5b35d1bc1237e67306940cce Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sat, 24 Jun 2017 14:57:27 -0500 Subject: [PATCH 101/158] powerpc/xive: Silence message about VP block allocation There is no reason for that message to be pr_info(), it will be printed every time we start a KVM guest. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/sysdev/xive/native.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index ab9ecce61ee5..0f95476b01f6 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -633,8 +633,8 @@ u32 xive_native_alloc_vp_block(u32 max_vcpus) if (max_vcpus > (1 << order)) order++; - pr_info("VP block alloc, for max VCPUs %d use order %d\n", - max_vcpus, order); + pr_debug("VP block alloc, for max VCPUs %d use order %d\n", + max_vcpus, order); for (;;) { rc = opal_xive_alloc_vp_block(order); From 24bedcb7c811375a962a621613ad152d95bc28ba Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Sun, 25 Jun 2017 21:04:46 +0530 Subject: [PATCH 102/158] powerpc/perf: Fix branch event code for power9 Correct "branch" event code of Power9 is "r4d05e". Replace the current "branch" event code with "r4d05e" and add a hack to use "r10012" as event code for Power9 DD1. 
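For reference, a hedged sketch of how a raw event code like this is consumed from userspace via the perf_event_open() syscall (this program is not part of the patch; the event code is the one introduced below):

	/* Illustrative only: count PM_BRU_CMPL (raw code 0x4d05e) on Power9. */
	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int open_branch_counter(pid_t pid)
	{
		struct perf_event_attr attr = {
			.type	= PERF_TYPE_RAW,
			.size	= sizeof(attr),
			.config	= 0x4d05e,	/* PM_BRU_CMPL on DD2 and later */
		};
		return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
	}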
Fixes: d89f473ff6f8 ("powerpc/perf: Fix PM_BRU_CMPL event code for power9") Reported-by: Anton Blanchard Signed-off-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/power9-events-list.h | 4 +++- arch/powerpc/perf/power9-pmu.c | 8 +++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/power9-events-list.h b/arch/powerpc/perf/power9-events-list.h index 71a6bfee5c02..80204e064362 100644 --- a/arch/powerpc/perf/power9-events-list.h +++ b/arch/powerpc/perf/power9-events-list.h @@ -16,7 +16,7 @@ EVENT(PM_CYC, 0x0001e) EVENT(PM_ICT_NOSLOT_CYC, 0x100f8) EVENT(PM_CMPLU_STALL, 0x1e054) EVENT(PM_INST_CMPL, 0x00002) -EVENT(PM_BRU_CMPL, 0x10012) +EVENT(PM_BRU_CMPL, 0x4d05e) EVENT(PM_BR_MPRED_CMPL, 0x400f6) /* All L1 D cache load references counted at finish, gated by reject */ @@ -56,3 +56,5 @@ EVENT(PM_RUN_CYC, 0x600f4) /* Instruction Dispatched */ EVENT(PM_INST_DISP, 0x200f2) EVENT(PM_INST_DISP_ALT, 0x300f2) +/* Alternate Branch event code */ +EVENT(PM_BR_CMPL_ALT, 0x10012) diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index 018f8e90ac35..ab4765547f20 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -231,7 +231,7 @@ static int power9_generic_events_dd1[] = { [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = PM_ICT_NOSLOT_CYC, [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = PM_CMPLU_STALL, [PERF_COUNT_HW_INSTRUCTIONS] = PM_INST_DISP, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = PM_BRU_CMPL, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = PM_BR_CMPL_ALT, [PERF_COUNT_HW_BRANCH_MISSES] = PM_BR_MPRED_CMPL, [PERF_COUNT_HW_CACHE_REFERENCES] = PM_LD_REF_L1, [PERF_COUNT_HW_CACHE_MISSES] = PM_LD_MISS_L1_FIN, @@ -453,6 +453,12 @@ static int __init init_power9_pmu(void) * sampling scenarios in power9 DD1, instead use PM_INST_DISP. */ EVENT_VAR(PM_INST_CMPL, _g).id = PM_INST_DISP; + /* + * Power9 DD1 should use PM_BR_CMPL_ALT event code for + * "branches" to provide correct counter value. + */ + EVENT_VAR(PM_BRU_CMPL, _g).id = PM_BR_CMPL_ALT; + EVENT_VAR(PM_BRU_CMPL, _c).id = PM_BR_CMPL_ALT; rc = register_power_pmu(&power9_isa207_pmu); } else { rc = register_power_pmu(&power9_pmu); From a77af552ccc9d4d54459a39f9e5f7ad307aeb4f9 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Thu, 1 Jun 2017 22:50:38 +0530 Subject: [PATCH 103/158] powerpc/fadump: avoid duplicates in crash memory ranges fadump sets up crash memory ranges to be used for creating PT_LOAD program headers in elfcore header. Memory chunk RMA_START through boot memory area size is added as the first memory range because firmware, at the time of crash, moves this memory chunk to different location specified during fadump registration making it necessary to create a separate program header for it with the correct offset. This memory chunk is skipped while setting up the remaining memory ranges. But currently, there is possibility that some of this memory may have duplicate entries like when it is hot-removed and added again. Ensure that no two memory ranges represent the same memory. 
When 5 lmbs are hot-removed and then hot-plugged before registering fadump, here is how the program headers in /proc/vmcore exported by fadump look without this change:

  Program Headers:
    Type   Offset             VirtAddr           PhysAddr
           FileSiz            MemSiz              Flags  Align
    NOTE   0x0000000000010000 0x0000000000000000 0x0000000000000000
           0x0000000000001894 0x0000000000001894         0
    LOAD   0x0000000000021020 0xc000000000000000 0x0000000000000000
           0x0000000040000000 0x0000000040000000  RWE    0
    LOAD   0x0000000040031020 0xc000000000000000 0x0000000000000000
           0x0000000010000000 0x0000000010000000  RWE    0
    LOAD   0x0000000050040000 0xc000000010000000 0x0000000010000000
           0x0000000050000000 0x0000000050000000  RWE    0
    LOAD   0x00000000a0040000 0xc000000060000000 0x0000000060000000
           0x000000019ffe0000 0x000000019ffe0000  RWE    0

and with this change:

  Program Headers:
    Type   Offset             VirtAddr           PhysAddr
           FileSiz            MemSiz              Flags  Align
    NOTE   0x0000000000010000 0x0000000000000000 0x0000000000000000
           0x0000000000001894 0x0000000000001894         0
    LOAD   0x0000000000021020 0xc000000000000000 0x0000000000000000
           0x0000000040000000 0x0000000040000000  RWE    0
    LOAD   0x0000000040030000 0xc000000040000000 0x0000000040000000
           0x0000000020000000 0x0000000020000000  RWE    0
    LOAD   0x0000000060030000 0xc000000060000000 0x0000000060000000
           0x000000019ffe0000 0x000000019ffe0000  RWE    0

Signed-off-by: Hari Bathini Reviewed-by: Mahesh J Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/fadump.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 12837d52e84a..08e00448a355 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -878,8 +878,19 @@ static void fadump_setup_crash_memory_ranges(void) for_each_memblock(memory, reg) { start = (unsigned long long)reg->base; end = start + (unsigned long long)reg->size; - if (start == RMA_START && end >= fw_dump.boot_memory_size) - start = fw_dump.boot_memory_size; + + /* + * skip the first memory chunk that is already added (RMA_START + * through boot_memory_size). This logic needs a relook if and + * when RMA_START changes to a non-zero value. + */ + BUILD_BUG_ON(RMA_START != 0); + if (start < fw_dump.boot_memory_size) { + if (end > fw_dump.boot_memory_size) + start = fw_dump.boot_memory_size; + else + continue; + } /* add this range excluding the reserved dump area. */ fadump_exclude_reserved_area(start, end); From eae0dfcc44320c79a05637534d59af4643b2ee7b Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Thu, 1 Jun 2017 22:51:26 +0530 Subject: [PATCH 104/158] powerpc/fadump: avoid holes in boot memory area when fadump is registered To register fadump, the boot memory area - the size of the low memory chunk required for a kernel to boot successfully when booted with restricted memory - is assumed to have no holes. But this memory area is currently not protected from hot-remove operations. So, fadump could fail to re-register after a memory hot-remove operation if memory is removed from the boot memory area. To avoid this, ensure that memory from the boot memory area is not hot-removed when fadump is registered. 
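The check this introduces boils down to an interval-overlap test; a minimal sketch follows (the real helper, is_fadump_boot_memory_area(), is in the diff below and additionally consults the fadump registration state):

	/* Sketch only: does [addr, addr + size) intersect the fadump
	 * boot memory area [RMA_START, boot_memory_size)? */
	static bool overlaps_boot_area(u64 addr, u64 size, u64 boot_size)
	{
		return (addr + size) > RMA_START && addr <= boot_size;
	}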
Signed-off-by: Hari Bathini Reviewed-by: Mahesh J Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/fadump.h | 1 + arch/powerpc/kernel/fadump.c | 12 ++++++++++++ arch/powerpc/platforms/pseries/hotplug-memory.c | 7 +++++++ 3 files changed, 20 insertions(+) diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index a3de219073af..ce88bbe1d809 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -203,6 +203,7 @@ struct fad_crash_memory_ranges { unsigned long long size; }; +extern int is_fadump_boot_memory_area(u64 addr, ulong size); extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data); extern int fadump_reserve_mem(void); diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 08e00448a355..750bff3b4af3 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -113,6 +113,18 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, return 1; } +/* + * If fadump is registered, check if the memory provided + * falls within boot memory area. + */ +int is_fadump_boot_memory_area(u64 addr, ulong size) +{ + if (!fw_dump.dump_registered) + return 0; + + return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size; +} + int is_fadump_active(void) { return fw_dump.dump_active; diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index e104c71ea44a..a186b8e8019a 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "pseries.h" static bool rtas_hp_event; @@ -406,6 +407,12 @@ static bool lmb_is_removable(struct of_drconf_cell *lmb) scns_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; phys_addr = lmb->base_addr; +#ifdef CONFIG_FA_DUMP + /* Don't hot-remove memory that falls in fadump boot memory area */ + if (is_fadump_boot_memory_area(phys_addr, block_sz)) + return false; +#endif + for (i = 0; i < scns_per_block; i++) { pfn = PFN_DOWN(phys_addr); if (!pfn_present(pfn)) From a5a05b91c7f36c180c32e27fa41890957c31bad1 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Thu, 1 Jun 2017 22:52:10 +0530 Subject: [PATCH 105/158] powerpc/fadump: provide a helpful error message fadump fails to register when there are holes in boot memory area. Provide a helpful error message to the user in such case. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/fadump.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 750bff3b4af3..a3568136ab28 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -130,6 +130,38 @@ int is_fadump_active(void) return fw_dump.dump_active; } +/* + * Returns 1, if there are no holes in boot memory area, + * 0 otherwise. 
+ */ +static int is_boot_memory_area_contiguous(void) +{ + struct memblock_region *reg; + unsigned long tstart, tend; + unsigned long start_pfn = PHYS_PFN(RMA_START); + unsigned long end_pfn = PHYS_PFN(RMA_START + fw_dump.boot_memory_size); + unsigned int ret = 0; + + for_each_memblock(memory, reg) { + tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); + tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); + if (tstart < tend) { + /* Memory hole from start_pfn to tstart */ + if (tstart > start_pfn) + break; + + if (tend == end_pfn) { + ret = 1; + break; + } + + start_pfn = tend + 1; + } + } + + return ret; +} + /* Print firmware assisted dump configurations for debugging purpose. */ static void fadump_show_config(void) { @@ -457,6 +489,10 @@ static int register_fw_dump(struct fadump_mem_struct *fdm) " dump. Hardware Error(%d).\n", rc); break; case -3: + if (!is_boot_memory_area_contiguous()) + pr_err("Can't have holes in boot memory area while " + "registering fadump\n"); + printk(KERN_ERR "Failed to register firmware-assisted kernel" " dump. Parameter Error(%d).\n", rc); err = -EINVAL; From 68fa6478e3b1fab7077d390070ed455aed93905c Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Fri, 2 Jun 2017 01:10:10 +0530 Subject: [PATCH 106/158] powerpc/fadump: add reschedule point while releasing memory Around 95% of memory is reserved by fadump/capture kernel. All this memory is freed, one page at a time, on writing '1' to the node /sys/kernel/fadump_release_mem. On systems with large memory, this can take a long time to complete, leading to soft lockup warning messages. To avoid this, add reschedule points at regular intervals. Also, while memblock_reserve() implicitly takes care of holes in the given memory range while reserving memory, those holes need to be taken care of while releasing memory as memory is freed one page at a time. Add support to skip holes while releasing memory. Suggested-by: Michael Ellerman Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/fadump.c | 65 ++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index a3568136ab28..3079518f2245 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1152,28 +1152,71 @@ void fadump_cleanup(void) } } +static void fadump_free_reserved_memory(unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long pfn; + unsigned long time_limit = jiffies + HZ; + + pr_info("freeing reserved memory (0x%llx - 0x%llx)\n", + PFN_PHYS(start_pfn), PFN_PHYS(end_pfn)); + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + free_reserved_page(pfn_to_page(pfn)); + + if (time_after(jiffies, time_limit)) { + cond_resched(); + time_limit = jiffies + HZ; + } + } +} + +/* + * Skip memory holes and free memory that was actually reserved. + */ +static void fadump_release_reserved_area(unsigned long start, unsigned long end) +{ + struct memblock_region *reg; + unsigned long tstart, tend; + unsigned long start_pfn = PHYS_PFN(start); + unsigned long end_pfn = PHYS_PFN(end); + + for_each_memblock(memory, reg) { + tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); + tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); + if (tstart < tend) { + fadump_free_reserved_memory(tstart, tend); + + if (tend == end_pfn) + break; + + start_pfn = tend + 1; + } + } +} + /* * Release the memory that was reserved in early boot to preserve the memory * contents. 
The released memory will be available for general use. */ static void fadump_release_memory(unsigned long begin, unsigned long end) { - unsigned long addr; unsigned long ra_start, ra_end; ra_start = fw_dump.reserve_dump_area_start; ra_end = ra_start + fw_dump.reserve_dump_area_size; - for (addr = begin; addr < end; addr += PAGE_SIZE) { - /* - * exclude the dump reserve area. Will reuse it for next - * fadump registration. - */ - if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start)) - continue; - - free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); - } + /* + * exclude the dump reserve area. Will reuse it for next + * fadump registration. + */ + if (begin < ra_end && end > ra_start) { + if (begin < ra_start) + fadump_release_reserved_area(begin, ra_start); + if (end > ra_end) + fadump_release_reserved_area(ra_end, end); + } else + fadump_release_reserved_area(begin, end); } static void fadump_invalidate_release_mem(void) From 3fc5ee927ff4ffed6aa2fcd44d2fbf07ac893cdc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 14 Jun 2017 23:02:39 +1000 Subject: [PATCH 107/158] cpuidle: powerpc: cpuidle set polling before enabling irqs local_irq_enable can cause interrupts to be taken which could take significant amount of processing time. The idle process should set its polling flag before this, so another process that wakes it during this time will not have to send an IPI. Expand the TIF_POLLING_NRFLAG coverage to as large as possible. Reviewed-by: Gautham R. Shenoy Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- drivers/cpuidle/cpuidle-powernv.c | 4 +++- drivers/cpuidle/cpuidle-pseries.c | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 79152676f62b..50b3c2e0306f 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -51,9 +51,10 @@ static int snooze_loop(struct cpuidle_device *dev, { u64 snooze_exit_time; - local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); + local_irq_enable(); + snooze_exit_time = get_tb() + snooze_timeout; ppc64_runlatch_off(); HMT_very_low(); @@ -66,6 +67,7 @@ static int snooze_loop(struct cpuidle_device *dev, ppc64_runlatch_on(); clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb(); + return index; } diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index 166ccd711ec9..7b12bb2ea70f 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -62,9 +62,10 @@ static int snooze_loop(struct cpuidle_device *dev, unsigned long in_purr; u64 snooze_exit_time; + set_thread_flag(TIF_POLLING_NRFLAG); + idle_loop_prolog(&in_purr); local_irq_enable(); - set_thread_flag(TIF_POLLING_NRFLAG); snooze_exit_time = get_tb() + snooze_timeout; while (!need_resched()) { From 624e46d03576dd4d5667bad9d2ef814135d0075c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 14 Jun 2017 23:02:40 +1000 Subject: [PATCH 108/158] cpuidle: powerpc: read mostly for common globals Ensure these don't get put into bouncing cachelines. Reviewed-by: Vaidyanathan Srinivasan Reviewed-by: Gautham R. 
Shenoy Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- drivers/cpuidle/cpuidle-powernv.c | 10 +++++----- drivers/cpuidle/cpuidle-pseries.c | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 50b3c2e0306f..9d03326ac05e 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -32,18 +32,18 @@ static struct cpuidle_driver powernv_idle_driver = { .owner = THIS_MODULE, }; -static int max_idle_state; -static struct cpuidle_state *cpuidle_state_table; +static int max_idle_state __read_mostly; +static struct cpuidle_state *cpuidle_state_table __read_mostly; struct stop_psscr_table { u64 val; u64 mask; }; -static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX]; +static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly; -static u64 snooze_timeout; -static bool snooze_timeout_en; +static u64 snooze_timeout __read_mostly; +static bool snooze_timeout_en __read_mostly; static int snooze_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index 7b12bb2ea70f..a404f352d284 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -25,10 +25,10 @@ struct cpuidle_driver pseries_idle_driver = { .owner = THIS_MODULE, }; -static int max_idle_state; -static struct cpuidle_state *cpuidle_state_table; -static u64 snooze_timeout; -static bool snooze_timeout_en; +static int max_idle_state __read_mostly; +static struct cpuidle_state *cpuidle_state_table __read_mostly; +static u64 snooze_timeout __read_mostly; +static bool snooze_timeout_en __read_mostly; static inline void idle_loop_prolog(unsigned long *in_purr) { From 7ded429152e84831f6696585755f318fb351e67f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 14 Jun 2017 23:02:41 +1000 Subject: [PATCH 109/158] cpuidle: powerpc: no memory barrier after break from idle A memory barrier is not required after the task wakes up, only if we clear the polling flag before waking. The case where we have work to do is the important one, so optimise for it. Reviewed-by: Vaidyanathan Srinivasan Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- drivers/cpuidle/cpuidle-powernv.c | 11 +++++++++-- drivers/cpuidle/cpuidle-pseries.c | 11 +++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 9d03326ac05e..37b0698b7193 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -59,14 +59,21 @@ static int snooze_loop(struct cpuidle_device *dev, ppc64_runlatch_off(); HMT_very_low(); while (!need_resched()) { - if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) + if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) { + /* + * Task has not woken up but we are exiting the polling + * loop anyway. Require a barrier after polling is + * cleared to order subsequent test of need_resched(). 
+ */ + clear_thread_flag(TIF_POLLING_NRFLAG); + smp_mb(); break; + } } HMT_medium(); ppc64_runlatch_on(); clear_thread_flag(TIF_POLLING_NRFLAG); - smp_mb(); return index; } diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index a404f352d284..e9b3853d93ea 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -71,13 +71,20 @@ static int snooze_loop(struct cpuidle_device *dev, while (!need_resched()) { HMT_low(); HMT_very_low(); - if (snooze_timeout_en && get_tb() > snooze_exit_time) + if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) { + /* + * Task has not woken up but we are exiting the polling + * loop anyway. Require a barrier after polling is + * cleared to order subsequent test of need_resched(). + */ + clear_thread_flag(TIF_POLLING_NRFLAG); + smp_mb(); break; + } } HMT_medium(); clear_thread_flag(TIF_POLLING_NRFLAG); - smp_mb(); idle_loop_epilog(in_purr); From fd393188064e3d01a91a21160751f60ff5ce5def Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Thu, 15 Jun 2017 20:54:14 +0200 Subject: [PATCH 110/158] powerpc/5200: Add generic compatible string for I2C EEPROM The at24 driver allows to register I2C EEPROM chips using different vendor and devices, but the I2C subsystem does not take the vendor into account when matching using the I2C table since it only has device entries. But when matching using an OF table, both the vendor and device has to be taken into account so the driver defines only a set of compatible strings using the "atmel" vendor as a generic fallback for compatible I2C devices. So add this generic fallback to the device node compatible string to make the device to match the driver using the OF device ID table. Signed-off-by: Javier Martinez Canillas Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/digsy_mtc.dts | 2 +- arch/powerpc/boot/dts/pcm030.dts | 2 +- arch/powerpc/boot/dts/pcm032.dts | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/boot/dts/digsy_mtc.dts b/arch/powerpc/boot/dts/digsy_mtc.dts index 955bff629df3..c280e75c86bf 100644 --- a/arch/powerpc/boot/dts/digsy_mtc.dts +++ b/arch/powerpc/boot/dts/digsy_mtc.dts @@ -73,7 +73,7 @@ i2c@3d00 { eeprom@50 { - compatible = "at,24c08"; + compatible = "atmel,24c08"; reg = <0x50>; }; diff --git a/arch/powerpc/boot/dts/pcm030.dts b/arch/powerpc/boot/dts/pcm030.dts index 192e66af0001..836e47cc4bed 100644 --- a/arch/powerpc/boot/dts/pcm030.dts +++ b/arch/powerpc/boot/dts/pcm030.dts @@ -71,7 +71,7 @@ reg = <0x51>; }; eeprom@52 { - compatible = "catalyst,24c32"; + compatible = "catalyst,24c32", "atmel,24c32"; reg = <0x52>; pagesize = <32>; }; diff --git a/arch/powerpc/boot/dts/pcm032.dts b/arch/powerpc/boot/dts/pcm032.dts index 96b139bf50e9..576249bf2fb9 100644 --- a/arch/powerpc/boot/dts/pcm032.dts +++ b/arch/powerpc/boot/dts/pcm032.dts @@ -75,7 +75,7 @@ reg = <0x51>; }; eeprom@52 { - compatible = "catalyst,24c32"; + compatible = "catalyst,24c32", "atmel,24c32"; reg = <0x52>; pagesize = <32>; }; From 226b9391d1a4e91fe08627f75ebad1c60900dfe2 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Thu, 15 Jun 2017 20:54:15 +0200 Subject: [PATCH 111/158] powerpc/fsl: Add generic compatible string for I2C EEPROM The at24 driver allows to register I2C EEPROM chips using different vendor and devices, but the I2C subsystem does not take the vendor into account when matching using the I2C table since it only has device entries. 
But when matching using an OF table, both the vendor and device has to be taken into account so the driver defines only a set of compatible strings using the "atmel" vendor as a generic fallback for compatible I2C devices. So add this generic fallback to the device node compatible string to make the device to match the driver using the OF device ID table. Signed-off-by: Javier Martinez Canillas Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/fsl/b4qds.dtsi | 8 ++++---- arch/powerpc/boot/dts/fsl/c293pcie.dts | 2 +- arch/powerpc/boot/dts/fsl/p1010rdb.dtsi | 2 +- arch/powerpc/boot/dts/fsl/p1023rdb.dts | 2 +- arch/powerpc/boot/dts/fsl/p2041rdb.dts | 4 ++-- arch/powerpc/boot/dts/fsl/p3041ds.dts | 4 ++-- arch/powerpc/boot/dts/fsl/p4080ds.dts | 4 ++-- arch/powerpc/boot/dts/fsl/p5020ds.dts | 4 ++-- arch/powerpc/boot/dts/fsl/p5040ds.dts | 4 ++-- arch/powerpc/boot/dts/fsl/t208xqds.dtsi | 8 ++++---- arch/powerpc/boot/dts/fsl/t4240qds.dts | 12 ++++++------ arch/powerpc/boot/dts/fsl/t4240rdb.dts | 6 +++--- 12 files changed, 30 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/boot/dts/fsl/b4qds.dtsi b/arch/powerpc/boot/dts/fsl/b4qds.dtsi index 3785ef826d07..999efd3bc167 100644 --- a/arch/powerpc/boot/dts/fsl/b4qds.dtsi +++ b/arch/powerpc/boot/dts/fsl/b4qds.dtsi @@ -166,19 +166,19 @@ reg = <0>; eeprom@50 { - compatible = "at24,24c64"; + compatible = "atmel,24c64"; reg = <0x50>; }; eeprom@51 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x51>; }; eeprom@53 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x53>; }; eeprom@57 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x57>; }; rtc@68 { diff --git a/arch/powerpc/boot/dts/fsl/c293pcie.dts b/arch/powerpc/boot/dts/fsl/c293pcie.dts index 66709788429d..5e905e0857cf 100644 --- a/arch/powerpc/boot/dts/fsl/c293pcie.dts +++ b/arch/powerpc/boot/dts/fsl/c293pcie.dts @@ -153,7 +153,7 @@ &soc { i2c@3000 { eeprom@50 { - compatible = "st,24c1024"; + compatible = "st,24c1024", "atmel,24c1024"; reg = <0x50>; }; diff --git a/arch/powerpc/boot/dts/fsl/p1010rdb.dtsi b/arch/powerpc/boot/dts/fsl/p1010rdb.dtsi index a8e4ba070104..2ca9cee2ddeb 100644 --- a/arch/powerpc/boot/dts/fsl/p1010rdb.dtsi +++ b/arch/powerpc/boot/dts/fsl/p1010rdb.dtsi @@ -89,7 +89,7 @@ &board_soc { i2c@3000 { eeprom@50 { - compatible = "st,24c256"; + compatible = "st,24c256", "atmel,24c256"; reg = <0x50>; }; diff --git a/arch/powerpc/boot/dts/fsl/p1023rdb.dts b/arch/powerpc/boot/dts/fsl/p1023rdb.dts index 9716ca64651c..ead928364beb 100644 --- a/arch/powerpc/boot/dts/fsl/p1023rdb.dts +++ b/arch/powerpc/boot/dts/fsl/p1023rdb.dts @@ -79,7 +79,7 @@ i2c@3000 { eeprom@53 { - compatible = "at24,24c04"; + compatible = "atmel,24c04"; reg = <0x53>; }; diff --git a/arch/powerpc/boot/dts/fsl/p2041rdb.dts b/arch/powerpc/boot/dts/fsl/p2041rdb.dts index e50fea95a853..950816b9d6e1 100644 --- a/arch/powerpc/boot/dts/fsl/p2041rdb.dts +++ b/arch/powerpc/boot/dts/fsl/p2041rdb.dts @@ -127,7 +127,7 @@ reg = <0x48>; }; eeprom@50 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x50>; }; rtc@68 { @@ -142,7 +142,7 @@ i2c@118100 { eeprom@50 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x50>; }; }; diff --git a/arch/powerpc/boot/dts/fsl/p3041ds.dts b/arch/powerpc/boot/dts/fsl/p3041ds.dts index 40748e415adb..6f5f7283c533 100644 --- a/arch/powerpc/boot/dts/fsl/p3041ds.dts +++ b/arch/powerpc/boot/dts/fsl/p3041ds.dts @@ -124,11 +124,11 @@ i2c@118100 { eeprom@51 { - compatible = "at24,24c256"; + compatible 
= "atmel,24c256"; reg = <0x51>; }; eeprom@52 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x52>; }; }; diff --git a/arch/powerpc/boot/dts/fsl/p4080ds.dts b/arch/powerpc/boot/dts/fsl/p4080ds.dts index 816b9788d5f6..65e20152e22f 100644 --- a/arch/powerpc/boot/dts/fsl/p4080ds.dts +++ b/arch/powerpc/boot/dts/fsl/p4080ds.dts @@ -125,11 +125,11 @@ i2c@118100 { eeprom@51 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x51>; }; eeprom@52 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x52>; }; rtc@68 { diff --git a/arch/powerpc/boot/dts/fsl/p5020ds.dts b/arch/powerpc/boot/dts/fsl/p5020ds.dts index cd6f37386111..b24adf902d8d 100644 --- a/arch/powerpc/boot/dts/fsl/p5020ds.dts +++ b/arch/powerpc/boot/dts/fsl/p5020ds.dts @@ -124,11 +124,11 @@ i2c@118100 { eeprom@51 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x51>; }; eeprom@52 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x52>; }; }; diff --git a/arch/powerpc/boot/dts/fsl/p5040ds.dts b/arch/powerpc/boot/dts/fsl/p5040ds.dts index 45084738cf4e..30850b3228e0 100644 --- a/arch/powerpc/boot/dts/fsl/p5040ds.dts +++ b/arch/powerpc/boot/dts/fsl/p5040ds.dts @@ -133,11 +133,11 @@ i2c@118100 { eeprom@51 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x51>; }; eeprom@52 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x52>; }; }; diff --git a/arch/powerpc/boot/dts/fsl/t208xqds.dtsi b/arch/powerpc/boot/dts/fsl/t208xqds.dtsi index ec080bd01b09..db4139999b28 100644 --- a/arch/powerpc/boot/dts/fsl/t208xqds.dtsi +++ b/arch/powerpc/boot/dts/fsl/t208xqds.dtsi @@ -147,17 +147,17 @@ reg = <0x0>; eeprom@50 { - compatible = "at24,24c512"; + compatible = "atmel,24c512"; reg = <0x50>; }; eeprom@51 { - compatible = "at24,24c02"; + compatible = "atmel,24c02"; reg = <0x51>; }; eeprom@57 { - compatible = "at24,24c02"; + compatible = "atmel,24c02"; reg = <0x57>; }; @@ -174,7 +174,7 @@ reg = <0x1>; eeprom@55 { - compatible = "at24,24c02"; + compatible = "atmel,24c02"; reg = <0x55>; }; }; diff --git a/arch/powerpc/boot/dts/fsl/t4240qds.dts b/arch/powerpc/boot/dts/fsl/t4240qds.dts index 9573ceada07c..c0913ac5aaad 100644 --- a/arch/powerpc/boot/dts/fsl/t4240qds.dts +++ b/arch/powerpc/boot/dts/fsl/t4240qds.dts @@ -377,27 +377,27 @@ reg = <0>; eeprom@51 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x51>; }; eeprom@52 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x52>; }; eeprom@53 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x53>; }; eeprom@54 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x54>; }; eeprom@55 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x55>; }; eeprom@56 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x56>; }; rtc@68 { diff --git a/arch/powerpc/boot/dts/fsl/t4240rdb.dts b/arch/powerpc/boot/dts/fsl/t4240rdb.dts index 8166c660712a..15eb0a3f7290 100644 --- a/arch/powerpc/boot/dts/fsl/t4240rdb.dts +++ b/arch/powerpc/boot/dts/fsl/t4240rdb.dts @@ -130,15 +130,15 @@ reg = <0x2f>; }; eeprom@52 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x52>; }; eeprom@54 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x54>; }; eeprom@56 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x56>; }; rtc@68 { From 9b4091682700fb5909aed439fc0101992164bcd0 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: 
Thu, 15 Jun 2017 20:54:16 +0200 Subject: [PATCH 112/158] powerpc/512x: Add generic compatible string for I2C EEPROM The at24 driver allows to register I2C EEPROM chips using different vendor and devices, but the I2C subsystem does not take the vendor into account when matching using the I2C table since it only has device entries. But when matching using an OF table, both the vendor and device has to be taken into account so the driver defines only a set of compatible strings using the "atmel" vendor as a generic fallback for compatible I2C devices. So add this generic fallback to the device node compatible string to make the device to match the driver using the OF device ID table. Signed-off-by: Javier Martinez Canillas Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/mpc5121ads.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/boot/dts/mpc5121ads.dts b/arch/powerpc/boot/dts/mpc5121ads.dts index 75888ce2c792..fcaa9bad4bda 100644 --- a/arch/powerpc/boot/dts/mpc5121ads.dts +++ b/arch/powerpc/boot/dts/mpc5121ads.dts @@ -94,7 +94,7 @@ }; eeprom@50 { - compatible = "at,24c32"; + compatible = "atmel,24c32"; reg = <0x50>; }; From 8d0590cefdc22bc5d6f4d30ea19c95acda2eefed Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Thu, 15 Jun 2017 20:54:17 +0200 Subject: [PATCH 113/158] powerpc/83xx: Add generic compatible string for I2C EEPROM The at24 driver allows to register I2C EEPROM chips using different vendor and devices, but the I2C subsystem does not take the vendor into account when matching using the I2C table since it only has device entries. But when matching using an OF table, both the vendor and device has to be taken into account so the driver defines only a set of compatible strings using the "atmel" vendor as a generic fallback for compatible I2C devices. So add this generic fallback to the device node compatible string to make the device to match the driver using the OF device ID table. 
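A hedged sketch of the matching this relies on (not the actual at24 match table, just its shape): a driver's OF table can carry a single generic entry, and any node whose compatible list includes that string as a fallback will match it.

	/* Illustrative sketch only. */
	#include <linux/mod_devicetable.h>

	static const struct of_device_id eeprom_of_match_sketch[] = {
		{ .compatible = "atmel,24c32" },
		{ /* sentinel */ }
	};

A node declaring compatible = "st,24c32", "atmel,24c32"; then matches via the generic fallback entry even though a vendor-specific string is listed first.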
Signed-off-by: Javier Martinez Canillas Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/mpc8308_p1m.dts | 2 +- arch/powerpc/boot/dts/mpc8349emitx.dts | 4 ++-- arch/powerpc/boot/dts/mpc8377_rdb.dts | 2 +- arch/powerpc/boot/dts/mpc8377_wlan.dts | 2 +- arch/powerpc/boot/dts/mpc8378_rdb.dts | 2 +- arch/powerpc/boot/dts/mpc8379_rdb.dts | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/boot/dts/mpc8308_p1m.dts b/arch/powerpc/boot/dts/mpc8308_p1m.dts index 57f86cdf9f36..cab933b3957a 100644 --- a/arch/powerpc/boot/dts/mpc8308_p1m.dts +++ b/arch/powerpc/boot/dts/mpc8308_p1m.dts @@ -123,7 +123,7 @@ interrupt-parent = <&ipic>; dfsrr; fram@50 { - compatible = "ramtron,24c64"; + compatible = "ramtron,24c64", "atmel,24c64"; reg = <0x50>; }; }; diff --git a/arch/powerpc/boot/dts/mpc8349emitx.dts b/arch/powerpc/boot/dts/mpc8349emitx.dts index 90aed3ac2f69..648a85858eb5 100644 --- a/arch/powerpc/boot/dts/mpc8349emitx.dts +++ b/arch/powerpc/boot/dts/mpc8349emitx.dts @@ -92,7 +92,7 @@ dfsrr; eeprom: at24@50 { - compatible = "st,24c256"; + compatible = "st,24c256", "atmel,24c256"; reg = <0x50>; }; @@ -130,7 +130,7 @@ }; spd: at24@51 { - compatible = "at24,spd"; + compatible = "atmel,spd"; reg = <0x51>; }; diff --git a/arch/powerpc/boot/dts/mpc8377_rdb.dts b/arch/powerpc/boot/dts/mpc8377_rdb.dts index e32613963ab0..5e85d8c93bca 100644 --- a/arch/powerpc/boot/dts/mpc8377_rdb.dts +++ b/arch/powerpc/boot/dts/mpc8377_rdb.dts @@ -150,7 +150,7 @@ }; at24@50 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x50>; }; diff --git a/arch/powerpc/boot/dts/mpc8377_wlan.dts b/arch/powerpc/boot/dts/mpc8377_wlan.dts index c0c790168b96..fee15fcbb46f 100644 --- a/arch/powerpc/boot/dts/mpc8377_wlan.dts +++ b/arch/powerpc/boot/dts/mpc8377_wlan.dts @@ -135,7 +135,7 @@ dfsrr; at24@50 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x50>; }; diff --git a/arch/powerpc/boot/dts/mpc8378_rdb.dts b/arch/powerpc/boot/dts/mpc8378_rdb.dts index 71842fcd621f..e973d61956b9 100644 --- a/arch/powerpc/boot/dts/mpc8378_rdb.dts +++ b/arch/powerpc/boot/dts/mpc8378_rdb.dts @@ -150,7 +150,7 @@ }; at24@50 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x50>; }; diff --git a/arch/powerpc/boot/dts/mpc8379_rdb.dts b/arch/powerpc/boot/dts/mpc8379_rdb.dts index e442a29b2fe0..ed5d12ff2ee0 100644 --- a/arch/powerpc/boot/dts/mpc8379_rdb.dts +++ b/arch/powerpc/boot/dts/mpc8379_rdb.dts @@ -148,7 +148,7 @@ }; at24@50 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x50>; }; From adeb8667eaf5f00b5a7f0cede680cda078e8424b Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Thu, 15 Jun 2017 20:54:18 +0200 Subject: [PATCH 114/158] powerpc/44x: Add generic compatible string for I2C EEPROM The at24 driver allows to register I2C EEPROM chips using different vendor and devices, but the I2C subsystem does not take the vendor into account when matching using the I2C table since it only has device entries. But when matching using an OF table, both the vendor and device has to be taken into account so the driver defines only a set of compatible strings using the "atmel" vendor as a generic fallback for compatible I2C devices. So add this generic fallback to the device node compatible string to make the device to match the driver using the OF device ID table. 
Signed-off-by: Javier Martinez Canillas Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/warp.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/boot/dts/warp.dts b/arch/powerpc/boot/dts/warp.dts index e576ee85c42f..ea9053ef4819 100644 --- a/arch/powerpc/boot/dts/warp.dts +++ b/arch/powerpc/boot/dts/warp.dts @@ -238,7 +238,7 @@ /* This will create 52 and 53 */ at24@52 { - compatible = "at,24c04"; + compatible = "atmel,24c04"; reg = <0x52>; }; }; From f8d0d5dc641cd405ad40cb2498b04df9716baee6 Mon Sep 17 00:00:00 2001 From: Santosh Sivaraj Date: Tue, 27 Jun 2017 12:30:05 +0530 Subject: [PATCH 115/158] powerpc/smp: Do not BUG_ON if invalid CPU during kick During secondary start, we do not need to BUG_ON if an invalid CPU number is passed. We already print an error if secondary cannot be started, so just return an error instead. Signed-off-by: Santosh Sivaraj Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/smp.c | 3 ++- arch/powerpc/platforms/cell/smp.c | 3 ++- arch/powerpc/platforms/powernv/smp.c | 3 ++- arch/powerpc/platforms/pseries/smp.c | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index df2a41647d8e..05bf5836107c 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -112,7 +112,8 @@ int smp_generic_cpu_bootable(unsigned int nr) #ifdef CONFIG_PPC64 int smp_generic_kick_cpu(int nr) { - BUG_ON(nr < 0 || nr >= NR_CPUS); + if (nr < 0 || nr >= NR_CPUS) + return -EINVAL; /* * The processor is currently spinning, waiting for the diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c index 895560f4be69..ee8c535cf4d3 100644 --- a/arch/powerpc/platforms/cell/smp.c +++ b/arch/powerpc/platforms/cell/smp.c @@ -115,7 +115,8 @@ static void smp_cell_setup_cpu(int cpu) static int smp_cell_kick_cpu(int nr) { - BUG_ON(nr < 0 || nr >= NR_CPUS); + if (nr < 0 || nr >= NR_CPUS) + return -EINVAL; if (!smp_startup_cpu(nr)) return -ENOENT; diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index c04c87adad94..292825f25ffd 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -63,7 +63,8 @@ static int pnv_smp_kick_cpu(int nr) long rc; uint8_t status; - BUG_ON(nr < 0 || nr >= NR_CPUS); + if (nr < 0 || nr >= NR_CPUS) + return -EINVAL; /* * If we already started or OPAL is not supported, we just diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 52ca6b311d44..c82182ac40af 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -151,7 +151,8 @@ static void smp_setup_cpu(int cpu) static int smp_pSeries_kick_cpu(int nr) { - BUG_ON(nr < 0 || nr >= NR_CPUS); + if (nr < 0 || nr >= NR_CPUS) + return -EINVAL; if (!smp_startup_cpu(nr)) return -ENOENT; From c642af9c41f09296997519499d16ff30e700816a Mon Sep 17 00:00:00 2001 From: Santosh Sivaraj Date: Tue, 27 Jun 2017 12:30:06 +0530 Subject: [PATCH 116/158] powerpc/smp: Convert NR_CPUS to nr_cpu_ids nr_cpu_ids can be limited by nr_cpus boot parameter, whereas NR_CPUS is a compile time constant, which shouldn't be compared against during cpu kick. 
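As an illustration (not part of the patch): a kernel built with NR_CPUS=2048 but booted with nr_cpus=4 has nr_cpu_ids == 4, so a kick for CPU 100 must be rejected even though 100 < NR_CPUS.

	/* Sketch of the runtime bound the kick routines now use. */
	static bool kick_target_valid(int nr)
	{
		return nr >= 0 && nr < nr_cpu_ids;	/* not NR_CPUS */
	}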
Signed-off-by: Santosh Sivaraj Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/smp.c | 2 +- arch/powerpc/platforms/cell/smp.c | 2 +- arch/powerpc/platforms/powernv/smp.c | 2 +- arch/powerpc/platforms/pseries/smp.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 05bf5836107c..418019728efa 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -112,7 +112,7 @@ int smp_generic_cpu_bootable(unsigned int nr) #ifdef CONFIG_PPC64 int smp_generic_kick_cpu(int nr) { - if (nr < 0 || nr >= NR_CPUS) + if (nr < 0 || nr >= nr_cpu_ids) return -EINVAL; /* diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c index ee8c535cf4d3..f84d52a2db40 100644 --- a/arch/powerpc/platforms/cell/smp.c +++ b/arch/powerpc/platforms/cell/smp.c @@ -115,7 +115,7 @@ static void smp_cell_setup_cpu(int cpu) static int smp_cell_kick_cpu(int nr) { - if (nr < 0 || nr >= NR_CPUS) + if (nr < 0 || nr >= nr_cpu_ids) return -EINVAL; if (!smp_startup_cpu(nr)) diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 292825f25ffd..40dae96f7e20 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -63,7 +63,7 @@ static int pnv_smp_kick_cpu(int nr) long rc; uint8_t status; - if (nr < 0 || nr >= NR_CPUS) + if (nr < 0 || nr >= nr_cpu_ids) return -EINVAL; /* diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index c82182ac40af..24785f63fb40 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -151,7 +151,7 @@ static void smp_setup_cpu(int cpu) static int smp_pSeries_kick_cpu(int nr) { - if (nr < 0 || nr >= NR_CPUS) + if (nr < 0 || nr >= nr_cpu_ids) return -EINVAL; if (!smp_startup_cpu(nr)) From b0f36c10de8d9bd271ae261604d4e4f47f663255 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 6 Apr 2017 19:44:49 +0530 Subject: [PATCH 117/158] powerpc/mm: Add comments to the vmemmap layout Add some explaination to the layout of vmemmap virtual address space and how physical page mapping is only used for valid PFNs present at any point on the system. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Anshuman Khandual Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-hash64.c | 75 ++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 8b85a14b08ea..16877f60fb87 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -21,6 +21,81 @@ #include #ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * vmemmap is the starting address of the virtual address space where + * struct pages are allocated for all possible PFNs present on the system + * including holes and bad memory (hence sparse). These virtual struct + * pages are stored in sequence in this virtual address space irrespective + * of the fact whether the corresponding PFN is valid or not. This achieves + * constant relationship between address of struct page and its PFN. + * + * During boot or memory hotplug operation when a new memory section is + * added, physical memory allocation (including hash table bolting) will + * be performed for the set of struct pages which are part of the memory + * section. This saves memory by not allocating struct pages for PFNs + * which are not valid. 
+ * + * ---------------------------------------------- + * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES| + * ---------------------------------------------- + * + * f000000000000000 c000000000000000 + * vmemmap +--------------+ +--------------+ + * + | page struct | +--------------> | page struct | + * | +--------------+ +--------------+ + * | | page struct | +--------------> | page struct | + * | +--------------+ | +--------------+ + * | | page struct | + +------> | page struct | + * | +--------------+ | +--------------+ + * | | page struct | | +--> | page struct | + * | +--------------+ | | +--------------+ + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | +-------+ | + * | +--------------+ | + * | | page struct | +-----------+ + * | +--------------+ + * | | page struct | No mapping + * | +--------------+ + * | | page struct | No mapping + * v +--------------+ + * + * ----------------------------------------- + * | RELATION BETWEEN STRUCT PAGES AND PFNS| + * ----------------------------------------- + * + * vmemmap +--------------+ +---------------+ + * + | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * v +--------------+ +---------------+ + */ /* * On hash-based CPUs, the vmemmap is bolted in the hash table. * From 39e46751839dfe4c34eb354eee1e278082fc9d07 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 6 Apr 2017 19:44:50 +0530 Subject: [PATCH 118/158] powerpc/mm: Add comments on vmemmap physical mapping Adds some explaination on how the vmemmap based struct page layout's physical mapping is allocated and tracked through linked list. It also keeps note of a possible race condition. Signed-off-by: Anshuman Khandual Signed-off-by: Michael Ellerman --- arch/powerpc/mm/init_64.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index ec84b31c6c86..f1a4964f9317 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -110,8 +110,29 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size) return 0; } +/* + * vmemmap virtual address space management does not have a traditonal page + * table to track which virtual struct pages are backed by physical mapping. + * The virtual to physical mappings are tracked in a simple linked list + * format. 'vmemmap_list' maintains the entire vmemmap physical mapping at + * all times where as the 'next' list maintains the available + * vmemmap_backing structures which have been deleted from the + * 'vmemmap_global' list during system runtime (memory hotplug remove + * operation). 
The freed 'vmemmap_backing' structures are reused later when + new requests come in without allocating fresh memory. This pointer also + tracks the allocated 'vmemmap_backing' structures as we allocate one + full page of memory at a time when we don't have any. + */ struct vmemmap_backing *vmemmap_list; static struct vmemmap_backing *next; + +/* + * The same pointer 'next' tracks individual chunks inside the allocated + * full page during boot, and again tracks the freed nodes during + * runtime. This is racy, but the two uses never overlap as they are + * separated by the boot process. It would create a problem if a memory + * hotplug operation could somehow happen during boot. + */ static int num_left; static int num_freed; From 4d0d7c02df680740da41f5f92a238c35796ca5be Mon Sep 17 00:00:00 2001 From: Akshay Adiga Date: Wed, 28 Jun 2017 06:46:49 +0530 Subject: [PATCH 119/158] powerpc/powernv/idle: Clear r12 on wakeup from stop lite pnv_wakeup_noloss() expects r12 to contain the SRR1 value to determine if the wakeup reason is an HMI in CHECK_HMI_INTERRUPT. When we wake up with ESL=0, SRR1 will not contain the wakeup reason, so there is no point setting r12 to SRR1. However, we don't set r12 at all, so r12 contains garbage (likely a kernel pointer), and is still used to check for an HMI assuming that it contained SRR1. This causes the OPAL msglog to be filled with the following print: HMI: Received HMI interrupt: HMER = 0x0040000000000000 This patch clears r12 after waking up from stop with ESL=EC=0, so that we don't accidentally enter the HMI handler in pnv_wakeup_noloss() if the value of r12[42:45] corresponds to HMI as the wakeup reason. Prior to commit 9d29250136f6 ("powerpc/64s/idle: Avoid SRR usage in idle sleep/wake paths") this bug existed, in that we would incorrectly look at SRR1 to check for an HMI when SRR1 didn't contain a wakeup reason. However the SRR1 value would just happen to never have bits 42:45 set. Fixes: 9d29250136f6 ("powerpc/64s/idle: Avoid SRR usage in idle sleep/wake paths") Signed-off-by: Akshay Adiga Reviewed-by: Nicholas Piggin [mpe: Change log and comment massaging] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/idle_book3s.S | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index f6518c768d2a..5adb390e773b 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -256,6 +256,19 @@ power_enter_stop: bne .Lhandle_esl_ec_set IDLE_STATE_ENTER_SEQ(PPC_STOP) li r3,0 /* Since we didn't lose state, return 0 */ + + /* + * pnv_wakeup_noloss() expects r12 to contain the SRR1 value so + * it can determine if the wakeup reason is an HMI in + * CHECK_HMI_INTERRUPT. + * + * However, when we wake up with ESL=0, SRR1 will not contain the wakeup + * reason, so there is no point setting r12 to SRR1. + * + * Further, we clear r12 here, so that we don't accidentally enter the + * HMI in pnv_wakeup_noloss() if the value of r12[42:45] == WAKE_HMI. + */ + li r12, 0 b pnv_wakeup_noloss .Lhandle_esl_ec_set: From fd851a3cdc196bfc1d229b5f22369069af532bf8 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 29 May 2017 12:22:23 +1000 Subject: [PATCH 120/158] spin loop primitives for busy waiting Current busy-wait loops are implemented by repeatedly calling cpu_relax(), to give the architecture a low-latency option to improve power use and/or SMT resource contention. 
This poses some difficulties for powerpc, which has SMT priority setting
instructions (priorities determine how ifetch cycles are apportioned).
powerpc's cpu_relax() is implemented by setting a low priority then
setting normal priority. This has several problems:

 - Changing thread priority can have some execution cost and a potential
   impact on other threads in the core. It's inefficient to execute these
   instructions every time around a busy-wait loop.

 - Depending on implementation details, a `low ; medium` sequence may not
   have much if any effect. Some software with a similar pattern actually
   inserts a lot of nops in between, in order to cause a few fetch cycles
   with the low priority.

 - The busy-wait loop runs with regular priority. This might only be a few
   fetch cycles, but if there are several threads running such loops, they
   could cause a noticeable impact on a non-idle thread.

Implement spin_begin and spin_end primitives that can be used around
busy-wait loops; they default to no-ops. Also implement spin_cpu_relax,
which defaults to cpu_relax().

This will allow architectures to hook the entry and exit of busy-wait
loops, and will allow powerpc to set low SMT priority at entry, and
normal priority at exit.

Suggested-by: Linus Torvalds
Signed-off-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
---
 include/linux/processor.h | 70 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 include/linux/processor.h

diff --git a/include/linux/processor.h b/include/linux/processor.h
new file mode 100644
index 000000000000..da0c5e56ca02
--- /dev/null
+++ b/include/linux/processor.h
@@ -0,0 +1,70 @@
+/* Misc low level processor primitives */
+#ifndef _LINUX_PROCESSOR_H
+#define _LINUX_PROCESSOR_H
+
+#include
+
+/*
+ * spin_begin is used before beginning a busy-wait loop, and must be paired
+ * with spin_end when the loop is exited. spin_cpu_relax must be called
+ * within the loop.
+ *
+ * The loop body should be as small and fast as possible, on the order of
+ * tens of instructions/cycles as a guide. It should avoid calling
+ * cpu_relax, or any "spin" or sleep type of primitive, including nested
+ * uses of these primitives. It should not lock or take any other resource.
+ * Violations of these guidelines will not cause a bug, but may cause
+ * suboptimal performance.
+ *
+ * These loops are optimized to be used where wait times are expected to be
+ * less than the cost of a context switch (and associated overhead).
+ *
+ * Detection of resource owner and decision to spin or sleep or guest-yield
+ * (e.g., spin lock holder vcpu preempted, or mutex owner not on CPU) can be
+ * tested within the loop body.
+ */
+#ifndef spin_begin
+#define spin_begin()
+#endif
+
+#ifndef spin_cpu_relax
+#define spin_cpu_relax() cpu_relax()
+#endif
+
+/*
+ * spin_cpu_yield may be called to yield (undirected) to the hypervisor if
+ * necessary. This should be used if the wait is expected to take longer
+ * than context switch overhead, but we can't sleep or do a directed yield.
+ */
+#ifndef spin_cpu_yield
+#define spin_cpu_yield() cpu_relax_yield()
+#endif
+
+#ifndef spin_end
+#define spin_end()
+#endif
+
+/*
+ * spin_until_cond can be used to wait for a condition to become true. The
+ * first iteration may be expected to be true in the common case (no
+ * spinning), so callers should not require a first "likely" test for the
+ * uncontended case before using this primitive.
+ *
+ * Usage and implementation guidelines are the same as for the spin_begin
+ * primitives, above.
+ */ +#ifndef spin_until_cond +#define spin_until_cond(cond) \ +do { \ + if (unlikely(!(cond))) { \ + spin_begin(); \ + do { \ + spin_cpu_relax(); \ + } while (!(cond)); \ + spin_end(); \ + } \ +} while (0) + +#endif + +#endif /* _LINUX_PROCESSOR_H */ From ede8e2bbb0eb3370e4dc5484b40eb22850a09b92 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 6 Jun 2017 23:08:31 +1000 Subject: [PATCH 121/158] powerpc/64: implement spin loop primitives Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/processor.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index c49165a7439c..832775771bd3 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -428,6 +428,26 @@ static inline unsigned long __pack_fe01(unsigned int fpmode) #ifdef CONFIG_PPC64 #define cpu_relax() do { HMT_low(); HMT_medium(); barrier(); } while (0) + +#define spin_begin() HMT_low() + +#define spin_cpu_relax() barrier() + +#define spin_cpu_yield() spin_cpu_relax() + +#define spin_end() HMT_medium() + +#define spin_until_cond(cond) \ +do { \ + if (unlikely(!(cond))) { \ + spin_begin(); \ + do { \ + spin_cpu_relax(); \ + } while (!(cond)); \ + spin_end(); \ + } \ +} while (0) + #else #define cpu_relax() barrier() #endif From 4e287e655e108cbbd6e3e7dcc49d591c8aa5a8a4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 6 Jun 2017 23:08:32 +1000 Subject: [PATCH 122/158] powerpc: use spin loop primitives in some functions Use the different spin loop primitives in some simple powerpc spin loops, including those which will spin as a common case. This will help to test the spin loop primitives before more conversions are done. 
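To make the conversion pattern concrete, here is a rough sketch
(illustrative only, with a hypothetical 'flag' standing in for whatever
condition the caller polls). An open-coded wait such as:

	while (!READ_ONCE(flag))
		cpu_relax();

becomes:

	spin_begin();			/* powerpc: HMT_low() */
	while (!READ_ONCE(flag))
		spin_cpu_relax();	/* powerpc: barrier() */
	spin_end();			/* powerpc: HMT_medium() */

or, for the simple wait-for-condition case:

	spin_until_cond(READ_ONCE(flag));

so the SMT priority is lowered once on loop entry and restored once on
exit, rather than toggled on every iteration.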
Signed-off-by: Nicholas Piggin [mpe: Add some includes of ] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/delay.h | 16 ++++++++++++---- arch/powerpc/kernel/smp.c | 4 ++-- arch/powerpc/kernel/time.c | 8 +++++--- arch/powerpc/mm/hash_native_64.c | 5 ++++- 4 files changed, 23 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/delay.h b/arch/powerpc/include/asm/delay.h index 52e4d54da2a9..3df4417dd9c8 100644 --- a/arch/powerpc/include/asm/delay.h +++ b/arch/powerpc/include/asm/delay.h @@ -2,6 +2,7 @@ #define _ASM_POWERPC_DELAY_H #ifdef __KERNEL__ +#include #include /* @@ -58,11 +59,18 @@ extern void udelay(unsigned long usecs); typeof(condition) __ret; \ unsigned long __loops = tb_ticks_per_usec * timeout; \ unsigned long __start = get_tbl(); \ - while (!(__ret = (condition)) && (tb_ticks_since(__start) <= __loops)) \ - if (delay) \ + \ + if (delay) { \ + while (!(__ret = (condition)) && \ + (tb_ticks_since(__start) <= __loops)) \ udelay(delay); \ - else \ - cpu_relax(); \ + } else { \ + spin_begin(); \ + while (!(__ret = (condition)) && \ + (tb_ticks_since(__start) <= __loops)) \ + spin_cpu_relax(); \ + spin_end(); \ + } \ if (!__ret) \ __ret = (condition); \ __ret; \ diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 418019728efa..a975ddf77200 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -767,8 +768,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) smp_ops->give_timebase(); /* Wait until cpu puts itself in the online & active maps */ - while (!cpu_online(cpu)) - cpu_relax(); + spin_until_cond(cpu_online(cpu)); return 0; } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 0cc0dad905d5..fe6f3a285455 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -59,10 +59,10 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -442,6 +442,7 @@ void __delay(unsigned long loops) unsigned long start; int diff; + spin_begin(); if (__USE_RTC()) { start = get_rtcl(); do { @@ -449,13 +450,14 @@ void __delay(unsigned long loops) diff = get_rtcl() - start; if (diff < 0) diff += 1000000000; + spin_cpu_relax(); } while (diff < loops); } else { start = get_tbl(); while (get_tbl() - start < loops) - HMT_low(); - HMT_medium(); + spin_cpu_relax(); } + spin_end(); } EXPORT_SYMBOL(__delay); diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index bdaac28193f7..fbd1acc3b8d9 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -184,8 +185,10 @@ static inline void native_lock_hpte(struct hash_pte *hptep) while (1) { if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) break; + spin_begin(); while(test_bit(HPTE_LOCK_BIT, word)) - cpu_relax(); + spin_cpu_relax(); + spin_end(); } } From 5fc3a7f754b9f89744b1b45533b166db1fb2053c Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Wed, 28 Jun 2017 16:19:20 +1000 Subject: [PATCH 123/158] MAINTAINERS: Remove myself as cxl maintainer I am no longer employed by IBM and will no longer have access to cxl hardware, so remove myself as a cxl maintainer. 
If anyone needs to contact me in the future, please use my personal email
address darkstarsword@gmail.com

Signed-off-by: Ian Munsie
Cc: Frederic Barrat
Cc: Andrew Donnellan
Cc: linuxppc-dev@lists.ozlabs.org
Acked-by: Frederic Barrat
Signed-off-by: Michael Ellerman
---
 MAINTAINERS | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 25eecfef41b7..ecf65798eedb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3721,7 +3721,6 @@ S:	Supported
 F:	drivers/net/ethernet/chelsio/cxgb4vf/
 
 CXL (IBM Coherent Accelerator Processor Interface CAPI) DRIVER
-M:	Ian Munsie
 M:	Frederic Barrat
 L:	linuxppc-dev@lists.ozlabs.org
 S:	Supported

From 8c7d0a04066b0d9d95134e7853e8c26b84173108 Mon Sep 17 00:00:00 2001
From: Andrew Donnellan
Date: Wed, 28 Jun 2017 17:22:30 +1000
Subject: [PATCH 124/158] MAINTAINERS: cxl: update maintainership

As Ian's stepping down from his maintainer role now that he's leaving
IBM, Frederic has asked me to add myself to the cxl maintainer list.
Updating accordingly.

Cc: Frederic Barrat
Cc: Ian Munsie
Signed-off-by: Andrew Donnellan
Acked-by: Frederic Barrat
Acked-by: Ian Munsie
Signed-off-by: Michael Ellerman
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index ecf65798eedb..29410ede3655 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3722,6 +3722,7 @@ F:	drivers/net/ethernet/chelsio/cxgb4vf/
 
 CXL (IBM Coherent Accelerator Processor Interface CAPI) DRIVER
 M:	Frederic Barrat
+M:	Andrew Donnellan
 L:	linuxppc-dev@lists.ozlabs.org
 S:	Supported
 F:	arch/powerpc/platforms/powernv/pci-cxl.c

From 7a849a6cf3ff676d1726259c18fa413f9e0c62a3 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran
Date: Fri, 30 Jun 2017 16:52:35 +1000
Subject: [PATCH 125/158] powerpc/hugetlbfs: Export HPAGE_SHIFT

Export it so it can be referenced inside a module.

Signed-off-by: Oliver O'Halloran
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/hugetlbpage.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 94e56b10e4cc..19659081da72 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -32,6 +32,7 @@
 #define PAGE_SHIFT_16G	34
 
 unsigned int HPAGE_SHIFT;
+EXPORT_SYMBOL(HPAGE_SHIFT);
 
 /*
  * Tracks gpages after the device tree is scanned and before the

From 65f7d049788763969180c72ef98dab8bf0340c55 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran
Date: Wed, 28 Jun 2017 11:32:31 +1000
Subject: [PATCH 126/158] mm, x86: Add ARCH_HAS_ZONE_DEVICE to Kconfig

Currently ZONE_DEVICE depends on X86_64 and this will get unwieldy as
new architectures (and platforms) get ZONE_DEVICE support. Move to an
arch-selected Kconfig option to save us the trouble.
Cc: linux-mm@kvack.org Acked-by: Ingo Molnar Acked-by: Balbir Singh Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman --- arch/x86/Kconfig | 1 + mm/Kconfig | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4ccfacc7232a..32903e30e347 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -59,6 +59,7 @@ config X86 select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_ZONE_DEVICE if X86_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/mm/Kconfig b/mm/Kconfig index beb7a455915d..790e52a8a486 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -683,12 +683,16 @@ config IDLE_PAGE_TRACKING See Documentation/vm/idle_page_tracking.txt for more details. +# arch_add_memory() comprehends device memory +config ARCH_HAS_ZONE_DEVICE + bool + config ZONE_DEVICE bool "Device memory (pmem, etc...) hotplug support" depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP - depends on X86_64 #arch_add_memory() comprehends device memory + depends on ARCH_HAS_ZONE_DEVICE help Device memory hotplug support allows for establishing pmem, From d7d9b612f1b0ac154071bafc02bc9b7b879d7363 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 28 Jun 2017 11:32:32 +1000 Subject: [PATCH 127/158] powerpc/vmemmap: Reshuffle vmemmap_free() Removes an indentation level and shuffles some code around to make the following patch cleaner. No functional changes. Reviewed-by: Balbir Singh Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman --- arch/powerpc/mm/init_64.c | 44 ++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index f1a4964f9317..f855a2442c97 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -255,13 +255,15 @@ static unsigned long vmemmap_list_free(unsigned long start) void __ref vmemmap_free(unsigned long start, unsigned long end) { unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; + unsigned long page_order = get_order(page_size); start = _ALIGN_DOWN(start, page_size); pr_debug("vmemmap_free %lx...%lx\n", start, end); for (; start < end; start += page_size) { - unsigned long addr; + unsigned long nr_pages, addr; + struct page *page; /* * the section has already be marked as invalid, so @@ -272,29 +274,29 @@ void __ref vmemmap_free(unsigned long start, unsigned long end) continue; addr = vmemmap_list_free(start); - if (addr) { - struct page *page = pfn_to_page(addr >> PAGE_SHIFT); + if (!addr) + continue; - if (PageReserved(page)) { - /* allocated from bootmem */ - if (page_size < PAGE_SIZE) { - /* - * this shouldn't happen, but if it is - * the case, leave the memory there - */ - WARN_ON_ONCE(1); - } else { - unsigned int nr_pages = - 1 << get_order(page_size); - while (nr_pages--) - free_reserved_page(page++); - } - } else - free_pages((unsigned long)(__va(addr)), - get_order(page_size)); + page = pfn_to_page(addr >> PAGE_SHIFT); + nr_pages = 1 << page_order; - vmemmap_remove_mapping(start, page_size); + if (PageReserved(page)) { + /* allocated from bootmem */ + if (page_size < PAGE_SIZE) { + /* + * this shouldn't happen, but if it is + * the case, leave the memory there + */ + WARN_ON_ONCE(1); + } else { + while (nr_pages--) + free_reserved_page(page++); + } + } else { + free_pages((unsigned long)(__va(addr)), 
page_order);
+		}
+
+		vmemmap_remove_mapping(start, page_size);
 	}
 }

From b584c2544041707ea041748dbfbb1081289c6cf5 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran
Date: Wed, 28 Jun 2017 11:32:33 +1000
Subject: [PATCH 128/158] powerpc/vmemmap: Add altmap support

Adds support to powerpc for the altmap feature of ZONE_DEVICE memory. An
altmap is a driver-provided region that is used to provide the backing
storage for the struct pages of ZONE_DEVICE memory. In situations where
a large amount of ZONE_DEVICE memory is being added to the system, the
altmap reduces pressure on main system memory by allowing the mm/
metadata to be stored on the device itself rather than in main memory.

Reviewed-by: Balbir Singh
Signed-off-by: Oliver O'Halloran
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/init_64.c | 15 +++++++++++++--
 arch/powerpc/mm/mem.c     | 16 +++++++++++++---
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index f855a2442c97..5b4c25d12ff3 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -44,6 +44,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -192,13 +193,17 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
 	pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);
 
 	for (; start < end; start += page_size) {
+		struct vmem_altmap *altmap;
 		void *p;
 		int rc;
 
 		if (vmemmap_populated(start, page_size))
 			continue;
 
-		p = vmemmap_alloc_block(page_size, node);
+		/* altmap lookups only work at section boundaries */
+		altmap = to_vmem_altmap(SECTION_ALIGN_DOWN(start));
+
+		p = __vmemmap_alloc_block_buf(page_size, node, altmap);
 		if (!p)
 			return -ENOMEM;
 
@@ -263,6 +268,8 @@ void __ref vmemmap_free(unsigned long start, unsigned long end)
 
 	for (; start < end; start += page_size) {
 		unsigned long nr_pages, addr;
+		struct vmem_altmap *altmap;
+		struct page *section_base;
 		struct page *page;
 
 		/*
@@ -278,9 +285,13 @@ void __ref vmemmap_free(unsigned long start, unsigned long end)
 			continue;
 
 		page = pfn_to_page(addr >> PAGE_SHIFT);
+		section_base = pfn_to_page(vmemmap_section_start(start));
 		nr_pages = 1 << page_order;
 
-		if (PageReserved(page)) {
+		altmap = to_vmem_altmap((unsigned long) section_base);
+		if (altmap) {
+			vmem_altmap_free(altmap, nr_pages);
+		} else if (PageReserved(page)) {
 			/* allocated from bootmem */
 			if (page_size < PAGE_SIZE) {
 				/*

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 04f4c9852791..6f71d259de68 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -36,6 +36,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -159,11 +160,20 @@ int arch_remove_memory(u64 start, u64 size)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
-	struct zone *zone;
+	struct vmem_altmap *altmap;
+	struct page *page;
 	int ret;
 
-	zone = page_zone(pfn_to_page(start_pfn));
-	ret = __remove_pages(zone, start_pfn, nr_pages);
+	/*
+	 * If we have an altmap then we need to skip over any reserved PFNs
+	 * when querying the zone.
+ */ + page = pfn_to_page(start_pfn); + altmap = to_vmem_altmap((unsigned long) page); + if (altmap) + page += vmem_altmap_offset(altmap); + + ret = __remove_pages(page_zone(page), start_pfn, nr_pages); if (ret) return ret; From ebd31197931d75c837269f3fd3365b4cee358faf Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 28 Jun 2017 11:32:34 +1000 Subject: [PATCH 129/158] powerpc/mm: Add devmap support for ppc64 Add support for the devmap bit on PTEs and PMDs for PPC64 Book3S. This is used to differentiate device backed memory from transparent huge pages since they are handled in more or less the same manner by the core mm code. Cc: Aneesh Kumar K.V Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/pgtable.h | 45 ++++++++++++++++++++ arch/powerpc/include/asm/book3s/64/radix.h | 2 +- arch/powerpc/mm/hugetlbpage.c | 2 +- arch/powerpc/mm/pgtable-book3s64.c | 4 +- arch/powerpc/mm/pgtable-hash64.c | 4 +- arch/powerpc/mm/pgtable-radix.c | 3 +- arch/powerpc/mm/pgtable_64.c | 2 +- 7 files changed, 55 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 85bc9875c3be..c0737c86a362 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -5,6 +5,7 @@ #ifndef __ASSEMBLY__ #include +#include #endif /* @@ -79,6 +80,9 @@ #define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */ #define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */ +#define _PAGE_DEVMAP _RPAGE_SW1 /* software: ZONE_DEVICE page */ +#define __HAVE_ARCH_PTE_DEVMAP + /* * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE * Instead of fixing all of them, add an alternate define which @@ -599,6 +603,16 @@ static inline pte_t pte_mkhuge(pte_t pte) return pte; } +static inline pte_t pte_mkdevmap(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SPECIAL|_PAGE_DEVMAP); +} + +static inline int pte_devmap(pte_t pte) +{ + return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP)); +} + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { /* FIXME!! check whether this need to be a conditional */ @@ -1146,6 +1160,37 @@ static inline bool arch_needs_pgtable_deposit(void) return true; } + +static inline pmd_t pmd_mkdevmap(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP)); +} + +static inline int pmd_devmap(pmd_t pmd) +{ + return pte_devmap(pmd_pte(pmd)); +} + +static inline int pud_devmap(pud_t pud) +{ + return 0; +} + +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static inline const int pud_pfn(pud_t pud) +{ + /* + * Currently all calls to pud_pfn() are gated around a pud_devmap() + * check so this should never be used. If it grows another user we + * want to know about it. 
+ */ + BUILD_BUG(); + return 0; +} #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index ac16d1943022..ba43754e96d2 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -252,7 +252,7 @@ static inline int radix__pgd_bad(pgd_t pgd) static inline int radix__pmd_trans_huge(pmd_t pmd) { - return !!(pmd_val(pmd) & _PAGE_PTE); + return (pmd_val(pmd) & (_PAGE_PTE | _PAGE_DEVMAP)) == _PAGE_PTE; } static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 19659081da72..1ca196c00b2a 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -964,7 +964,7 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, if (pmd_none(pmd)) return NULL; - if (pmd_trans_huge(pmd)) { + if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { if (is_thp) *is_thp = true; ret_pte = (pte_t *) pmdp; diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 5fcb3dd74c13..31eed8fa8e99 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -32,7 +32,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, { int changed; #ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp)); + WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); assert_spin_locked(&vma->vm_mm->page_table_lock); #endif changed = !pmd_same(*(pmdp), entry); @@ -59,7 +59,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, #ifdef CONFIG_DEBUG_VM WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); assert_spin_locked(&mm->page_table_lock); - WARN_ON(!pmd_trans_huge(pmd)); + WARN_ON(!(pmd_trans_huge(pmd) || pmd_devmap(pmd))); #endif trace_hugepage_set_pmd(addr, pmd_val(pmd)); return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 16877f60fb87..a0facee58811 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -184,7 +184,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr unsigned long old; #ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp)); + WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); assert_spin_locked(&mm->page_table_lock); #endif @@ -216,6 +216,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(pmd_trans_huge(*pmdp)); + VM_BUG_ON(pmd_devmap(*pmdp)); pmd = *pmdp; pmd_clear(pmdp); @@ -296,6 +297,7 @@ void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, { VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); + VM_BUG_ON(pmd_devmap(*pmdp)); /* * We can't mark the pmd none here, because that will cause a race diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 419199d68928..f6af90371b1e 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -696,7 +696,7 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add unsigned long old; #ifdef CONFIG_DEBUG_VM - WARN_ON(!radix__pmd_trans_huge(*pmdp)); + WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); assert_spin_locked(&mm->page_table_lock); #endif @@ -714,6 +714,7 @@ pmd_t radix__pmdp_collapse_flush(struct 
vm_area_struct *vma, unsigned long addre VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(radix__pmd_trans_huge(*pmdp)); + VM_BUG_ON(pmd_devmap(*pmdp)); /* * khugepaged calls this for normal pmd */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 1feb36e58a45..bce0ed50789c 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -324,7 +324,7 @@ struct page *pud_page(pud_t pud) */ struct page *pmd_page(pmd_t pmd) { - if (pmd_trans_huge(pmd) || pmd_huge(pmd)) + if (pmd_trans_huge(pmd) || pmd_huge(pmd) || pmd_devmap(pmd)) return pte_page(pmd_pte(pmd)); return virt_to_page(pmd_page_vaddr(pmd)); } From 1b644f57b3a8a4e05720ff88e87450c6f2e3f356 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 28 Jun 2017 11:32:35 +1000 Subject: [PATCH 130/158] powerpc/mm: Wire up hpte_removebolted for powernv Adds support for removing bolted (i.e kernel linear mapping) mappings on powernv. This is needed to support memory hot unplug operations which are required for the teardown of DAX/PMEM devices. Reviewed-by: Balbir Singh Reviewed-by: Rashmica Gupta Signed-off-by: Anton Blanchard Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hash_native_64.c | 33 ++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index fbd1acc3b8d9..3848af167df9 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -413,6 +413,38 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, tlbie(vpn, psize, psize, ssize, 0); } +/* + * Remove a bolted kernel entry. Memory hotplug uses this. + * + * No need to lock here because we should be the only user. + */ +static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) +{ + unsigned long vpn; + unsigned long vsid; + long slot; + struct hash_pte *hptep; + + vsid = get_kernel_vsid(ea, ssize); + vpn = hpt_vpn(ea, vsid, ssize); + + slot = native_hpte_find(vpn, psize, ssize); + if (slot == -1) + return -ENOENT; + + hptep = htab_address + slot; + + VM_WARN_ON(!(be64_to_cpu(hptep->v) & HPTE_V_BOLTED)); + + /* Invalidate the hpte */ + hptep->v = 0; + + /* Invalidate the TLB */ + tlbie(vpn, psize, psize, ssize, 0); + return 0; +} + + static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, int bpsize, int apsize, int ssize, int local) { @@ -731,6 +763,7 @@ void __init hpte_init_native(void) mmu_hash_ops.hpte_invalidate = native_hpte_invalidate; mmu_hash_ops.hpte_updatepp = native_hpte_updatepp; mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp; + mmu_hash_ops.hpte_removebolted = native_hpte_removebolted; mmu_hash_ops.hpte_insert = native_hpte_insert; mmu_hash_ops.hpte_remove = native_hpte_remove; mmu_hash_ops.hpte_clear_all = native_hpte_clear; From c07424414c28c8ae9a54803a4496e4d8718b3613 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 28 Jun 2017 11:32:36 +1000 Subject: [PATCH 131/158] powerpc/mm: Enable ZONE_DEVICE on powerpc Flip the switch. Running around and screaming "IT'S ALIVE" is optional, but recommended. 
Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7ee79e0ac88e..eb0e591c867d 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -130,6 +130,7 @@ config PPC select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO From 12bf85a71000af7419b19b5e90910919f36f336c Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 29 Jun 2017 18:55:31 -0300 Subject: [PATCH 132/158] powerpc/perf/hv-24x7: Fix passing of catalog version number H_GET_24X7_CATALOG_PAGE needs to be passed the version number obtained from the first catalog page obtained previously. This is a 64 bit number, but create_events_from_catalog truncates it to 32-bit. This worked on POWER8, but POWER9 actually uses the upper bits so the call fails with H_P3 because the hypervisor doesn't recognize the version. This patch also adds the hcall return code to the error message, which is helpful when debugging the problem. Fixes: 5c5cd7b50259 ("powerpc/perf/hv-24x7: parse catalog and populate sysfs with events") Reviewed-by: Sukadev Bhattiprolu Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman --- arch/powerpc/perf/hv-24x7.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 7b2ca16b1eb4..1354cde7095c 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -670,7 +670,7 @@ static int create_events_from_catalog(struct attribute ***events_, event_data_bytes, junk_events, event_idx, event_attr_ct, i, attr_max, event_idx_last, desc_ct, long_desc_ct; ssize_t ct, ev_len; - uint32_t catalog_version_num; + uint64_t catalog_version_num; struct attribute **events, **event_descs, **event_long_descs; struct hv_24x7_catalog_page_0 *page_0 = kmem_cache_alloc(hv_page_cache, GFP_KERNEL); @@ -706,8 +706,8 @@ static int create_events_from_catalog(struct attribute ***events_, event_data_offs = be16_to_cpu(page_0->event_data_offs); event_data_len = be16_to_cpu(page_0->event_data_len); - pr_devel("cv %zu cl %zu eec %zu edo %zu edl %zu\n", - (size_t)catalog_version_num, catalog_len, + pr_devel("cv %llu cl %zu eec %zu edo %zu edl %zu\n", + catalog_version_num, catalog_len, event_entry_count, event_data_offs, event_data_len); if ((MAX_4K < event_data_len) @@ -761,8 +761,8 @@ static int create_events_from_catalog(struct attribute ***events_, catalog_version_num, i + event_data_offs); if (hret) { - pr_err("failed to get event data in page %zu\n", - i + event_data_offs); + pr_err("Failed to get event data in page %zu: rc=%ld\n", + i + event_data_offs, hret); ret = -EIO; goto e_event_data; } From 36c8fb2c616d9373758b155d9723774353067a87 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 29 Jun 2017 18:55:32 -0300 Subject: [PATCH 133/158] powerpc/perf/hv-24x7: Fix off-by-one error in request_buffer check request_buffer can hold 254 requests, so if it already has that number of entries we can't add a new one. Also, define constant to show where the number comes from. 
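For reference, the arithmetic behind the new constant (illustrative
only, using the sizes the v1 structures in this driver imply: a
4096-byte data buffer whose buffer struct, including its single trailing
request slot, is 32 bytes, and 16-byte requests):

	/*
	 * MAX_NUM_REQUESTS = (H24x7_DATA_BUFFER_SIZE
	 *			- sizeof(struct hv_24x7_request_buffer))
	 *			/ sizeof(struct hv_24x7_request)
	 *		    = (4096 - 32) / 16 = 254
	 */

With the old check 'num_requests > 254', a buffer that already held 254
requests would still accept a 255th; 'num_requests >= MAX_NUM_REQUESTS'
closes that off-by-one.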
Fixes: e3ee15dc5d19 ("powerpc/perf/hv-24x7: Define add_event_to_24x7_request()") Reviewed-by: Sukadev Bhattiprolu Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman --- arch/powerpc/perf/hv-24x7.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 1354cde7095c..141de0f708f7 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -166,6 +166,10 @@ DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw); DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); +#define MAX_NUM_REQUESTS ((H24x7_DATA_BUFFER_SIZE - \ + sizeof(struct hv_24x7_request_buffer)) \ + / sizeof(struct hv_24x7_request)) + static char *event_name(struct hv_24x7_event_data *ev, int *len) { *len = be16_to_cpu(ev->event_name_len) - 2; @@ -1107,7 +1111,7 @@ static int add_event_to_24x7_request(struct perf_event *event, int i; struct hv_24x7_request *req; - if (request_buffer->num_requests > 254) { + if (request_buffer->num_requests >= MAX_NUM_REQUESTS) { pr_devel("Too many requests for 24x7 HCALL %d\n", request_buffer->num_requests); return -EINVAL; From 41f577eb012f33d9af9c9fcee6a68a499562f875 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 29 Jun 2017 18:55:33 -0300 Subject: [PATCH 134/158] powerpc/perf/hv-24x7: Properly iterate through results hv-24x7.h has a comment mentioning that result_buffer->results can't be indexed as a normal array because it may contain results of variable sizes, so fix the loop in h_24x7_event_commit_txn to take the variation into account when iterating through results. Another problem in that loop is that it sets h24x7hw->events[i] to NULL. This assumes that only the i'th result maps to the i'th request, but that is not guaranteed to be true. We need to leave the event in the array so that we don't dereference a NULL pointer in case more than one result maps to one request. We still assume that each result has only one result element, so warn if that assumption is violated. Reviewed-by: Sukadev Bhattiprolu Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman --- arch/powerpc/perf/hv-24x7.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 141de0f708f7..8b1a3e849d23 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -1144,7 +1144,9 @@ static int add_event_to_24x7_request(struct perf_event *event, static unsigned long single_24x7_request(struct perf_event *event, u64 *count) { + u16 num_elements; unsigned long ret; + struct hv_24x7_result *result; struct hv_24x7_request_buffer *request_buffer; struct hv_24x7_data_result_buffer *result_buffer; @@ -1166,8 +1168,14 @@ static unsigned long single_24x7_request(struct perf_event *event, u64 *count) goto out; } + result = result_buffer->results; + + /* This code assumes that a result has only one element. 
*/ + num_elements = be16_to_cpu(result->num_elements_returned); + WARN_ON_ONCE(num_elements != 1); + /* process result from hcall */ - *count = be64_to_cpu(result_buffer->results[0].elements[0].element_data[0]); + *count = be64_to_cpu(result->elements[0].element_data[0]); out: put_cpu_var(hv_24x7_reqb); @@ -1400,8 +1408,7 @@ static int h_24x7_event_commit_txn(struct pmu *pmu) { struct hv_24x7_request_buffer *request_buffer; struct hv_24x7_data_result_buffer *result_buffer; - struct hv_24x7_result *resb; - struct perf_event *event; + struct hv_24x7_result *res, *next_res; u64 count; int i, ret, txn_flags; struct hv_24x7_hw *h24x7hw; @@ -1428,13 +1435,20 @@ static int h_24x7_event_commit_txn(struct pmu *pmu) h24x7hw = &get_cpu_var(hv_24x7_hw); - /* Update event counts from hcall */ - for (i = 0; i < request_buffer->num_requests; i++) { - resb = &result_buffer->results[i]; - count = be64_to_cpu(resb->elements[0].element_data[0]); - event = h24x7hw->events[i]; - h24x7hw->events[i] = NULL; + /* Go through results in the result buffer to update event counts. */ + for (i = 0, res = result_buffer->results; + i < result_buffer->num_results; i++, res = next_res) { + struct perf_event *event = h24x7hw->events[res->result_ix]; + u16 num_elements = be16_to_cpu(res->num_elements_returned); + u16 data_size = be16_to_cpu(res->result_element_data_size); + + /* This code assumes that a result has only one element. */ + WARN_ON_ONCE(num_elements != 1); + + count = be64_to_cpu(res->elements[0].element_data[0]); update_event_count(event, count); + + next_res = (void *) res->elements[0].element_data + data_size; } put_cpu_var(hv_24x7_hw); From 62714a14928061f496308a9d4635bf986642046d Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 29 Jun 2017 18:55:34 -0300 Subject: [PATCH 135/158] powerpc-perf/hx-24x7: Don't log failed hcall twice make_24x7_request already calls log_24x7_hcall if it fails, so callers don't have to do it again. In fact, since the latter is now only called from the former, there's no need for a separate log_24x7_hcall anymore so remove it. Reviewed-by: Sukadev Bhattiprolu Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman --- arch/powerpc/perf/hv-24x7.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 8b1a3e849d23..111c61ea7416 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -1044,21 +1044,6 @@ static const struct attribute_group *attr_groups[] = { NULL, }; -static void log_24x7_hcall(struct hv_24x7_request_buffer *request_buffer, - struct hv_24x7_data_result_buffer *result_buffer, - unsigned long ret) -{ - struct hv_24x7_request *req; - - req = &request_buffer->requests[0]; - pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => " - "ret 0x%lx (%ld) detail=0x%x failing ix=%x\n", - req->performance_domain, req->data_offset, - req->starting_ix, req->starting_lpar_ix, ret, ret, - result_buffer->detailed_rc, - result_buffer->failing_request_ix); -} - /* * Start the process for a new H_GET_24x7_DATA hcall. 
*/ @@ -1091,8 +1076,16 @@ static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer, virt_to_phys(request_buffer), H24x7_DATA_BUFFER_SIZE, virt_to_phys(result_buffer), H24x7_DATA_BUFFER_SIZE); - if (ret) - log_24x7_hcall(request_buffer, result_buffer, ret); + if (ret) { + struct hv_24x7_request *req; + + req = &request_buffer->requests[0]; + pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => ret 0x%lx (%ld) detail=0x%x failing ix=%x\n", + req->performance_domain, req->data_offset, + req->starting_ix, req->starting_lpar_ix, + ret, ret, result_buffer->detailed_rc, + result_buffer->failing_request_ix); + } return ret; } @@ -1163,10 +1156,8 @@ static unsigned long single_24x7_request(struct perf_event *event, u64 *count) goto out; ret = make_24x7_request(request_buffer, result_buffer); - if (ret) { - log_24x7_hcall(request_buffer, result_buffer, ret); + if (ret) goto out; - } result = result_buffer->results; @@ -1428,10 +1419,8 @@ static int h_24x7_event_commit_txn(struct pmu *pmu) result_buffer = (void *)get_cpu_var(hv_24x7_resb); ret = make_24x7_request(request_buffer, result_buffer); - if (ret) { - log_24x7_hcall(request_buffer, result_buffer, ret); + if (ret) goto put_reqb; - } h24x7hw = &get_cpu_var(hv_24x7_hw); From 38d81846106bb1e14fcf2cf390f7cdeb7fdab550 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 29 Jun 2017 18:55:35 -0300 Subject: [PATCH 136/158] powerpc/perf/hv-24x7: Fix return value of hcalls The H_GET_24X7_CATALOG_PAGE hcall can return a signed error code, so fix this in the code. The H_GET_24X7_DATA hcall can return a signed error code, so fix this in the code. Also, don't truncate it to 32 bit to use as return value for make_24x7_request. In case of error h_24x7_event_commit_txn passes that return value to generic code, so it should be a proper errno. The other caller of make_24x7_request is single_24x7_request, whose callers don't actually care which error code is returned so they are not affected by this change. Finally, h_24x7_get_value doesn't use the error code from single_24x7_request, so there's no need to store it. 
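As a sketch of the failure mode being avoided (illustrative, not code
from this patch): hypervisor status codes such as H_P3 are negative
values that fit in a long, so storing one in an unsigned long still
compares non-zero, but truncating it to a 32-bit int or returning it
where generic code expects a negative errno produces a bogus value:

	long hret = h_get_24x7_catalog_page(page, version, index);
	if (hret)		/* any non-zero status is a failure */
		return -EIO;	/* give callers a real errno, not an hcall status */

which is the shape make_24x7_request now follows.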
Reviewed-by: Sukadev Bhattiprolu Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman --- arch/powerpc/perf/hv-24x7.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 111c61ea7416..7e47dcdea62a 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -264,9 +264,8 @@ static void *event_end(struct hv_24x7_event_data *ev, void *end) return start + nl + dl + ldl; } -static unsigned long h_get_24x7_catalog_page_(unsigned long phys_4096, - unsigned long version, - unsigned long index) +static long h_get_24x7_catalog_page_(unsigned long phys_4096, + unsigned long version, unsigned long index) { pr_devel("h_get_24x7_catalog_page(0x%lx, %lu, %lu)", phys_4096, version, index); @@ -277,8 +276,7 @@ static unsigned long h_get_24x7_catalog_page_(unsigned long phys_4096, phys_4096, version, index); } -static unsigned long h_get_24x7_catalog_page(char page[], - u64 version, u32 index) +static long h_get_24x7_catalog_page(char page[], u64 version, u32 index) { return h_get_24x7_catalog_page_(virt_to_phys(page), version, index); @@ -668,7 +666,7 @@ static int create_events_from_catalog(struct attribute ***events_, struct attribute ***event_descs_, struct attribute ***event_long_descs_) { - unsigned long hret; + long hret; size_t catalog_len, catalog_page_len, event_entry_count, event_data_len, event_data_offs, event_data_bytes, junk_events, event_idx, event_attr_ct, i, @@ -907,7 +905,7 @@ static ssize_t catalog_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t offset, size_t count) { - unsigned long hret; + long hret; ssize_t ret = 0; size_t catalog_len = 0, catalog_page_len = 0; loff_t page_offset = 0; @@ -992,7 +990,7 @@ static ssize_t _name##_show(struct device *dev, \ struct device_attribute *dev_attr, \ char *buf) \ { \ - unsigned long hret; \ + long hret; \ ssize_t ret = 0; \ void *page = kmem_cache_alloc(hv_page_cache, GFP_USER); \ struct hv_24x7_catalog_page_0 *page_0 = page; \ @@ -1065,7 +1063,7 @@ static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer, static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer, struct hv_24x7_data_result_buffer *result_buffer) { - unsigned long ret; + long ret; /* * NOTE: Due to variable number of array elements in request and @@ -1085,9 +1083,10 @@ static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer, req->starting_ix, req->starting_lpar_ix, ret, ret, result_buffer->detailed_rc, result_buffer->failing_request_ix); + return -EIO; } - return ret; + return 0; } /* @@ -1135,10 +1134,10 @@ static int add_event_to_24x7_request(struct perf_event *event, return 0; } -static unsigned long single_24x7_request(struct perf_event *event, u64 *count) +static int single_24x7_request(struct perf_event *event, u64 *count) { + int ret; u16 num_elements; - unsigned long ret; struct hv_24x7_result *result; struct hv_24x7_request_buffer *request_buffer; struct hv_24x7_data_result_buffer *result_buffer; @@ -1253,10 +1252,9 @@ static int h_24x7_event_init(struct perf_event *event) static u64 h_24x7_get_value(struct perf_event *event) { - unsigned long ret; u64 ct; - ret = single_24x7_request(event, &ct); - if (ret) + + if (single_24x7_request(event, &ct)) /* We checked this in event init, shouldn't fail here... 
*/ return 0; From ebd4a5a3ebd9c7bef105e8581f80ae4592a21e6d Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 29 Jun 2017 18:55:36 -0300 Subject: [PATCH 137/158] powerpc/perf/hv-24x7: Minor improvements There's an H24x7_DATA_BUFFER_SIZE constant, so use it in init_24x7_request. There's also an HV_PERF_DOMAIN_MAX constant, so use it in h_24x7_event_init. This makes the comment above the check redundant, so remove it. In add_event_to_24x7_request, a statement is terminated with a comma instead of a semicolon. Fix it. In hv-24x7.h, improve comments in struct hv_24x7_result. Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman --- arch/powerpc/perf/hv-24x7.c | 9 ++++----- arch/powerpc/perf/hv-24x7.h | 11 ++++++++++- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 7e47dcdea62a..7771c70afa48 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -1049,8 +1049,8 @@ static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer, struct hv_24x7_data_result_buffer *result_buffer) { - memset(request_buffer, 0, 4096); - memset(result_buffer, 0, 4096); + memset(request_buffer, 0, H24x7_DATA_BUFFER_SIZE); + memset(result_buffer, 0, H24x7_DATA_BUFFER_SIZE); request_buffer->interface_version = HV_24X7_IF_VERSION_CURRENT; /* memset above set request_buffer->num_requests to 0 */ @@ -1126,7 +1126,7 @@ static int add_event_to_24x7_request(struct perf_event *event, req->performance_domain = event_get_domain(event); req->data_size = cpu_to_be16(8); req->data_offset = cpu_to_be32(event_get_offset(event)); - req->starting_lpar_ix = cpu_to_be16(event_get_lpar(event)), + req->starting_lpar_ix = cpu_to_be16(event_get_lpar(event)); req->max_num_lpars = cpu_to_be16(1); req->starting_ix = cpu_to_be16(idx); req->max_ix = cpu_to_be16(1); @@ -1218,9 +1218,8 @@ static int h_24x7_event_init(struct perf_event *event) return -EINVAL; } - /* Domains above 6 are invalid */ domain = event_get_domain(event); - if (domain > 6) { + if (domain >= HV_PERF_DOMAIN_MAX) { pr_devel("invalid domain %d\n", domain); return -EINVAL; } diff --git a/arch/powerpc/perf/hv-24x7.h b/arch/powerpc/perf/hv-24x7.h index 634ef4082cdc..b95909400b2a 100644 --- a/arch/powerpc/perf/hv-24x7.h +++ b/arch/powerpc/perf/hv-24x7.h @@ -71,6 +71,10 @@ struct hv_24x7_result_element { } __packed; struct hv_24x7_result { + /* + * The index of the 24x7 Request Structure in the 24x7 Request Buffer + * used to request this result. + */ __u8 result_ix; /* @@ -81,7 +85,12 @@ struct hv_24x7_result { __u8 results_complete; __be16 num_elements_returned; - /* This is a copy of @data_size from the corresponding hv_24x7_request */ + /* + * This is a copy of @data_size from the corresponding hv_24x7_request + * + * Warning: to obtain the size of each element in @elements you have + * to add the size of the other members of the result_element struct. + */ __be16 result_element_data_size; __u8 reserved[0x2]; From 2e6553aae3e6bd13cf176855d67233dce8817381 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 29 Jun 2017 18:55:37 -0300 Subject: [PATCH 138/158] powerpc/perf/hv-24x7: Support v2 of the hypervisor API POWER9 introduces a new version of the hypervisor API to access the 24x7 perf counters. The new version changed some of the structures used for requests and results. 
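As an illustration of the main structural consequence (a sketch
distilled from the hunks below): the request size now depends on the
interface version negotiated at runtime, so the request array can no
longer be indexed as requests[i], and byte offsets must be computed by
hand:

	size_t req_size = H24x7_REQUEST_SIZE(request_buffer->interface_version);
	struct hv_24x7_request *req;

	/*
	 * requests[i] would step by the compile-time
	 * sizeof(struct hv_24x7_request), which no longer matches the
	 * 16-byte v1 on-the-wire layout, so walk the flexible array
	 * manually instead:
	 */
	req = (void *)request_buffer->requests + i * req_size;

where H24x7_REQUEST_SIZE() evaluates to 16 for v1 and 32 for v2.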
Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman --- arch/powerpc/perf/hv-24x7.c | 135 ++++++++++++++++++++----- arch/powerpc/perf/hv-24x7.h | 58 +++++++++-- arch/powerpc/platforms/pseries/Kconfig | 2 +- 3 files changed, 160 insertions(+), 35 deletions(-) diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 7771c70afa48..a9a4df6e6e22 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -27,6 +28,9 @@ #include "hv-24x7-catalog.h" #include "hv-common.h" +/* Version of the 24x7 hypervisor API that we should use in this machine. */ +static int interface_version; + static bool domain_is_valid(unsigned domain) { switch (domain) { @@ -74,7 +78,11 @@ static const char *domain_name(unsigned domain) static bool catalog_entry_domain_is_valid(unsigned domain) { - return is_physical_domain(domain); + /* POWER8 doesn't support virtual domains. */ + if (interface_version == 1) + return is_physical_domain(domain); + else + return domain_is_valid(domain); } /* @@ -166,9 +174,11 @@ DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw); DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); -#define MAX_NUM_REQUESTS ((H24x7_DATA_BUFFER_SIZE - \ - sizeof(struct hv_24x7_request_buffer)) \ - / sizeof(struct hv_24x7_request)) +static unsigned int max_num_requests(int interface_version) +{ + return (H24x7_DATA_BUFFER_SIZE - sizeof(struct hv_24x7_request_buffer)) + / H24x7_REQUEST_SIZE(interface_version); +} static char *event_name(struct hv_24x7_event_data *ev, int *len) { @@ -1052,7 +1062,7 @@ static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer, memset(request_buffer, 0, H24x7_DATA_BUFFER_SIZE); memset(result_buffer, 0, H24x7_DATA_BUFFER_SIZE); - request_buffer->interface_version = HV_24X7_IF_VERSION_CURRENT; + request_buffer->interface_version = interface_version; /* memset above set request_buffer->num_requests to 0 */ } @@ -1077,7 +1087,7 @@ static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer, if (ret) { struct hv_24x7_request *req; - req = &request_buffer->requests[0]; + req = request_buffer->requests; pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => ret 0x%lx (%ld) detail=0x%x failing ix=%x\n", req->performance_domain, req->data_offset, req->starting_ix, req->starting_lpar_ix, @@ -1101,9 +1111,11 @@ static int add_event_to_24x7_request(struct perf_event *event, { u16 idx; int i; + size_t req_size; struct hv_24x7_request *req; - if (request_buffer->num_requests >= MAX_NUM_REQUESTS) { + if (request_buffer->num_requests >= + max_num_requests(request_buffer->interface_version)) { pr_devel("Too many requests for 24x7 HCALL %d\n", request_buffer->num_requests); return -EINVAL; @@ -1120,8 +1132,10 @@ static int add_event_to_24x7_request(struct perf_event *event, idx = event_get_vcpu(event); } + req_size = H24x7_REQUEST_SIZE(request_buffer->interface_version); + i = request_buffer->num_requests++; - req = &request_buffer->requests[i]; + req = (void *) request_buffer->requests + i * req_size; req->performance_domain = event_get_domain(event); req->data_size = cpu_to_be16(8); @@ -1131,14 +1145,86 @@ static int add_event_to_24x7_request(struct perf_event *event, req->starting_ix = cpu_to_be16(idx); req->max_ix = cpu_to_be16(1); + if (request_buffer->interface_version > 1 && + req->performance_domain != HV_PERF_DOMAIN_PHYS_CHIP) { + 
req->starting_thread_group_ix = idx % 2;
+		req->max_num_thread_groups = 1;
+	}
+
+	return 0;
+}
+
+/**
+ * get_count_from_result - get event count from the given result
+ *
+ * @event:	Event associated with @res.
+ * @resb:	Result buffer containing @res.
+ * @res:	Result to work on.
+ * @countp:	Output variable containing the event count.
+ * @next:	Optional output variable pointing to the next result in @resb.
+ */
+static int get_count_from_result(struct perf_event *event,
+				 struct hv_24x7_data_result_buffer *resb,
+				 struct hv_24x7_result *res, u64 *countp,
+				 struct hv_24x7_result **next)
+{
+	u16 num_elements = be16_to_cpu(res->num_elements_returned);
+	u16 data_size = be16_to_cpu(res->result_element_data_size);
+	unsigned int data_offset;
+	void *element_data;
+
+	/*
+	 * We can bail out early if the result is empty.
+	 */
+	if (!num_elements) {
+		pr_debug("Result of request %hhu is empty, nothing to do\n",
+			 res->result_ix);
+
+		if (next)
+			*next = (struct hv_24x7_result *) res->elements;
+
+		return -ENODATA;
+	}
+
+	/*
+	 * Since we always specify 1 as the maximum for the smallest resource
+	 * we're requesting, there should be only one element per result.
+	 */
+	if (num_elements != 1) {
+		pr_err("Error: result of request %hhu has %hu elements\n",
+		       res->result_ix, num_elements);
+
+		return -EIO;
+	}
+
+	if (data_size != sizeof(u64)) {
+		pr_debug("Error: result of request %hhu has data of %hu bytes\n",
+			 res->result_ix, data_size);
+
+		return -ENOTSUPP;
+	}
+
+	if (resb->interface_version == 1)
+		data_offset = offsetof(struct hv_24x7_result_element_v1,
+				       element_data);
+	else
+		data_offset = offsetof(struct hv_24x7_result_element_v2,
+				       element_data);
+
+	element_data = res->elements + data_offset;
+
+	*countp = be64_to_cpu(*((u64 *) element_data));
+
+	/* The next result is after the result element. */
+	if (next)
+		*next = element_data + data_size;
+
+	return 0;
+}
+
 static int single_24x7_request(struct perf_event *event, u64 *count)
 {
 	int ret;
-	u16 num_elements;
-	struct hv_24x7_result *result;
 	struct hv_24x7_request_buffer *request_buffer;
 	struct hv_24x7_data_result_buffer *result_buffer;
@@ -1158,14 +1244,9 @@ static int single_24x7_request(struct perf_event *event, u64 *count)
 	if (ret)
 		goto out;
 
-	result = result_buffer->results;
-
-	/* This code assumes that a result has only one element. */
-	num_elements = be16_to_cpu(result->num_elements_returned);
-	WARN_ON_ONCE(num_elements != 1);
-
 	/* process result from hcall */
-	*count = be64_to_cpu(result->elements[0].element_data[0]);
+	ret = get_count_from_result(event, result_buffer,
+				    result_buffer->results, count, NULL);
 
 out:
 	put_cpu_var(hv_24x7_reqb);
@@ -1425,16 +1506,13 @@ static int h_24x7_event_commit_txn(struct pmu *pmu)
 	for (i = 0, res = result_buffer->results;
 	     i < result_buffer->num_results; i++, res = next_res) {
 		struct perf_event *event = h24x7hw->events[res->result_ix];
-		u16 num_elements = be16_to_cpu(res->num_elements_returned);
-		u16 data_size = be16_to_cpu(res->result_element_data_size);
 
-		/* This code assumes that a result has only one element.
*/ - WARN_ON_ONCE(num_elements != 1); + ret = get_count_from_result(event, result_buffer, res, &count, + &next_res); + if (ret) + break; - count = be64_to_cpu(res->elements[0].element_data[0]); update_event_count(event, count); - - next_res = (void *) res->elements[0].element_data + data_size; } put_cpu_var(hv_24x7_hw); @@ -1484,7 +1562,14 @@ static int hv_24x7_init(void) if (!firmware_has_feature(FW_FEATURE_LPAR)) { pr_debug("not a virtualized system, not enabling\n"); return -ENODEV; - } + } else if (!cur_cpu_spec->oprofile_cpu_type) + return -ENODEV; + + /* POWER8 only supports v1, while POWER9 only supports v2. */ + if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8")) + interface_version = 1; + else + interface_version = 2; hret = hv_perf_caps_get(&caps); if (hret) { diff --git a/arch/powerpc/perf/hv-24x7.h b/arch/powerpc/perf/hv-24x7.h index b95909400b2a..5092c4a222a6 100644 --- a/arch/powerpc/perf/hv-24x7.h +++ b/arch/powerpc/perf/hv-24x7.h @@ -10,6 +10,8 @@ enum hv_perf_domains { HV_PERF_DOMAIN_MAX, }; +#define H24x7_REQUEST_SIZE(iface_version) (iface_version == 1 ? 16 : 32) + struct hv_24x7_request { /* PHYSICAL domains require enabling via phyp/hmc. */ __u8 performance_domain; @@ -42,19 +44,27 @@ struct hv_24x7_request { /* chip, core, or virtual processor based on @performance_domain */ __be16 starting_ix; __be16 max_ix; + + /* The following fields were added in v2 of the 24x7 interface. */ + + __u8 starting_thread_group_ix; + + /* -1 means all thread groups starting at @starting_thread_group_ix */ + __u8 max_num_thread_groups; + + __u8 reserved2[0xE]; } __packed; struct hv_24x7_request_buffer { /* 0 - ? */ /* 1 - ? */ -#define HV_24X7_IF_VERSION_CURRENT 0x01 __u8 interface_version; __u8 num_requests; __u8 reserved[0xE]; - struct hv_24x7_request requests[1]; + struct hv_24x7_request requests[]; } __packed; -struct hv_24x7_result_element { +struct hv_24x7_result_element_v1 { __be16 lpar_ix; /* @@ -67,7 +77,31 @@ struct hv_24x7_result_element { __be32 lpar_cfg_instance_id; /* size = @result_element_data_size of containing result. */ - __u64 element_data[1]; + __u64 element_data[]; +} __packed; + +/* + * We need a separate struct for v2 because the offset of @element_data changed + * between versions. + */ +struct hv_24x7_result_element_v2 { + __be16 lpar_ix; + + /* + * represents the core, chip, or virtual processor based on the + * request's @performance_domain + */ + __be16 domain_ix; + + /* -1 if @performance_domain does not refer to a virtual processor */ + __be32 lpar_cfg_instance_id; + + __u8 thread_group_ix; + + __u8 reserved[7]; + + /* size = @result_element_data_size of containing result. */ + __u64 element_data[]; } __packed; struct hv_24x7_result { @@ -94,10 +128,16 @@ struct hv_24x7_result { __be16 result_element_data_size; __u8 reserved[0x2]; - /* WARNING: only valid for first result element due to variable sizes - * of result elements */ - /* struct hv_24x7_result_element[@num_elements_returned] */ - struct hv_24x7_result_element elements[1]; + /* + * Either + * struct hv_24x7_result_element_v1[@num_elements_returned] + * or + * struct hv_24x7_result_element_v2[@num_elements_returned] + * + * depending on the interface_version field of the + * struct hv_24x7_data_result_buffer containing this result. 
+ */ + char elements[]; } __packed; struct hv_24x7_data_result_buffer { @@ -113,7 +153,7 @@ struct hv_24x7_data_result_buffer { __u8 reserved2[0x8]; /* WARNING: only valid for the first result due to variable sizes of * results */ - struct hv_24x7_result results[1]; /* [@num_results] */ + struct hv_24x7_result results[]; /* [@num_results] */ } __packed; #endif diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 913c54e23eea..3a6dfd14f64b 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -124,7 +124,7 @@ config HV_PERF_CTRS Enable access to hypervisor supplied counters in perf. Currently, this enables code that uses the hcall GetPerfCounterInfo and 24x7 interfaces to retrieve counters. GPCI exists on Power 6 and later - systems. 24x7 is available on Power 8 systems. + systems. 24x7 is available on Power 8 and later systems. If unsure, select Y. From bfaa7834b60e01135af4e8e06a9477bef2368f44 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 29 Jun 2017 18:55:38 -0300 Subject: [PATCH 139/158] powerpc/perf/hv-24x7: Aggregate result elements on POWER9 SMT8 On POWER9 SMT8 the 24x7 API returns two result elements for physical core and virtual CPU events and we need to add their counts to get the final result. Reviewed-by: Sukadev Bhattiprolu Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman --- arch/powerpc/perf/hv-24x7.c | 53 +++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index a9a4df6e6e22..9c88b82f6229 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -31,6 +31,9 @@ /* Version of the 24x7 hypervisor API that we should use in this machine. */ static int interface_version; +/* Whether we have to aggregate result data for some domains. */ +static bool aggregate_result_elements; + static bool domain_is_valid(unsigned domain) { switch (domain) { @@ -58,6 +61,15 @@ static bool is_physical_domain(unsigned domain) } } +/* Domains for which more than one result element are returned for each event. */ +static bool domain_needs_aggregation(unsigned int domain) +{ + return aggregate_result_elements && + (domain == HV_PERF_DOMAIN_PHYS_CORE || + (domain >= HV_PERF_DOMAIN_VCPU_HOME_CORE && + domain <= HV_PERF_DOMAIN_VCPU_REMOTE_NODE)); +} + static const char *domain_name(unsigned domain) { if (!domain_is_valid(domain)) @@ -1145,17 +1157,23 @@ static int add_event_to_24x7_request(struct perf_event *event, req->starting_ix = cpu_to_be16(idx); req->max_ix = cpu_to_be16(1); - if (request_buffer->interface_version > 1 && - req->performance_domain != HV_PERF_DOMAIN_PHYS_CHIP) { - req->starting_thread_group_ix = idx % 2; - req->max_num_thread_groups = 1; + if (request_buffer->interface_version > 1) { + if (domain_needs_aggregation(req->performance_domain)) + req->max_num_thread_groups = -1; + else if (req->performance_domain != HV_PERF_DOMAIN_PHYS_CHIP) { + req->starting_thread_group_ix = idx % 2; + req->max_num_thread_groups = 1; + } } return 0; } /** - * get_count_from_result - get event count from the given result + * get_count_from_result - get event count from all result elements in result + * + * If the event corresponding to this result needs aggregation of the result + * element values, then this function does that. * * @event: Event associated with @res. * @resb: Result buffer containing @res. 
@@ -1172,6 +1190,8 @@ static int get_count_from_result(struct perf_event *event, u16 data_size = be16_to_cpu(res->result_element_data_size); unsigned int data_offset; void *element_data; + int i; + u64 count; /* * We can bail out early if the result is empty. @@ -1189,8 +1209,10 @@ static int get_count_from_result(struct perf_event *event, /* * Since we always specify 1 as the maximum for the smallest resource * we're requesting, there should to be only one element per result. + * Except when an event needs aggregation, in which case there are more. */ - if (num_elements != 1) { + if (num_elements != 1 && + !domain_needs_aggregation(event_get_domain(event))) { pr_err("Error: result of request %hhu has %hu elements\n", res->result_ix, num_elements); @@ -1211,13 +1233,17 @@ static int get_count_from_result(struct perf_event *event, data_offset = offsetof(struct hv_24x7_result_element_v2, element_data); - element_data = res->elements + data_offset; + /* Go through the result elements in the result. */ + for (i = count = 0, element_data = res->elements + data_offset; + i < num_elements; + i++, element_data += data_size + data_offset) + count += be64_to_cpu(*((u64 *) element_data)); - *countp = be64_to_cpu(*((u64 *) element_data)); + *countp = count; - /* The next result is after the result element. */ + /* The next result is after the last result element. */ if (next) - *next = element_data + data_size; + *next = element_data - data_offset; return 0; } @@ -1568,9 +1594,14 @@ static int hv_24x7_init(void) /* POWER8 only supports v1, while POWER9 only supports v2. */ if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8")) interface_version = 1; - else + else { interface_version = 2; + /* SMT8 in POWER9 needs to aggregate result elements. */ + if (threads_per_core == 8) + aggregate_result_elements = true; + } + hret = hv_perf_caps_get(&caps); if (hret) { pr_debug("could not obtain capabilities, not enabling, rc=%ld\n", From 5405c92bc2cd0c09c7f9716af234b45ef66faa94 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 24 May 2017 14:12:24 +0900 Subject: [PATCH 140/158] powerpc/dts: Use #include "..." to include local DT Most DT files in PowerPC use #include "..." to make the pre-processor include DT files from the same directory, but we have 3 exceptional files that use #include <...> for that. Fix them so we can remove the -I$(srctree)/arch/$(SRCARCH)/boot/dts path from dtc_cpp_flags. Signed-off-by: Masahiro Yamada Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/ac14xx.dts | 2 +- arch/powerpc/boot/dts/mpc5121ads.dts | 2 +- arch/powerpc/boot/dts/pdm360ng.dts | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/boot/dts/ac14xx.dts b/arch/powerpc/boot/dts/ac14xx.dts index 27fcabc2f857..83bcfd865167 100644 --- a/arch/powerpc/boot/dts/ac14xx.dts +++ b/arch/powerpc/boot/dts/ac14xx.dts @@ -10,7 +10,7 @@ */ -#include <mpc5121.dtsi> +#include "mpc5121.dtsi" / { model = "ac14xx"; diff --git a/arch/powerpc/boot/dts/mpc5121ads.dts b/arch/powerpc/boot/dts/mpc5121ads.dts index fcaa9bad4bda..1e81a7e32d18 100644 --- a/arch/powerpc/boot/dts/mpc5121ads.dts +++ b/arch/powerpc/boot/dts/mpc5121ads.dts @@ -9,7 +9,7 @@ * option) any later version. */ -#include <mpc5121.dtsi> +#include "mpc5121.dtsi" / { model = "mpc5121ads"; diff --git a/arch/powerpc/boot/dts/pdm360ng.dts b/arch/powerpc/boot/dts/pdm360ng.dts index 0cec7244abe7..445b88114009 100644 --- a/arch/powerpc/boot/dts/pdm360ng.dts +++ b/arch/powerpc/boot/dts/pdm360ng.dts @@ -13,7 +13,7 @@ * option) any later version.
*/ -#include <mpc5121.dtsi> +#include "mpc5121.dtsi" / { model = "pdm360ng"; From 3ced8d73006321bd2a0412fa0ff4b065a02e7514 Mon Sep 17 00:00:00 2001 From: Christophe Lombard Date: Thu, 22 Jun 2017 15:07:27 +0200 Subject: [PATCH 141/158] cxl: Export library to support IBM XSL This patch exports an in-kernel 'library' API which can be called by other drivers to help interact with an IBM XSL on a POWER9 system. The XSL (Translation Service Layer) is a stripped down version of the PSL (Power Service Layer) used in some cards such as the Mellanox CX5. Like the PSL, it implements the CAIA architecture, but has a number of differences, mostly in its implementation-dependent registers. The XSL also uses a special DMA cxl mode, which uses a slightly different init sequence for the CAPP and PHB. Signed-off-by: Andrew Donnellan Signed-off-by: Christophe Lombard Acked-by: Frederic Barrat Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal-api.h | 1 + drivers/misc/cxl/Kconfig | 5 + drivers/misc/cxl/Makefile | 2 +- drivers/misc/cxl/cxl.h | 6 + drivers/misc/cxl/cxllib.c | 246 ++++++++++++++++++++++++++++ drivers/misc/cxl/fault.c | 29 ++-- drivers/misc/cxl/native.c | 16 +- drivers/misc/cxl/pci.c | 41 +++-- include/misc/cxllib.h | 133 +++++++++++++++ 9 files changed, 449 insertions(+), 30 deletions(-) create mode 100644 drivers/misc/cxl/cxllib.c create mode 100644 include/misc/cxllib.h diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 0b543f0f54f5..ef930ba500f9 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -948,6 +948,7 @@ enum { OPAL_PHB_CAPI_MODE_SNOOP_OFF = 2, OPAL_PHB_CAPI_MODE_SNOOP_ON = 3, OPAL_PHB_CAPI_MODE_DMA = 4, + OPAL_PHB_CAPI_MODE_DMA_TVT1 = 5, }; /* OPAL I2C request */ diff --git a/drivers/misc/cxl/Kconfig b/drivers/misc/cxl/Kconfig index b75cf830d08a..93397cb05b15 100644 --- a/drivers/misc/cxl/Kconfig +++ b/drivers/misc/cxl/Kconfig @@ -11,11 +11,16 @@ config CXL_AFU_DRIVER_OPS bool default n +config CXL_LIB + bool + default n + config CXL tristate "Support for IBM Coherent Accelerators (CXL)" depends on PPC_POWERNV && PCI_MSI && EEH select CXL_BASE select CXL_AFU_DRIVER_OPS + select CXL_LIB default m help Select this option to enable driver support for IBM Coherent diff --git a/drivers/misc/cxl/Makefile b/drivers/misc/cxl/Makefile index c14fd6b65b5a..0b5fd749d96d 100644 --- a/drivers/misc/cxl/Makefile +++ b/drivers/misc/cxl/Makefile @@ -3,7 +3,7 @@ ccflags-$(CONFIG_PPC_WERROR) += -Werror cxl-y += main.o file.o irq.o fault.o native.o cxl-y += context.o sysfs.o pci.o trace.o -cxl-y += vphb.o phb.o api.o +cxl-y += vphb.o phb.o api.o cxllib.o cxl-$(CONFIG_PPC_PSERIES) += flash.o guest.o of.o hcalls.o cxl-$(CONFIG_DEBUG_FS) += debugfs.o obj-$(CONFIG_CXL) += cxl.o diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index a03f8e7535e5..b1afeccbb97f 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -1010,6 +1010,7 @@ static inline void cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct den void cxl_handle_fault(struct work_struct *work); void cxl_prefault(struct cxl_context *ctx, u64 wed); +int cxl_handle_mm_fault(struct mm_struct *mm, u64 dsisr, u64 dar); struct cxl *get_cxl_adapter(int num); int cxl_alloc_sst(struct cxl_context *ctx); @@ -1061,6 +1062,11 @@ int cxl_afu_slbia(struct cxl_afu *afu); int cxl_data_cache_flush(struct cxl *adapter); int cxl_afu_disable(struct cxl_afu *afu); int cxl_psl_purge(struct cxl_afu *afu); +int cxl_calc_capp_routing(struct pci_dev *dev, u64
*chipid, + u32 *phb_index, u64 *capp_unit_id); +int cxl_slot_is_switched(struct pci_dev *dev); +int cxl_get_xsl9_dsnctl(u64 capp_unit_id, u64 *reg); +u64 cxl_calculate_sr(bool master, bool kernel, bool real_mode, bool p9); void cxl_native_irq_dump_regs_psl9(struct cxl_context *ctx); void cxl_native_irq_dump_regs_psl8(struct cxl_context *ctx); diff --git a/drivers/misc/cxl/cxllib.c b/drivers/misc/cxl/cxllib.c new file mode 100644 index 000000000000..5dba23ca2e5f --- /dev/null +++ b/drivers/misc/cxl/cxllib.c @@ -0,0 +1,246 @@ +/* + * Copyright 2017 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +#include "cxl.h" + +#define CXL_INVALID_DRA ~0ull +#define CXL_DUMMY_READ_SIZE 128 +#define CXL_DUMMY_READ_ALIGN 8 +#define CXL_CAPI_WINDOW_START 0x2000000000000ull +#define CXL_CAPI_WINDOW_LOG_SIZE 48 +#define CXL_XSL_CONFIG_CURRENT_VERSION CXL_XSL_CONFIG_VERSION1 + + +bool cxllib_slot_is_supported(struct pci_dev *dev, unsigned long flags) +{ + int rc; + u32 phb_index; + u64 chip_id, capp_unit_id; + + /* No flags currently supported */ + if (flags) + return false; + + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return false; + + if (!cxl_is_power9()) + return false; + + if (cxl_slot_is_switched(dev)) + return false; + + /* on p9, some pci slots are not connected to a CAPP unit */ + rc = cxl_calc_capp_routing(dev, &chip_id, &phb_index, &capp_unit_id); + if (rc) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(cxllib_slot_is_supported); + +static DEFINE_MUTEX(dra_mutex); +static u64 dummy_read_addr = CXL_INVALID_DRA; + +static int allocate_dummy_read_buf(void) +{ + u64 buf, vaddr; + size_t buf_size; + + /* + * Dummy read buffer is 128-byte long, aligned on a + * 256-byte boundary and we need the physical address. 
+ */ + buf_size = CXL_DUMMY_READ_SIZE + (1ull << CXL_DUMMY_READ_ALIGN); + buf = (u64) kzalloc(buf_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + vaddr = (buf + (1ull << CXL_DUMMY_READ_ALIGN) - 1) & + (~0ull << CXL_DUMMY_READ_ALIGN); + + WARN((vaddr + CXL_DUMMY_READ_SIZE) > (buf + buf_size), + "Dummy read buffer alignment issue"); + dummy_read_addr = virt_to_phys((void *) vaddr); + return 0; +} + +int cxllib_get_xsl_config(struct pci_dev *dev, struct cxllib_xsl_config *cfg) +{ + int rc; + u32 phb_index; + u64 chip_id, capp_unit_id; + + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return -EINVAL; + + mutex_lock(&dra_mutex); + if (dummy_read_addr == CXL_INVALID_DRA) { + rc = allocate_dummy_read_buf(); + if (rc) { + mutex_unlock(&dra_mutex); + return rc; + } + } + mutex_unlock(&dra_mutex); + + rc = cxl_calc_capp_routing(dev, &chip_id, &phb_index, &capp_unit_id); + if (rc) + return rc; + + rc = cxl_get_xsl9_dsnctl(capp_unit_id, &cfg->dsnctl); + if (rc) + return rc; + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + /* workaround for DD1 - nbwind = capiind */ + cfg->dsnctl |= ((u64)0x02 << (63-47)); + } + + cfg->version = CXL_XSL_CONFIG_CURRENT_VERSION; + cfg->log_bar_size = CXL_CAPI_WINDOW_LOG_SIZE; + cfg->bar_addr = CXL_CAPI_WINDOW_START; + cfg->dra = dummy_read_addr; + return 0; +} +EXPORT_SYMBOL_GPL(cxllib_get_xsl_config); + +int cxllib_switch_phb_mode(struct pci_dev *dev, enum cxllib_mode mode, + unsigned long flags) +{ + int rc = 0; + + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return -EINVAL; + + switch (mode) { + case CXL_MODE_PCI: + /* + * We currently don't support going back to PCI mode + * However, we'll turn the invalidations off, so that + * the firmware doesn't have to ack them and can do + * things like reset, etc.. with no worries. + * So always return EPERM (can't go back to PCI) or + * EBUSY if we couldn't even turn off snooping + */ + rc = pnv_phb_to_cxl_mode(dev, OPAL_PHB_CAPI_MODE_SNOOP_OFF); + if (rc) + rc = -EBUSY; + else + rc = -EPERM; + break; + case CXL_MODE_CXL: + /* DMA only supported on TVT1 for the time being */ + if (flags != CXL_MODE_DMA_TVT1) + return -EINVAL; + rc = pnv_phb_to_cxl_mode(dev, OPAL_PHB_CAPI_MODE_DMA_TVT1); + if (rc) + return rc; + rc = pnv_phb_to_cxl_mode(dev, OPAL_PHB_CAPI_MODE_SNOOP_ON); + break; + default: + rc = -EINVAL; + } + return rc; +} +EXPORT_SYMBOL_GPL(cxllib_switch_phb_mode); + +/* + * When switching the PHB to capi mode, the TVT#1 entry for + * the Partitionable Endpoint is set in bypass mode, like + * in PCI mode. + * Configure the device dma to use TVT#1, which is done + * by calling dma_set_mask() with a mask large enough. 
+ */ +int cxllib_set_device_dma(struct pci_dev *dev, unsigned long flags) +{ + int rc; + + if (flags) + return -EINVAL; + + rc = dma_set_mask(&dev->dev, DMA_BIT_MASK(64)); + return rc; +} +EXPORT_SYMBOL_GPL(cxllib_set_device_dma); + +int cxllib_get_PE_attributes(struct task_struct *task, + unsigned long translation_mode, + struct cxllib_pe_attributes *attr) +{ + struct mm_struct *mm = NULL; + + if (translation_mode != CXL_TRANSLATED_MODE && + translation_mode != CXL_REAL_MODE) + return -EINVAL; + + attr->sr = cxl_calculate_sr(false, + task == NULL, + translation_mode == CXL_REAL_MODE, + true); + attr->lpid = mfspr(SPRN_LPID); + if (task) { + mm = get_task_mm(task); + if (mm == NULL) + return -EINVAL; + /* + * Caller is keeping a reference on mm_users for as long + * as XSL uses the memory context + */ + attr->pid = mm->context.id; + mmput(mm); + } else { + attr->pid = 0; + } + attr->tid = 0; + return 0; +} +EXPORT_SYMBOL_GPL(cxllib_get_PE_attributes); + +int cxllib_handle_fault(struct mm_struct *mm, u64 addr, u64 size, u64 flags) +{ + int rc; + u64 dar; + struct vm_area_struct *vma = NULL; + unsigned long page_size; + + if (mm == NULL) + return -EFAULT; + + down_read(&mm->mmap_sem); + + for (dar = addr; dar < addr + size; dar += page_size) { + if (!vma || dar < vma->vm_start || dar > vma->vm_end) { + vma = find_vma(mm, addr); + if (!vma) { + pr_err("Can't find vma for addr %016llx\n", addr); + rc = -EFAULT; + goto out; + } + /* get the size of the pages allocated */ + page_size = vma_kernel_pagesize(vma); + } + + rc = cxl_handle_mm_fault(mm, flags, dar); + if (rc) { + pr_err("cxl_handle_mm_fault failed %d", rc); + rc = -EFAULT; + goto out; + } + } + rc = 0; +out: + up_read(&mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(cxllib_handle_fault); diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index c79e39bad7a4..6eed7d03e2b5 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -132,18 +132,15 @@ static int cxl_handle_segment_miss(struct cxl_context *ctx, return IRQ_HANDLED; } -static void cxl_handle_page_fault(struct cxl_context *ctx, - struct mm_struct *mm, u64 dsisr, u64 dar) +int cxl_handle_mm_fault(struct mm_struct *mm, u64 dsisr, u64 dar) { unsigned flt = 0; int result; unsigned long access, flags, inv_flags = 0; - trace_cxl_pte_miss(ctx, dsisr, dar); - if ((result = copro_handle_mm_fault(mm, dar, dsisr, &flt))) { pr_devel("copro_handle_mm_fault failed: %#x\n", result); - return cxl_ack_ae(ctx); + return result; } if (!radix_enabled()) { @@ -155,9 +152,8 @@ static void cxl_handle_page_fault(struct cxl_context *ctx, if (dsisr & CXL_PSL_DSISR_An_S) access |= _PAGE_WRITE; - access |= _PAGE_PRIVILEGED; - if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID)) - access &= ~_PAGE_PRIVILEGED; + if (!mm && (REGION_ID(dar) != USER_REGION_ID)) + access |= _PAGE_PRIVILEGED; if (dsisr & DSISR_NOHPTE) inv_flags |= HPTE_NOHPTE_UPDATE; @@ -166,8 +162,21 @@ static void cxl_handle_page_fault(struct cxl_context *ctx, hash_page_mm(mm, dar, access, 0x300, inv_flags); local_irq_restore(flags); } - pr_devel("Page fault successfully handled for pe: %i!\n", ctx->pe); - cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_R, 0); + return 0; +} + +static void cxl_handle_page_fault(struct cxl_context *ctx, + struct mm_struct *mm, + u64 dsisr, u64 dar) +{ + trace_cxl_pte_miss(ctx, dsisr, dar); + + if (cxl_handle_mm_fault(mm, dsisr, dar)) { + cxl_ack_ae(ctx); + } else { + pr_devel("Page fault successfully handled for pe: %i!\n", ctx->pe); + cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_R, 0); + } } /* 
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index 2b2f8894149d..4a82c313cf71 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -586,17 +586,17 @@ err: #define set_endian(sr) ((sr) &= ~(CXL_PSL_SR_An_LE)) #endif -static u64 calculate_sr(struct cxl_context *ctx) +u64 cxl_calculate_sr(bool master, bool kernel, bool real_mode, bool p9) { u64 sr = 0; set_endian(sr); - if (ctx->master) + if (master) sr |= CXL_PSL_SR_An_MP; if (mfspr(SPRN_LPCR) & LPCR_TC) sr |= CXL_PSL_SR_An_TC; - if (ctx->kernel) { - if (!ctx->real_mode) + if (kernel) { + if (!real_mode) sr |= CXL_PSL_SR_An_R; sr |= (mfmsr() & MSR_SF) | CXL_PSL_SR_An_HV; } else { @@ -608,7 +608,7 @@ static u64 calculate_sr(struct cxl_context *ctx) if (!test_tsk_thread_flag(current, TIF_32BIT)) sr |= CXL_PSL_SR_An_SF; } - if (cxl_is_power9()) { + if (p9) { if (radix_enabled()) sr |= CXL_PSL_SR_An_XLAT_ror; else @@ -617,6 +617,12 @@ static u64 calculate_sr(struct cxl_context *ctx) return sr; } +static u64 calculate_sr(struct cxl_context *ctx) +{ + return cxl_calculate_sr(ctx->master, ctx->kernel, ctx->real_mode, + cxl_is_power9()); +} + static void update_ivtes_directed(struct cxl_context *ctx) { bool need_update = (ctx->status == STARTED); diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 1eb9859809bf..d18b3d9292fd 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -375,7 +375,7 @@ static u64 get_capp_unit_id(struct device_node *np, u32 phb_index) return 0; } -static int calc_capp_routing(struct pci_dev *dev, u64 *chipid, +int cxl_calc_capp_routing(struct pci_dev *dev, u64 *chipid, u32 *phb_index, u64 *capp_unit_id) { int rc; @@ -408,17 +408,9 @@ static int calc_capp_routing(struct pci_dev *dev, u64 *chipid, return 0; } -static int init_implementation_adapter_regs_psl9(struct cxl *adapter, struct pci_dev *dev) +int cxl_get_xsl9_dsnctl(u64 capp_unit_id, u64 *reg) { - u64 xsl_dsnctl, psl_fircntl; - u64 chipid; - u32 phb_index; - u64 capp_unit_id; - int rc; - - rc = calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id); - if (rc) - return rc; + u64 xsl_dsnctl; /* * CAPI Identifier bits [0:7] @@ -454,6 +446,27 @@ static int init_implementation_adapter_regs_psl9(struct cxl *adapter, struct pci xsl_dsnctl |= ((u64)0x04 << (63-55)); } + *reg = xsl_dsnctl; + return 0; +} + +static int init_implementation_adapter_regs_psl9(struct cxl *adapter, + struct pci_dev *dev) +{ + u64 xsl_dsnctl, psl_fircntl; + u64 chipid; + u32 phb_index; + u64 capp_unit_id; + int rc; + + rc = cxl_calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id); + if (rc) + return rc; + + rc = cxl_get_xsl9_dsnctl(capp_unit_id, &xsl_dsnctl); + if (rc) + return rc; + cxl_p1_write(adapter, CXL_XSL9_DSNCTL, xsl_dsnctl); /* Set fir_cntl to recommended value for production env */ @@ -505,7 +518,7 @@ static int init_implementation_adapter_regs_psl8(struct cxl *adapter, struct pci u64 capp_unit_id; int rc; - rc = calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id); + rc = cxl_calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id); if (rc) return rc; @@ -538,7 +551,7 @@ static int init_implementation_adapter_regs_xsl(struct cxl *adapter, struct pci_ u64 capp_unit_id; int rc; - rc = calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id); + rc = cxl_calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id); if (rc) return rc; @@ -1897,7 +1910,7 @@ static void cxl_pci_remove_adapter(struct cxl *adapter) #define CXL_MAX_PCIEX_PARENT 2 -static int cxl_slot_is_switched(struct pci_dev *dev) +int 
cxl_slot_is_switched(struct pci_dev *dev) { struct device_node *np; int depth = 0; diff --git a/include/misc/cxllib.h b/include/misc/cxllib.h new file mode 100644 index 000000000000..e5aa29f019a6 --- /dev/null +++ b/include/misc/cxllib.h @@ -0,0 +1,133 @@ +/* + * Copyright 2017 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _MISC_CXLLIB_H +#define _MISC_CXLLIB_H + +#include +#include + +/* + * cxl driver exports an in-kernel 'library' API which can be called by + * other drivers to help interact with an IBM XSL. + */ + +/* + * tells whether capi is supported on the PCIe slot where the + * device is seated + * + * Input: + * dev: device whose slot needs to be checked + * flags: 0 for the time being + */ +bool cxllib_slot_is_supported(struct pci_dev *dev, unsigned long flags); + + +/* + * Returns the configuration parameters to be used by the XSL or device + * + * Input: + * dev: device, used to find PHB + * Output: + * struct cxllib_xsl_config: + * version + * capi BAR address, i.e. 0x2000000000000-0x2FFFFFFFFFFFF + * capi BAR size + * data send control (XSL_DSNCTL) + * dummy read address (XSL_DRA) + */ +#define CXL_XSL_CONFIG_VERSION1 1 struct cxllib_xsl_config { + u32 version; /* format version for register encoding */ + u32 log_bar_size;/* log size of the capi_window */ + u64 bar_addr; /* address of the start of capi window */ + u64 dsnctl; /* matches definition of XSL_DSNCTL */ + u64 dra; /* real address that can be used for dummy read */ +}; + +int cxllib_get_xsl_config(struct pci_dev *dev, struct cxllib_xsl_config *cfg); + + +/* + * Activate capi for the pci host bridge associated with the device. + * Can be extended to deactivate once we know how to do it. + * Device must be ready to accept messages from the CAPP unit and + * respond accordingly (TLB invalidates, ...) + * + * PHB is switched to capi mode through calls to skiboot. + * CAPP snooping is activated + * + * Input: + * dev: device whose PHB should switch mode + * mode: mode to switch to i.e. CAPI or PCI + * flags: options related to the mode + */ +enum cxllib_mode { + CXL_MODE_CXL, + CXL_MODE_PCI, +}; + +#define CXL_MODE_NO_DMA 0 +#define CXL_MODE_DMA_TVT0 1 +#define CXL_MODE_DMA_TVT1 2 + +int cxllib_switch_phb_mode(struct pci_dev *dev, enum cxllib_mode mode, + unsigned long flags); + + +/* + * Set the device for capi DMA. + * Define its dma_ops and dma offset so that allocations will be using TVT#1 + * + * Input: + * dev: device to set + * flags: options. CXL_MODE_DMA_TVT1 should be used + */ +int cxllib_set_device_dma(struct pci_dev *dev, unsigned long flags); + +
/* + * Get the Process Element structure for the given thread + * + * Input: + * task: task_struct for the context of the translation + * translation_mode: whether addresses should be translated + * Output: + * attr: attributes to fill up the Process Element structure from CAIA + */ +struct cxllib_pe_attributes { + u64 sr; + u32 lpid; + u32 tid; + u32 pid; +}; +#define CXL_TRANSLATED_MODE 0 +#define CXL_REAL_MODE 1 + +int cxllib_get_PE_attributes(struct task_struct *task, + unsigned long translation_mode, struct cxllib_pe_attributes *attr); + + +/* + * Handle memory fault.
+ * Fault in all the pages of the specified buffer for the permissions + * provided in ‘flags’ + * + * Shouldn't be called from interrupt context + * + * Input: + * mm: struct mm for the thread faulting the pages + * addr: base address of the buffer to page in + * size: size of the buffer to page in + * flags: permission requested (DSISR_ISSTORE...) + */ +int cxllib_handle_fault(struct mm_struct *mm, u64 addr, u64 size, u64 flags); + + +#endif /* _MISC_CXLLIB_H */ From 83e840c770f2c578bbbff478d62a4403c073b438 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 29 Jun 2017 23:19:14 +0530 Subject: [PATCH 142/158] powerpc64/elfv1: Only dereference function descriptor for non-text symbols Currently, we assume that the function pointer we receive in ppc_function_entry() points to a function descriptor. However, this is not always the case. In particular, assembly symbols without the right annotation do not have an associated function descriptor. Some of these symbols are added to the kprobe blacklist using _ASM_NOKPROBE_SYMBOL(). When such addresses are subsequently processed through arch_deref_entry_point() in populate_kprobe_blacklist(), we see the below errors during bootup: [ 0.663963] Failed to find blacklist at 7d9b02a648029b6c [ 0.663970] Failed to find blacklist at a14d03d0394a0001 [ 0.663972] Failed to find blacklist at 7d5302a6f94d0388 [ 0.663973] Failed to find blacklist at 48027d11e8610178 [ 0.663974] Failed to find blacklist at f8010070f8410080 [ 0.663976] Failed to find blacklist at 386100704801f89d [ 0.663977] Failed to find blacklist at 7d5302a6f94d00b0 Fix this by checking if the function pointer we receive in ppc_function_entry() already points to kernel text. If so, we just return it as is. If not, we assume that this is a function descriptor and proceed to dereference it. Suggested-by: Nicholas Piggin Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/code-patching.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index abef812de7f8..5482928eea1b 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -83,8 +83,16 @@ static inline unsigned long ppc_function_entry(void *func) * On PPC64 ABIv1 the function pointer actually points to the * function's descriptor. The first entry in the descriptor is the * address of the function text. + * + * However, we may also receive pointer to an assembly symbol. To + * detect that, we first check if the function pointer we receive + * already points to kernel/module text and we only dereference it + * if it doesn't. */ - return ((func_descr_t *)func)->entry; + if (kernel_text_address((unsigned long)func)) + return (unsigned long)func; + else + return ((func_descr_t *)func)->entry; #else return (unsigned long)func; #endif From 9d6c452352d6535745e734f296335e6695b6df0b Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 29 Jun 2017 23:19:15 +0530 Subject: [PATCH 143/158] powerpc/64s: Convert .L__replay_interrupt_return to a local label Commit b48bbb82e2b835 ("powerpc/64s: Don't unbalance the return branch predictor in __replay_interrupt()") introduced __replay_interrupt_return symbol with '.L' prefix in hopes of keeping it private. However, due to the use of LOAD_REG_ADDR(), the assembler kept this symbol visible. Fix the same by instead using the local label '1'. 
Fixes: Commit b48bbb82e2b835 ("powerpc/64s: Don't unbalance the return branch predictor in __replay_interrupt()") Suggested-by: Nicholas Piggin Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index fb4ffc773082..3a7a456ab95c 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1634,7 +1634,7 @@ _GLOBAL(__replay_interrupt) * we don't give a damn about, so we don't bother storing them. */ mfmsr r12 - LOAD_REG_ADDR(r11, .L__replay_interrupt_return) + LOAD_REG_ADDR(r11, 1f) mfcr r9 ori r12,r12,MSR_EE cmpwi r3,0x900 @@ -1652,6 +1652,6 @@ FTR_SECTION_ELSE cmpwi r3,0xa00 beq doorbell_super_common_msgclr ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) -.L__replay_interrupt_return: +1: blr From cf7d6fb0672668690ba235280698c9aa6a81010a Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 29 Jun 2017 23:19:16 +0530 Subject: [PATCH 144/158] powerpc/64s: Blacklist system_call() and system_call_common() from kprobes Convert some of the symbols into private symbols and blacklist system_call_common() and system_call() from kprobes. We can't take a trap at parts of these functions as either MSR_RI is unset or the kernel stack pointer is not yet setup. Reviewed-by: Masami Hiramatsu Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N. Rao [mpe: Don't convert system_call_common to _GLOBAL()] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index da9486e2fd89..410e19295259 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -57,7 +57,7 @@ system_call_common: #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION extrdi. r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */ - bne tabort_syscall + bne .Ltabort_syscall END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif andi. r10,r12,MSR_PR @@ -152,9 +152,9 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) CURRENT_THREAD_INFO(r11, r1) ld r10,TI_FLAGS(r11) andi. r11,r10,_TIF_SYSCALL_DOTRACE - bne syscall_dotrace /* does not return */ + bne .Lsyscall_dotrace /* does not return */ cmpldi 0,r0,NR_syscalls - bge- syscall_enosys + bge- .Lsyscall_enosys system_call: /* label this so stack traces look sane */ /* @@ -208,7 +208,7 @@ system_call: /* label this so stack traces look sane */ ld r9,TI_FLAGS(r12) li r11,-MAX_ERRNO andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) - bne- syscall_exit_work + bne- .Lsyscall_exit_work /* If MSR_FP and MSR_VEC are set in user msr, then no need to restore */ li r7,MSR_FP @@ -217,12 +217,12 @@ system_call: /* label this so stack traces look sane */ #endif and r0,r8,r7 cmpd r0,r7 - bne syscall_restore_math + bne .Lsyscall_restore_math .Lsyscall_restore_math_cont: cmpld r3,r11 ld r5,_CCR(r1) - bge- syscall_error + bge- .Lsyscall_error .Lsyscall_error_cont: ld r7,_NIP(r1) BEGIN_FTR_SECTION @@ -248,13 +248,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) RFI b . 
/* prevent speculative execution */ -syscall_error: +.Lsyscall_error: oris r5,r5,0x1000 /* Set SO bit in CR */ neg r3,r3 std r5,_CCR(r1) b .Lsyscall_error_cont -syscall_restore_math: +.Lsyscall_restore_math: /* * Some initial tests from restore_math to avoid the heavyweight * C code entry and MSR manipulations. @@ -289,7 +289,7 @@ syscall_restore_math: b .Lsyscall_restore_math_cont /* Traced system call support */ -syscall_dotrace: +.Lsyscall_dotrace: bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl do_syscall_trace_enter @@ -322,11 +322,11 @@ syscall_dotrace: b .Lsyscall_exit -syscall_enosys: +.Lsyscall_enosys: li r3,-ENOSYS b .Lsyscall_exit -syscall_exit_work: +.Lsyscall_exit_work: #ifdef CONFIG_PPC_BOOK3S li r10,MSR_RI mtmsrd r10,1 /* Restore RI */ @@ -386,7 +386,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) b ret_from_except #ifdef CONFIG_PPC_TRANSACTIONAL_MEM -tabort_syscall: +.Ltabort_syscall: /* Firstly we need to enable TM in the kernel */ mfmsr r10 li r9, 1 @@ -412,6 +412,8 @@ tabort_syscall: rfid b . /* prevent speculative execution */ #endif +_ASM_NOKPROBE_SYMBOL(system_call_common); +_ASM_NOKPROBE_SYMBOL(system_call); /* Save non-volatile GPRs, if not already saved. */ _GLOBAL(save_nvgprs) From 266de3a842d70b481ef1dac487acfad1240deb7e Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 29 Jun 2017 23:19:17 +0530 Subject: [PATCH 145/158] powerpc/64s: Move system_call() symbol to just after setting MSR_EE It is common to get a PMU interrupt right after the mtmsr instruction that enables interrupts. Due to this, the stack trace profile gets needlessly split across system_call_common() and system_call(). Previously, system_call() symbol was at the current place to hide a few earlier symbols which have since been made private or removed entirely. So, let's move system_call() slightly higher up, right after the mtmsr instruction that enables interrupts. Convert existing references to system_call to a local syscall symbol. Suggested-by: Nicholas Piggin Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 410e19295259..e5cf98d755db 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -143,6 +143,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) mtmsrd r11,1 #endif /* CONFIG_PPC_BOOK3E */ +system_call: /* label this so stack traces look sane */ /* We do need to set SOFTE in the stack frame or the return * from interrupt will be painful */ @@ -156,7 +157,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) cmpldi 0,r0,NR_syscalls bge- .Lsyscall_enosys -system_call: /* label this so stack traces look sane */ +.Lsyscall: /* * Need to vector to 32 Bit or default sys_call_table here, * based on caller's run-mode / personality. @@ -310,13 +311,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r7,GPR7(r1) ld r8,GPR8(r1) - /* Repopulate r9 and r10 for the system_call path */ + /* Repopulate r9 and r10 for the syscall path */ addi r9,r1,STACK_FRAME_OVERHEAD CURRENT_THREAD_INFO(r10, r1) ld r10,TI_FLAGS(r10) cmpldi r0,NR_syscalls - blt+ system_call + blt+ .Lsyscall /* Return code is already in r3 thanks to do_syscall_trace_enter() */ b .Lsyscall_exit From 3639d6619c249acf3b8ecc4fd4552486f217dec0 Mon Sep 17 00:00:00 2001 From: "Naveen N. 
Rao" Date: Thu, 29 Jun 2017 23:19:18 +0530 Subject: [PATCH 146/158] powerpc/64s: Un-blacklist system_call() from kprobes It is actually safe to probe system_call() in entry_64.S, but only till we unset MSR_RI. To allow this, add a new symbol system_call_exit() after the mtmsrd and blacklist that. Suggested-by: Michael Ellerman Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index e5cf98d755db..ed45e86673c0 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -188,6 +188,18 @@ system_call: /* label this so stack traces look sane */ andi. r10,r8,MSR_RI beq- unrecov_restore #endif + +/* + * This is a few instructions into the actual syscall exit path (which actually + * starts at .Lsyscall_exit) to cater to kprobe blacklisting and to reduce the + * number of visible symbols for profiling purposes. + * + * We can probe from system_call until this point as MSR_RI is set. But once it + * is cleared below, we won't be able to take a trap. + * + * This is blacklisted from kprobes further below with _ASM_NOKPROBE_SYMBOL(). + */ +system_call_exit: /* * Disable interrupts so current_thread_info()->flags can't change, * and so that we don't get interrupted after loading SRR0/1. @@ -414,7 +426,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) b . /* prevent speculative execution */ #endif _ASM_NOKPROBE_SYMBOL(system_call_common); -_ASM_NOKPROBE_SYMBOL(system_call); +_ASM_NOKPROBE_SYMBOL(system_call_exit); /* Save non-volatile GPRs, if not already saved. */ _GLOBAL(save_nvgprs) From 15770a13bebc7fae2fbc06bc4cc708a3f724c586 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 29 Jun 2017 23:19:19 +0530 Subject: [PATCH 147/158] powerpc/64s: Blacklist functions invoked on a trap Blacklist all functions involved while handling a trap. We: - convert some of the symbols into private symbols, and - blacklist most functions involved while handling a trap. Reviewed-by: Masami Hiramatsu Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 35 +++++++++++++++++----------- arch/powerpc/kernel/exceptions-64s.S | 2 ++ arch/powerpc/kernel/traps.c | 3 +++ 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index ed45e86673c0..d5be463b2fa4 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -186,7 +186,7 @@ system_call: /* label this so stack traces look sane */ #ifdef CONFIG_PPC_BOOK3S /* No MSR:RI on BookE */ andi. r10,r8,MSR_RI - beq- unrecov_restore + beq- .Lunrecov_restore #endif /* @@ -437,6 +437,7 @@ _GLOBAL(save_nvgprs) clrrdi r0,r11,1 std r0,_TRAP(r1) blr +_ASM_NOKPROBE_SYMBOL(save_nvgprs); /* @@ -807,11 +808,11 @@ restore: ld r5,SOFTE(r1) lbz r6,PACASOFTIRQEN(r13) cmpwi cr0,r5,0 - beq restore_irq_off + beq .Lrestore_irq_off /* We are enabling, were we already enabled ? Yes, just return */ cmpwi cr0,r6,1 - beq cr0,do_restore + beq cr0,.Ldo_restore /* * We are about to soft-enable interrupts (we are hard disabled @@ -820,14 +821,14 @@ restore: */ lbz r0,PACAIRQHAPPENED(r13) cmpwi cr0,r0,0 - bne- restore_check_irq_replay + bne- .Lrestore_check_irq_replay /* * Get here when nothing happened while soft-disabled, just * soft-enable and move-on. 
We will hard-enable as a side * effect of rfi */ -restore_no_replay: +.Lrestore_no_replay: TRACE_ENABLE_INTS li r0,1 stb r0,PACASOFTIRQEN(r13); @@ -835,7 +836,7 @@ restore_no_replay: /* * Final return path. BookE is handled in a different file */ -do_restore: +.Ldo_restore: #ifdef CONFIG_PPC_BOOK3E b exception_return_book3e #else @@ -869,7 +870,7 @@ fast_exception_return: REST_8GPRS(5, r1) andi. r0,r3,MSR_RI - beq- unrecov_restore + beq- .Lunrecov_restore /* Load PPR from thread struct before we clear MSR:RI */ BEGIN_FTR_SECTION @@ -927,7 +928,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) * make sure that in this case, we also clear PACA_IRQ_HARD_DIS * or that bit can get out of sync and bad things will happen */ -restore_irq_off: +.Lrestore_irq_off: ld r3,_MSR(r1) lbz r7,PACAIRQHAPPENED(r13) andi. r0,r3,MSR_EE @@ -937,13 +938,13 @@ restore_irq_off: 1: li r0,0 stb r0,PACASOFTIRQEN(r13); TRACE_DISABLE_INTS - b do_restore + b .Ldo_restore /* * Something did happen, check if a re-emit is needed * (this also clears paca->irq_happened) */ -restore_check_irq_replay: +.Lrestore_check_irq_replay: /* XXX: We could implement a fast path here where we check * for irq_happened being just 0x01, in which case we can * clear it and return. That means that we would potentially @@ -953,7 +954,7 @@ restore_check_irq_replay: */ bl __check_irq_replay cmpwi cr0,r3,0 - beq restore_no_replay + beq .Lrestore_no_replay /* * We need to re-emit an interrupt. We do so by re-using our @@ -1002,10 +1003,18 @@ restore_check_irq_replay: #endif /* CONFIG_PPC_DOORBELL */ 1: b ret_from_except /* What else to do here ? */ -unrecov_restore: +.Lunrecov_restore: addi r3,r1,STACK_FRAME_OVERHEAD bl unrecoverable_exception - b unrecov_restore + b .Lunrecov_restore + +_ASM_NOKPROBE_SYMBOL(ret_from_except); +_ASM_NOKPROBE_SYMBOL(ret_from_except_lite); +_ASM_NOKPROBE_SYMBOL(resume_kernel); +_ASM_NOKPROBE_SYMBOL(fast_exc_return_irq); +_ASM_NOKPROBE_SYMBOL(restore); +_ASM_NOKPROBE_SYMBOL(fast_exception_return); + #ifdef CONFIG_PPC_RTAS /* diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 3a7a456ab95c..4c18a5fbb4bb 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1594,6 +1594,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) 1: addi r3,r1,STACK_FRAME_OVERHEAD bl kernel_bad_stack b 1b +_ASM_NOKPROBE_SYMBOL(bad_stack); /* * When doorbell is triggered from system reset wakeup, the message is @@ -1655,3 +1656,4 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) 1: blr +_ASM_NOKPROBE_SYMBOL(__replay_interrupt) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index d4e545d27ef9..bfcfd9ef09f2 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -237,6 +237,7 @@ void die(const char *str, struct pt_regs *regs, long err) err = 0; oops_end(flags, regs, err); } +NOKPROBE_SYMBOL(die); void user_single_step_siginfo(struct task_struct *tsk, struct pt_regs *regs, siginfo_t *info) @@ -1968,6 +1969,7 @@ void unrecoverable_exception(struct pt_regs *regs) regs->trap, regs->nip); die("Unrecoverable exception", regs, SIGABRT); } +NOKPROBE_SYMBOL(unrecoverable_exception); #if defined(CONFIG_BOOKE_WDT) || defined(CONFIG_40x) /* @@ -1998,6 +2000,7 @@ void kernel_bad_stack(struct pt_regs *regs) regs->gpr[1], regs->nip); die("Bad kernel stack pointer", regs, SIGABRT); } +NOKPROBE_SYMBOL(kernel_bad_stack); void __init trap_init(void) { From 90653a84052cfbbc1f46427f851dad14083e36df Mon Sep 17 00:00:00 2001 From: "Naveen N. 
Rao" Date: Thu, 29 Jun 2017 23:19:20 +0530 Subject: [PATCH 148/158] powerpc/64s: Blacklist rtas entry/exit from kprobes We can't take traps with relocation off, so blacklist enter_rtas() and rtas_return_loc(). However, instead of blacklisting all of enter_rtas(), introduce a new symbol __enter_rtas from where on we can't take a trap and blacklist that. Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index d5be463b2fa4..49d8422767b4 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -1091,6 +1091,8 @@ _GLOBAL(enter_rtas) rldicr r9,r9,MSR_SF_LG,(63-MSR_SF_LG) ori r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI|MSR_LE andc r6,r0,r9 + +__enter_rtas: sync /* disable interrupts so SRR0/1 */ mtmsrd r0 /* don't get trashed */ @@ -1127,6 +1129,8 @@ rtas_return_loc: mtspr SPRN_SRR1,r4 rfid b . /* prevent speculative execution */ +_ASM_NOKPROBE_SYMBOL(__enter_rtas) +_ASM_NOKPROBE_SYMBOL(rtas_return_loc) .align 3 1: .llong rtas_restore_regs From e71ff982ae4c17d176e9f0132157d54973788377 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Thu, 29 Jun 2017 03:04:07 +1000 Subject: [PATCH 149/158] powerpc/pseries: Fix passing of pp0 in updatepp() and updateboltedpp() Once upon a time there were only two PP (page protection) bits. In ISA 2.03 an additional PP bit was added, but because of the layout of the HPTE it could not be made contiguous with the existing PP bits. The result is that we now have three PP bits, named pp0, pp1, pp2, where pp0 occupies bit 63 of dword 1 of the HPTE and pp1 and pp2 occupy bits 1 and 0 respectively. Until recently Linux hasn't used pp0, however with the addition of _PAGE_KERNEL_RO we started using it. The problem arises in the LPAR code, where we need to translate the PP bits into the argument for the H_PROTECT hypercall. Currently the code only passes bits 0-2 of newpp, which covers pp1, pp2 and N (no execute), meaning pp0 is not passed to the hypervisor at all. We can't simply pass it through in bit 63, as that would collide with a different field in the flags argument, as defined in PAPR. Instead we have to shift it down to bit 8 (IBM bit 55). 
Fixes: e58e87adc8bf ("powerpc/mm: Update _PAGE_KERNEL_RO") Cc: stable@vger.kernel.org # v4.7+ Signed-off-by: Balbir Singh [mpe: Simplify the test, rework change log] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/lpar.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 6541d0b03e4c..495ba4e7336d 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -301,7 +301,7 @@ static long pSeries_lpar_hpte_updatepp(unsigned long slot, int ssize, unsigned long inv_flags) { unsigned long lpar_rc; - unsigned long flags = (newpp & 7) | H_AVPN; + unsigned long flags; unsigned long want_v; want_v = hpte_encode_avpn(vpn, psize, ssize); @@ -309,6 +309,11 @@ static long pSeries_lpar_hpte_updatepp(unsigned long slot, pr_devel(" update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...", want_v, slot, flags, psize); + flags = (newpp & 7) | H_AVPN; + if (mmu_has_feature(MMU_FTR_KERNEL_RO)) + /* Move pp0 into bit 8 (IBM 55) */ + flags |= (newpp & HPTE_R_PP0) >> 55; + lpar_rc = plpar_pte_protect(flags, slot, want_v); if (lpar_rc == H_NOT_FOUND) { @@ -380,6 +385,10 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, BUG_ON(slot == -1); flags = newpp & 7; + if (mmu_has_feature(MMU_FTR_KERNEL_RO)) + /* Move pp0 into bit 8 (IBM 55) */ + flags |= (newpp & HPTE_R_PP0) >> 55; + lpar_rc = plpar_pte_protect(flags, slot, 0); BUG_ON(lpar_rc != H_SUCCESS); From 7f6d498ed3354740cfd100e4aa99e388f1a95be7 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Thu, 29 Jun 2017 03:04:10 +1000 Subject: [PATCH 150/158] powerpc/mm/radix: Fix execute permissions for interrupt_vectors Commit 9abcc981de97 ("powerpc/mm/radix: Only add X for pages overlapping kernel text") changed the linear mapping on Radix to only mark the kernel text executable. However if the kernel is run relocated, for example as a kdump kernel, then the exception vectors are split from the kernel text, ie. they remain at real address 0. We tend to get away with it, because the kernel itself will usually be below 1G, which means the 1G page at 0-1G is marked executable and everything works OK. However if the kernel is loaded above 1G, or the system has less than 1G in total (meaning we can't use a 1G page), then the exception vectors will not be marked executable and the kernel will fail to boot. Fix it by also checking if the address range overlaps the exception vectors when deciding if we should add PAGE_KERNEL_X. 
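The added check is a standard half-open interval overlap test. As a minimal sketch (ranges_overlap() is a hypothetical helper shown for illustration; the real overlaps_kernel_text() and overlaps_interrupt_vector_text() helpers compare against the relevant section bounds):

	/* Two half-open ranges [s1, e1) and [s2, e2) overlap iff each
	 * range starts before the other one ends. */
	static inline bool ranges_overlap(unsigned long s1, unsigned long e1,
					  unsigned long s2, unsigned long e2)
	{
		return s1 < e2 && s2 < e1;
	}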
Fixes: 9abcc981de97 ("powerpc/mm/radix: Only add X for pages overlapping kernel text") Cc: stable@vger.kernel.org # v4.7+ Signed-off-by: Balbir Singh [mpe: Combine with the existing check, rewrite change log] Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-radix.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index f6af90371b1e..1342859552b1 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -150,7 +150,8 @@ static int __meminit create_physical_mapping(unsigned long start, vaddr = (unsigned long)__va(addr); - if (overlaps_kernel_text(vaddr, vaddr + mapping_size)) + if (overlaps_kernel_text(vaddr, vaddr + mapping_size) || + overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) prot = PAGE_KERNEL_X; else prot = PAGE_KERNEL; From d07df82c43be82ab6972662180e89e6ba2a828ad Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 6 Jun 2017 14:29:38 +1000 Subject: [PATCH 151/158] powerpc/kprobes: Move kprobes over to patch_instruction() arch_arm/disarm_probe() use direct assignment for copying instructions, replace them with patch_instruction(). We don't need to call flush_icache_range() because patch_instruction() does it for us. Signed-off-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/kprobes.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 01addfb0ed0a..45f1ff721c32 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -164,17 +164,13 @@ NOKPROBE_SYMBOL(arch_prepare_kprobe); void arch_arm_kprobe(struct kprobe *p) { - *p->addr = BREAKPOINT_INSTRUCTION; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + patch_instruction(p->addr, BREAKPOINT_INSTRUCTION); } NOKPROBE_SYMBOL(arch_arm_kprobe); void arch_disarm_kprobe(struct kprobe *p) { - *p->addr = p->opcode; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + patch_instruction(p->addr, p->opcode); } NOKPROBE_SYMBOL(arch_disarm_kprobe); From f3eca956389316acd1a132fad1ad0b6f2ca78a61 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 6 Jun 2017 14:29:39 +1000 Subject: [PATCH 152/158] powerpc/kprobes/optprobes: Use patch_instruction() So that we can implement STRICT_RWX, use patch_instruction() in optprobes. 
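For comparison, a minimal sketch of the two styles, using PPC_INST_NOP as a stand-in instruction (the exact call sites are in the diff below):

	/* Before: store directly into kernel text, then flush by hand. */
	*addr = PPC_INST_NOP;
	flush_icache_range((unsigned long)addr,
			   (unsigned long)addr + sizeof(unsigned int));

	/* After: patch_instruction() performs the store (via an alternate
	 * mapping once STRICT_RWX is in place) and the icache flush. */
	patch_instruction(addr, PPC_INST_NOP);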
Signed-off-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/optprobes.c | 53 ++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index ec60ed0d4aad..6f8273f5e988 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -158,12 +158,13 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr) { /* addis r4,0,(insn)@h */ - *addr++ = PPC_INST_ADDIS | ___PPC_RT(4) | - ((val >> 16) & 0xffff); + patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(4) | + ((val >> 16) & 0xffff)); + addr++; /* ori r4,r4,(insn)@l */ - *addr = PPC_INST_ORI | ___PPC_RA(4) | ___PPC_RS(4) | - (val & 0xffff); + patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(4) | + ___PPC_RS(4) | (val & 0xffff)); } /* @@ -173,24 +174,28 @@ void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr) void patch_imm64_load_insns(unsigned long val, kprobe_opcode_t *addr) { /* lis r3,(op)@highest */ - *addr++ = PPC_INST_ADDIS | ___PPC_RT(3) | - ((val >> 48) & 0xffff); + patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(3) | + ((val >> 48) & 0xffff)); + addr++; /* ori r3,r3,(op)@higher */ - *addr++ = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) | - ((val >> 32) & 0xffff); + patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) | + ___PPC_RS(3) | ((val >> 32) & 0xffff)); + addr++; /* rldicr r3,r3,32,31 */ - *addr++ = PPC_INST_RLDICR | ___PPC_RA(3) | ___PPC_RS(3) | - __PPC_SH64(32) | __PPC_ME64(31); + patch_instruction(addr, PPC_INST_RLDICR | ___PPC_RA(3) | + ___PPC_RS(3) | __PPC_SH64(32) | __PPC_ME64(31)); + addr++; /* oris r3,r3,(op)@h */ - *addr++ = PPC_INST_ORIS | ___PPC_RA(3) | ___PPC_RS(3) | - ((val >> 16) & 0xffff); + patch_instruction(addr, PPC_INST_ORIS | ___PPC_RA(3) | + ___PPC_RS(3) | ((val >> 16) & 0xffff)); + addr++; /* ori r3,r3,(op)@l */ - *addr = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) | - (val & 0xffff); + patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) | + ___PPC_RS(3) | (val & 0xffff)); } int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) @@ -198,7 +203,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) kprobe_opcode_t *buff, branch_op_callback, branch_emulate_step; kprobe_opcode_t *op_callback_addr, *emulate_step_addr; long b_offset; - unsigned long nip; + unsigned long nip, size; + int rc, i; kprobe_ppc_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; @@ -231,8 +237,14 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) goto error; /* Setup template */ - memcpy(buff, optprobe_template_entry, - TMPL_END_IDX * sizeof(kprobe_opcode_t)); + /* We can optimize this via patch_instruction_window later */ + size = (TMPL_END_IDX * sizeof(kprobe_opcode_t)) / sizeof(int); + pr_devel("Copying template to %p, size %lu\n", buff, size); + for (i = 0; i < size; i++) { + rc = patch_instruction(buff + i, *(optprobe_template_entry + i)); + if (rc < 0) + goto error; + } /* * Fixup the template with instructions to: @@ -261,8 +273,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) if (!branch_op_callback || !branch_emulate_step) goto error; - buff[TMPL_CALL_HDLR_IDX] = branch_op_callback; - buff[TMPL_EMULATE_IDX] = branch_emulate_step; + patch_instruction(buff + TMPL_CALL_HDLR_IDX, branch_op_callback); + patch_instruction(buff + TMPL_EMULATE_IDX, branch_emulate_step); /* * 
3. load instruction to be emulated into relevant register, and @@ -272,8 +284,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) /* * 4. branch back from trampoline */ - buff[TMPL_RET_IDX] = create_branch((unsigned int *)buff + TMPL_RET_IDX, - (unsigned long)nip, 0); + patch_branch(buff + TMPL_RET_IDX, (unsigned long)nip, 0); flush_icache_range((unsigned long)buff, (unsigned long)(&buff[TMPL_END_IDX])); From efe4fbb1ac3a6489c1dc7d31b51ecb7425807b1b Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 27 Jun 2017 17:48:58 +1000 Subject: [PATCH 153/158] powerpc/xmon: Add patch_instruction() support for xmon Move from mwrite() to patch_instruction() for xmon for breakpoint addition and removal. Signed-off-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/xmon/xmon.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index a728e1919613..08e367e3e8c3 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -53,6 +53,7 @@ #include #include #include +#include <asm/code-patching.h> #ifdef CONFIG_PPC64 #include @@ -837,7 +838,8 @@ static void insert_bpts(void) store_inst(&bp->instr[0]); if (bp->enabled & BP_CIABR) continue; - if (mwrite(bp->address, &bpinstr, 4) != 4) { + if (patch_instruction((unsigned int *)bp->address, + bpinstr) != 0) { printf("Couldn't write instruction at %lx, " "disabling breakpoint there\n", bp->address); bp->enabled &= ~BP_TRAP; @@ -874,7 +876,8 @@ static void remove_bpts(void) continue; if (mread(bp->address, &instr, 4) == 4 && instr == bpinstr - && mwrite(bp->address, &bp->instr, 4) != 4) + && patch_instruction( + (unsigned int *)bp->address, bp->instr[0]) != 0) printf("Couldn't remove breakpoint at %lx\n", bp->address); else From 37bc3e5fd764fb258ff4fcbb90b6d1b67fb466c1 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Thu, 29 Jun 2017 03:04:05 +1000 Subject: [PATCH 154/158] powerpc/lib/code-patching: Use alternate map for patch_instruction() This patch creates the window using text_poke_area, allocated via get_vm_area(). text_poke_area is per CPU to avoid locking. text_poke_area for each CPU is set up using a late_initcall; prior to the setup of these alternate mapping areas, we continue to use direct writes to change/modify kernel text. The ability to use alternate mappings to write to kernel text gives us the freedom to then turn text read-only and implement CONFIG_STRICT_KERNEL_RWX. This code is CPU hotplug aware to ensure that we have mappings for any new CPUs as they come online, and to tear down the mappings for any CPUs that go offline.
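The hotplug awareness comes from a dynamic CPU hotplug state; in sketch form (the callbacks are the ones added in the diff below):

	/* text_area_cpu_up() allocates a text_poke_area for each CPU as it
	 * comes online; text_area_cpu_down() frees it again before the CPU
	 * goes offline. */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/text_poke:online",
				text_area_cpu_up, text_area_cpu_down);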
Signed-off-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/lib/code-patching.c | 171 ++++++++++++++++++++++++++++++- 1 file changed, 167 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 500b0f6a0b64..c9de03e0c1f1 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -12,23 +12,186 @@ #include #include #include -#include -#include +#include +#include #include #include +#include +#include +#include +#include -int patch_instruction(unsigned int *addr, unsigned int instr) +static int __patch_instruction(unsigned int *addr, unsigned int instr) { int err; __put_user_size(instr, addr, 4, err); if (err) return err; - asm ("dcbst 0, %0; sync; icbi 0,%0; sync; isync" : : "r" (addr)); + + asm ("dcbst 0, %0; sync; icbi 0,%0; sync; isync" :: "r" (addr)); + return 0; } +#ifdef CONFIG_STRICT_KERNEL_RWX +static DEFINE_PER_CPU(struct vm_struct *, text_poke_area); + +static int text_area_cpu_up(unsigned int cpu) +{ + struct vm_struct *area; + + area = get_vm_area(PAGE_SIZE, VM_ALLOC); + if (!area) { + WARN_ONCE(1, "Failed to create text area for cpu %d\n", + cpu); + return -1; + } + this_cpu_write(text_poke_area, area); + + return 0; +} + +static int text_area_cpu_down(unsigned int cpu) +{ + free_vm_area(this_cpu_read(text_poke_area)); + return 0; +} + +/* + * Run as a late init call. This allows all the boot time patching to be done + * simply by patching the code, and then we're called here prior to + * mark_rodata_ro(), which happens after all init calls are run. Although + * BUG_ON() is rude, in this case it should only happen if ENOMEM, and we judge + * it as being preferable to a kernel that will crash later when someone tries + * to use patch_instruction(). + */ +static int __init setup_text_poke_area(void) +{ + BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "powerpc/text_poke:online", text_area_cpu_up, + text_area_cpu_down)); + + return 0; +} +late_initcall(setup_text_poke_area); + +/* + * This can be called for kernel text or a module. + */ +static int map_patch_area(void *addr, unsigned long text_poke_addr) +{ + unsigned long pfn; + int err; + + if (is_vmalloc_addr(addr)) + pfn = vmalloc_to_pfn(addr); + else + pfn = __pa_symbol(addr) >> PAGE_SHIFT; + + err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), + pgprot_val(PAGE_KERNEL)); + + pr_devel("Mapped addr %lx with pfn %lx:%d\n", text_poke_addr, pfn, err); + if (err) + return -1; + + return 0; +} + +static inline int unmap_patch_area(unsigned long addr) +{ + pte_t *ptep; + pmd_t *pmdp; + pud_t *pudp; + pgd_t *pgdp; + + pgdp = pgd_offset_k(addr); + if (unlikely(!pgdp)) + return -EINVAL; + + pudp = pud_offset(pgdp, addr); + if (unlikely(!pudp)) + return -EINVAL; + + pmdp = pmd_offset(pudp, addr); + if (unlikely(!pmdp)) + return -EINVAL; + + ptep = pte_offset_kernel(pmdp, addr); + if (unlikely(!ptep)) + return -EINVAL; + + pr_devel("clearing mm %p, pte %p, addr %lx\n", &init_mm, ptep, addr); + + /* + * In hash, pte_clear flushes the tlb, in radix, we have to + */ + pte_clear(&init_mm, addr, ptep); + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + + return 0; +} + +int patch_instruction(unsigned int *addr, unsigned int instr) +{ + int err; + unsigned int *dest = NULL; + unsigned long flags; + unsigned long text_poke_addr; + unsigned long kaddr = (unsigned long)addr; + + /* + * During early early boot patch_instruction is called + * when text_poke_area is not ready, but we still need + * to allow patching. 
We just do the plain old patching. + * We use slab_is_available() and a per-CPU read via this_cpu_read() + * of text_poke_area. Per-CPU areas might not be up this early, + * which can create problems with just using this_cpu_read(). + */ + if (!slab_is_available() || !this_cpu_read(text_poke_area)) + return __patch_instruction(addr, instr); + + local_irq_save(flags); + + text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr; + if (map_patch_area(addr, text_poke_addr)) { + err = -1; + goto out; + } + + dest = (unsigned int *)(text_poke_addr) + + ((kaddr & ~PAGE_MASK) / sizeof(unsigned int)); + + /* + * We use __put_user_size so that we can handle faults while + * writing to dest and return err to handle faults gracefully + */ + __put_user_size(instr, dest, 4, err); + if (!err) + asm ("dcbst 0, %0; sync; icbi 0,%0; icbi 0,%1; sync; isync" + ::"r" (dest), "r"(addr)); + + err = unmap_patch_area(text_poke_addr); + if (err) + pr_warn("failed to unmap %lx\n", text_poke_addr); + +out: + local_irq_restore(flags); + + return err; +} +#else /* !CONFIG_STRICT_KERNEL_RWX */ + +int patch_instruction(unsigned int *addr, unsigned int instr) +{ + return __patch_instruction(addr, instr); +} + +#endif /* CONFIG_STRICT_KERNEL_RWX */ +NOKPROBE_SYMBOL(patch_instruction); + int patch_branch(unsigned int *addr, unsigned long target, int flags) { return patch_instruction(addr, create_branch(addr, target, flags)); From d924cc3feda9c2bea8164930899f367ce249cbbf Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Thu, 29 Jun 2017 03:04:06 +1000 Subject: [PATCH 155/158] powerpc/vmlinux.lds: Align __init_begin to 16M For CONFIG_STRICT_KERNEL_RWX align __init_begin to 16M. We use 16M since it's the larger of the 2M used on radix and the 16M used on hash for our linear mapping. The plan is to have .text, .rodata and everything up to __init_begin marked as RX. Note we still have executable read-only data. We could further align rodata to another 16M boundary. I've kept text plus rodata as read-only-executable as a trade-off, rather than doing read-only-executable for text and read-only for rodata. We don't use multiple PT_LOAD entries in PHDRS because we are not sure all bootloaders support them. This patch keeps the PHDRS in vmlinux.lds.S the same as they were, with just one PT_LOAD for all of the kernel, marked as RWX (7). mpe: What this means is the added alignment bloats the resulting binary on disk, a powernv kernel goes from 17M to 22M. Signed-off-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/vmlinux.lds.S | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index ace6b6579961..b1a250560198 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -8,6 +8,12 @@ #include #include +#ifdef CONFIG_STRICT_KERNEL_RWX +#define STRICT_ALIGN_SIZE (1 << 24) +#else +#define STRICT_ALIGN_SIZE PAGE_SIZE +#endif + ENTRY(_stext) PHDRS { @@ -123,7 +129,7 @@ SECTIONS PROVIDE32 (etext = .); /* Read-only data */ - RODATA + RO_DATA(PAGE_SIZE) EXCEPTION_TABLE(0) @@ -140,7 +146,7 @@ SECTIONS /* * Init sections discarded at runtime */ - . = ALIGN(PAGE_SIZE); + . = ALIGN(STRICT_ALIGN_SIZE); __init_begin = .; INIT_TEXT_SECTION(PAGE_SIZE) :kernel From cd65d69713349fc7b33fa9de2b32989b99c9fb39 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Thu, 29 Jun 2017 03:04:08 +1000 Subject: [PATCH 156/158] powerpc/mm/hash: Implement mark_rodata_ro() for hash With hash we update the bolted pte to mark it read-only.
From cd65d69713349fc7b33fa9de2b32989b99c9fb39 Mon Sep 17 00:00:00 2001
From: Balbir Singh
Date: Thu, 29 Jun 2017 03:04:08 +1000
Subject: [PATCH 156/158] powerpc/mm/hash: Implement mark_rodata_ro() for hash

With hash we update the bolted pte to mark it read-only. We rely on
MMU_FTR_KERNEL_RO to generate the correct permissions for read-only
text. The radix implementation just prints a warning for now.

Signed-off-by: Balbir Singh
[mpe: Make the warning louder when we don't have MMU_FTR_KERNEL_RO]
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/book3s/64/hash.h  |  3 ++
 arch/powerpc/include/asm/book3s/64/radix.h |  4 +++
 arch/powerpc/mm/pgtable-hash64.c           | 36 ++++++++++++++++++++++
 arch/powerpc/mm/pgtable_64.c               | 13 ++++++++
 4 files changed, 56 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 4e957b027fe0..0ce513f2926f 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -89,6 +89,9 @@ static inline int hash__pgd_bad(pgd_t pgd)
 {
 	return (pgd_val(pgd) == 0);
 }
+#ifdef CONFIG_STRICT_KERNEL_RWX
+extern void hash__mark_rodata_ro(void);
+#endif

 extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 			    pte_t *ptep, unsigned long pte, int huge);
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index ba43754e96d2..487709ff6875 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -116,6 +116,10 @@
 #define RADIX_PUD_TABLE_SIZE (sizeof(pud_t) << RADIX_PUD_INDEX_SIZE)
 #define RADIX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE)

+#ifdef CONFIG_STRICT_KERNEL_RWX
+extern void radix__mark_rodata_ro(void);
+#endif
+
 static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr,
 					       unsigned long set)
diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
index a0facee58811..188b4107584d 100644
--- a/arch/powerpc/mm/pgtable-hash64.c
+++ b/arch/powerpc/mm/pgtable-hash64.c
@@ -11,8 +11,12 @@

 #include
 #include
+#include

 #include
+#include
+#include
+#include
 #include

 #include "mmu_decl.h"
@@ -419,3 +423,35 @@ int hash__has_transparent_hugepage(void)
 	return 1;
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void hash__mark_rodata_ro(void)
+{
+	unsigned long start = (unsigned long)_stext;
+	unsigned long end = (unsigned long)__init_begin;
+	unsigned long idx;
+	unsigned int step, shift;
+	unsigned long newpp = PP_RXXX;
+
+	shift = mmu_psize_defs[mmu_linear_psize].shift;
+	step = 1 << shift;
+
+	start = ((start + step - 1) >> shift) << shift;
+	end = (end >> shift) << shift;
+
+	pr_devel("marking ro start %lx, end %lx, step %x\n",
+			start, end, step);
+
+	if (start == end) {
+		pr_warn("could not set rodata ro, relocate the start"
+			" of the kernel to a 0x%x boundary\n", step);
+		return;
+	}
+
+	for (idx = start; idx < end; idx += step)
+		/* Not sure if we can do much with the return value */
+		mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
+							mmu_kernel_ssize);
+
+}
+#endif
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index bce0ed50789c..fc2e6def2eb7 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -491,3 +491,16 @@ void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
 }
 EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
 #endif /* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void mark_rodata_ro(void)
+{
+	if (!mmu_has_feature(MMU_FTR_KERNEL_RO)) {
+		pr_warn("Warning: Unable to mark rodata read only on this CPU.\n");
+		return;
+	}
+
+	if (!radix_enabled())
+		hash__mark_rodata_ro();
+}
+#endif
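Since hash can only change protections at the granularity of the
bolted linear-mapping page, the rounding above trims the candidate
range inward. A hedged restatement (hash_ro_range() and its parameter
names are invented for illustration):

    /* With a 16M linear page size (shift == 24), round the start of text
     * up and the start of init down to whole pages; only full pages
     * between the two can be made read-only. */
    static void hash_ro_range(unsigned long text_start,
                              unsigned long init_begin, unsigned int shift)
    {
            unsigned long step = 1UL << shift;
            unsigned long start = ((text_start + step - 1) >> shift) << shift;
            unsigned long end = (init_begin >> shift) << shift;

            if (start >= end)
                    return; /* less than one whole bolted page to protect */

            /* ... loop calling mmu_hash_ops.hpte_updateboltedpp() as above ... */
    }

This is why the 16M alignment of __init_begin in the previous patch
matters: without it, start can round up past end and the bolted text
stays writable.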
7614ff3272a115a047139173cc04466e8132a1f2 Mon Sep 17 00:00:00 2001
From: Balbir Singh
Date: Thu, 29 Jun 2017 03:04:09 +1000
Subject: [PATCH 157/158] powerpc/mm/radix: Implement STRICT_RWX/mark_rodata_ro() for Radix

The Radix linear mapping code (create_physical_mapping()) tries to use
the largest page size it can at each step. Currently the only reason it
steps down to a smaller page size is if the start addr is unaligned
(never happens in practice), or the end of memory is not aligned to a
huge page boundary.

To support STRICT_RWX we need to break the mapping at __init_begin, so
that the text and rodata prior to that can be marked R_X and the
regular pages after can be marked RW.

Having done that we can now implement mark_rodata_ro() for Radix,
knowing that we won't need to split any mappings.

Signed-off-by: Balbir Singh
[mpe: Split down to PAGE_SIZE, not 2MB, rewrite change log]
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/pgtable-radix.c | 67 ++++++++++++++++++++++++++++++++-
 arch/powerpc/mm/pgtable_64.c    |  4 +-
 2 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 1342859552b1..8c13e4282308 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include

 #include
 #include
@@ -110,6 +111,49 @@ set_the_pte:
 	return 0;
 }

+#ifdef CONFIG_STRICT_KERNEL_RWX
+void radix__mark_rodata_ro(void)
+{
+	unsigned long start = (unsigned long)_stext;
+	unsigned long end = (unsigned long)__init_begin;
+	unsigned long idx;
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep;
+
+	start = ALIGN_DOWN(start, PAGE_SIZE);
+	end = PAGE_ALIGN(end); // aligns up
+
+	pr_devel("marking ro start %lx, end %lx\n", start, end);
+
+	for (idx = start; idx < end; idx += PAGE_SIZE) {
+		pgdp = pgd_offset_k(idx);
+		pudp = pud_alloc(&init_mm, pgdp, idx);
+		if (!pudp)
+			continue;
+		if (pud_huge(*pudp)) {
+			ptep = (pte_t *)pudp;
+			goto update_the_pte;
+		}
+		pmdp = pmd_alloc(&init_mm, pudp, idx);
+		if (!pmdp)
+			continue;
+		if (pmd_huge(*pmdp)) {
+			ptep = pmdp_ptep(pmdp);
+			goto update_the_pte;
+		}
+		ptep = pte_alloc_kernel(pmdp, idx);
+		if (!ptep)
+			continue;
+update_the_pte:
+		radix__pte_update(&init_mm, idx, ptep, _PAGE_WRITE, 0, 0);
+	}
+
+	radix__flush_tlb_kernel_range(start, end);
+}
+#endif /* CONFIG_STRICT_KERNEL_RWX */
+
 static inline void __meminit print_mapping(unsigned long start,
 					   unsigned long end,
 					   unsigned long size)
@@ -125,6 +169,12 @@ static int __meminit create_physical_mapping(unsigned long start,
 {
 	unsigned long vaddr, addr, mapping_size = 0;
 	pgprot_t prot;
+	unsigned long max_mapping_size;
+#ifdef CONFIG_STRICT_KERNEL_RWX
+	int split_text_mapping = 1;
+#else
+	int split_text_mapping = 0;
+#endif

 	start = _ALIGN_UP(start, PAGE_SIZE);
 	for (addr = start; addr < end; addr += mapping_size) {
@@ -133,9 +183,12 @@ static int __meminit create_physical_mapping(unsigned long start,
 		gap = end - addr;
 		previous_size = mapping_size;
+		max_mapping_size = PUD_SIZE;

+retry:
 		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
-		    mmu_psize_defs[MMU_PAGE_1G].shift)
+		    mmu_psize_defs[MMU_PAGE_1G].shift &&
+		    PUD_SIZE <= max_mapping_size)
 			mapping_size = PUD_SIZE;
 		else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
 			 mmu_psize_defs[MMU_PAGE_2M].shift)
@@ -143,6 +196,18 @@ static int __meminit create_physical_mapping(unsigned long start,
 		else
 			mapping_size = PAGE_SIZE;

+		if (split_text_mapping && (mapping_size == PUD_SIZE) &&
+			(addr <= __pa_symbol(__init_begin)) &&
+			(addr + mapping_size) >= __pa_symbol(_stext)) {
+			max_mapping_size = PMD_SIZE;
+			goto retry;
+		}
+
+		if (split_text_mapping && (mapping_size == PMD_SIZE) &&
+			(addr <= __pa_symbol(__init_begin)) &&
+			(addr + mapping_size) >= __pa_symbol(_stext))
+			mapping_size = PAGE_SIZE;
+
 		if (mapping_size != previous_size) {
 			print_mapping(start, addr, previous_size);
 			start = addr;
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index fc2e6def2eb7..5c0b795d656c 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -500,7 +500,9 @@ void mark_rodata_ro(void)
 		return;
 	}

-	if (!radix_enabled())
+	if (radix_enabled())
+		radix__mark_rodata_ro();
+	else
 		hash__mark_rodata_ro();
 }
 #endif
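The retry logic above amounts to: pick the largest naturally aligned
page size that fits the remaining gap, but never let a huge mapping
straddle the [_stext, __init_begin) region. A hedged restatement as a
helper (choose_mapping_size() is an invented name, not in the patch):

    static unsigned long choose_mapping_size(unsigned long addr,
                                             unsigned long gap,
                                             unsigned long text,
                                             unsigned long init)
    {
            const unsigned long sizes[] = { PUD_SIZE, PMD_SIZE, PAGE_SIZE };
            unsigned int i;

            for (i = 0; i < ARRAY_SIZE(sizes); i++) {
                    unsigned long size = sizes[i];
                    /* Would [addr, addr + size] touch the text region? */
                    bool hits_text = addr <= init && addr + size >= text;

                    if (IS_ALIGNED(addr, size) && gap >= size &&
                        !(hits_text && size > PAGE_SIZE))
                            return size;
            }
            return PAGE_SIZE;
    }

The descent stops at PAGE_SIZE around the text boundary precisely so
that mark_rodata_ro() never has to split an existing huge mapping.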
From 1e0fc9d1eb2b0241a03e0a02bcdb9b5b641b9d35 Mon Sep 17 00:00:00 2001
From: Balbir Singh
Date: Thu, 29 Jun 2017 03:04:11 +1000
Subject: [PATCH 158/158] powerpc/Kconfig: Enable STRICT_KERNEL_RWX for some configs

All code that patches kernel text has been moved over to using
patch_instruction(), and patch_instruction() is able to cope with the
kernel text being read only.

The linker script has been updated to ensure the read-only data ends
on a large page boundary, so it and the preceding kernel text can be
marked R_X. We also have implementations of mark_rodata_ro() for Hash
and Radix MMU modes.

There are some corner cases still missing when the kernel is built
relocatable, so for now make it depend on !RELOCATABLE. There's also a
temporary workaround to depend on !HIBERNATION to avoid a build
failure, which will be removed once we've merged with the PM tree.

Signed-off-by: Balbir Singh
[mpe: Make it depend on !RELOCATABLE, munge change log]
Signed-off-by: Michael Ellerman
---
 arch/powerpc/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5ef763853876..8998eefe1638 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -164,6 +164,8 @@ config PPC
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if COMPAT
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
+	select ARCH_HAS_STRICT_KERNEL_RWX	if (PPC_BOOK3S_64 && !RELOCATABLE && !HIBERNATION)
+	select ARCH_OPTIONAL_KERNEL_RWX		if ARCH_HAS_STRICT_KERNEL_RWX
 	select HAVE_CBPF_JIT			if !PPC64
 	select HAVE_CONTEXT_TRACKING		if PPC64
 	select HAVE_DEBUG_KMEMLEAK
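With the series applied and CONFIG_STRICT_KERNEL_RWX=y, a stray store
into kernel text should fault instead of silently corrupting
instructions; the lkdtm module's WRITE_KERN test exercises exactly
this. A hedged smoke test, loaded as a module after boot so it runs
after mark_rodata_ro() (rwx_sanity is an invented helper, not part of
the series):

    #include <linux/module.h>
    #include <linux/kallsyms.h>
    #include <linux/uaccess.h>

    static int __init rwx_sanity_init(void)
    {
            /* _stext is not exported to modules, so look it up. */
            unsigned int *text = (unsigned int *)kallsyms_lookup_name("_stext");
            unsigned int insn;

            if (!text)
                    return -ENODEV;

            /* Rewrite the first text word with its own value: the store
             * should fault and probe_kernel_write() should return an error. */
            insn = *text;
            if (probe_kernel_write(text, &insn, sizeof(insn)))
                    pr_info("kernel text is read-only as expected\n");
            else
                    pr_warn("kernel text is still writable\n");

            return 0;
    }
    module_init(rwx_sanity_init);
    MODULE_LICENSE("GPL");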