1
0
Fork 0
alistair23-linux/arch/powerpc/platforms/pseries/setup.c

1044 lines
27 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 64-bit pSeries and RS/6000 setup code.
*
* Copyright (C) 1995 Linus Torvalds
* Adapted from 'alpha' version by Gary Thomas
* Modified by Cort Dougan (cort@cs.nmt.edu)
* Modified by PPC64 Team, IBM Corp
*/
/*
* bootup setup stuff..
*/
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/user.h>
#include <linux/tty.h>
#include <linux/major.h>
#include <linux/interrupt.h>
#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/console.h>
#include <linux/pci.h>
#include <linux/utsname.h>
#include <linux/adb.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/irq.h>
#include <linux/seq_file.h>
#include <linux/root_dev.h>
#include <linux/of.h>
#include <linux/of_pci.h>
powerpc/pseries: Defer the logging of rtas error to irq work queue. rtas_log_buf is a buffer to hold RTAS event data that are communicated to kernel by hypervisor. This buffer is then used to pass RTAS event data to user through proc fs. This buffer is allocated from vmalloc (non-linear mapping) area. On Machine check interrupt, register r3 points to RTAS extended event log passed by hypervisor that contains the MCE event. The pseries machine check handler then logs this error into rtas_log_buf. The rtas_log_buf is a vmalloc-ed (non-linear) buffer we end up taking up a page fault (vector 0x300) while accessing it. Since machine check interrupt handler runs in NMI context we can not afford to take any page fault. Page faults are not honored in NMI context and causes kernel panic. Apart from that, as Nick pointed out, pSeries_log_error() also takes a spin_lock while logging error which is not safe in NMI context. It may endup in deadlock if we get another MCE before releasing the lock. Fix this by deferring the logging of rtas error to irq work queue. Current implementation uses two different buffers to hold rtas error log depending on whether extended log is provided or not. This makes bit difficult to identify which buffer has valid data that needs to logged later in irq work. Simplify this using single buffer, one per paca, and copy rtas log to it irrespective of whether extended log is provided or not. Allocate this buffer below RMA region so that it can be accessed in real mode mce handler. Fixes: b96672dd840f ("powerpc: Machine check interrupt is a non-maskable interrupt") Cc: stable@vger.kernel.org # v4.14+ Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-07-04 11:57:21 -06:00
#include <linux/memblock.h>
#include <linux/swiotlb.h>
#include <asm/mmu.h>
#include <asm/processor.h>
#include <asm/io.h>
#include <asm/pgtable.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/pci-bridge.h>
#include <asm/iommu.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/irq.h>
#include <asm/time.h>
#include <asm/nvram.h>
#include <asm/pmc.h>
#include <asm/xics.h>
powerpc/xive: guest exploitation of the XIVE interrupt controller This is the framework for using XIVE in a PowerVM guest. The support is very similar to the native one in a much simpler form. Each source is associated with an Event State Buffer (ESB). This is a two bit state machine which is used to trigger events. The bits are named "P" (pending) and "Q" (queued) and can be controlled by MMIO. The Guest OS registers event (or notifications) queues on which the HW will post event data for a target to notify. Instead of OPAL calls, a set of Hypervisors call are used to configure the interrupt sources and the event/notification queues of the guest: - H_INT_GET_SOURCE_INFO used to obtain the address of the MMIO page of the Event State Buffer (PQ bits) entry associated with the source. - H_INT_SET_SOURCE_CONFIG assigns a source to a "target". - H_INT_GET_SOURCE_CONFIG determines to which "target" and "priority" is assigned to a source - H_INT_GET_QUEUE_INFO returns the address of the notification management page associated with the specified "target" and "priority". - H_INT_SET_QUEUE_CONFIG sets or resets the event queue for a given "target" and "priority". It is also used to set the notification config associated with the queue, only unconditional notification for the moment. Reset is performed with a queue size of 0 and queueing is disabled in that case. - H_INT_GET_QUEUE_CONFIG returns the queue settings for a given "target" and "priority". - H_INT_RESET resets all of the partition's interrupt exploitation structures to their initial state, losing all configuration set via the hcalls H_INT_SET_SOURCE_CONFIG and H_INT_SET_QUEUE_CONFIG. - H_INT_SYNC issue a synchronisation on a source to make sure sure all notifications have reached their queue. As for XICS, the XIVE interface for the guest is described in the device tree under the "interrupt-controller" node. A couple of new properties are specific to XIVE : - "reg" contains the base address and size of the thread interrupt managnement areas (TIMA), also called rings, for the User level and for the Guest OS level. Only the Guest OS level is taken into account today. - "ibm,xive-eq-sizes" the size of the event queues. One cell per size supported, contains log2 of size, in ascending order. - "ibm,xive-lisn-ranges" the interrupt numbers ranges assigned to the guest. These are allocated using a simple bitmap. and also : - "/ibm,plat-res-int-priorities" contains a list of priorities that the hypervisor has reserved for its own use. Tested with a QEMU XIVE model for pseries and with the Power hypervisor. Signed-off-by: Cédric Le Goater <clg@kaod.org> Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-08-30 13:46:11 -06:00
#include <asm/xive.h>
#include <asm/ppc-pci.h>
#include <asm/i8259.h>
#include <asm/udbg.h>
#include <asm/smp.h>
#include <asm/firmware.h>
#include <asm/eeh.h>
#include <asm/reg.h>
#include <asm/plpar_wrappers.h>
#include <asm/kexec.h>
#include <asm/isa-bridge.h>
#include <asm/security_features.h>
#include <asm/asm-const.h>
#include <asm/swiotlb.h>
#include <asm/svm.h>
#include "pseries.h"
#include "../../../../drivers/pci/pci.h"
powerpc/vcpu: Assume dedicated processors as non-preempt commit 14c73bd344da60abaf7da3ea2e7733ddda35bbac upstream. With commit 247f2f6f3c70 ("sched/core: Don't schedule threads on pre-empted vCPUs"), the scheduler avoids preempted vCPUs to schedule tasks on wakeup. This leads to wrong choice of CPU, which in-turn leads to larger wakeup latencies. Eventually, it leads to performance regression in latency sensitive benchmarks like soltp, schbench etc. On Powerpc, vcpu_is_preempted() only looks at yield_count. If the yield_count is odd, the vCPU is assumed to be preempted. However yield_count is increased whenever the LPAR enters CEDE state (idle). So any CPU that has entered CEDE state is assumed to be preempted. Even if vCPU of dedicated LPAR is preempted/donated, it should have right of first-use since they are supposed to own the vCPU. On a Power9 System with 32 cores: # lscpu Architecture: ppc64le Byte Order: Little Endian CPU(s): 128 On-line CPU(s) list: 0-127 Thread(s) per core: 8 Core(s) per socket: 1 Socket(s): 16 NUMA node(s): 2 Model: 2.2 (pvr 004e 0202) Model name: POWER9 (architected), altivec supported Hypervisor vendor: pHyp Virtualization type: para L1d cache: 32K L1i cache: 32K L2 cache: 512K L3 cache: 10240K NUMA node0 CPU(s): 0-63 NUMA node1 CPU(s): 64-127 # perf stat -a -r 5 ./schbench v5.4 v5.4 + patch Latency percentiles (usec) Latency percentiles (usec) 50.0000th: 45 50.0th: 45 75.0000th: 62 75.0th: 63 90.0000th: 71 90.0th: 74 95.0000th: 77 95.0th: 78 *99.0000th: 91 *99.0th: 82 99.5000th: 707 99.5th: 83 99.9000th: 6920 99.9th: 86 min=0, max=10048 min=0, max=96 Latency percentiles (usec) Latency percentiles (usec) 50.0000th: 45 50.0th: 46 75.0000th: 61 75.0th: 64 90.0000th: 72 90.0th: 75 95.0000th: 79 95.0th: 79 *99.0000th: 691 *99.0th: 83 99.5000th: 3972 99.5th: 85 99.9000th: 8368 99.9th: 91 min=0, max=16606 min=0, max=117 Latency percentiles (usec) Latency percentiles (usec) 50.0000th: 45 50.0th: 46 75.0000th: 61 75.0th: 64 90.0000th: 71 90.0th: 75 95.0000th: 77 95.0th: 79 *99.0000th: 106 *99.0th: 83 99.5000th: 2364 99.5th: 84 99.9000th: 7480 99.9th: 90 min=0, max=10001 min=0, max=95 Latency percentiles (usec) Latency percentiles (usec) 50.0000th: 45 50.0th: 47 75.0000th: 62 75.0th: 65 90.0000th: 72 90.0th: 75 95.0000th: 78 95.0th: 79 *99.0000th: 93 *99.0th: 84 99.5000th: 108 99.5th: 85 99.9000th: 6792 99.9th: 90 min=0, max=17681 min=0, max=117 Latency percentiles (usec) Latency percentiles (usec) 50.0000th: 46 50.0th: 45 75.0000th: 62 75.0th: 64 90.0000th: 73 90.0th: 75 95.0000th: 79 95.0th: 79 *99.0000th: 113 *99.0th: 82 99.5000th: 2724 99.5th: 83 99.9000th: 6184 99.9th: 93 min=0, max=9887 min=0, max=111 Performance counter stats for 'system wide' (5 runs): context-switches 43,373 ( +- 0.40% ) 44,597 ( +- 0.55% ) cpu-migrations 1,211 ( +- 5.04% ) 220 ( +- 6.23% ) page-faults 15,983 ( +- 5.21% ) 15,360 ( +- 3.38% ) Waiman Long suggested using static_keys. Fixes: 247f2f6f3c70 ("sched/core: Don't schedule threads on pre-empted vCPUs") Cc: stable@vger.kernel.org # v4.18+ Reported-by: Parth Shah <parth@linux.ibm.com> Reported-by: Ihor Pasichnyk <Ihor.Pasichnyk@ibm.com> Tested-by: Juri Lelli <juri.lelli@redhat.com> Acked-by: Waiman Long <longman@redhat.com> Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Acked-by: Phil Auld <pauld@redhat.com> Reviewed-by: Vaidyanathan Srinivasan <svaidy@linux.ibm.com> Tested-by: Parth Shah <parth@linux.ibm.com> [mpe: Move the key and setting of the key to pseries/setup.c] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20191213035036.6913-1-mpe@ellerman.id.au Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-12-05 01:32:17 -07:00
DEFINE_STATIC_KEY_FALSE(shared_processor);
EXPORT_SYMBOL_GPL(shared_processor);
int CMO_PrPSP = -1;
int CMO_SecPSP = -1;
unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K);
EXPORT_SYMBOL(CMO_PageSize);
int fwnmi_active; /* TRUE if an FWNMI handler is present */
static void pSeries_show_cpuinfo(struct seq_file *m)
{
struct device_node *root;
const char *model = "";
root = of_find_node_by_path("/");
if (root)
model = of_get_property(root, "model", NULL);
seq_printf(m, "machine\t\t: CHRP %s\n", model);
of_node_put(root);
if (radix_enabled())
seq_printf(m, "MMU\t\t: Radix\n");
else
seq_printf(m, "MMU\t\t: Hash\n");
}
/* Initialize firmware assisted non-maskable interrupts if
* the firmware supports this feature.
*/
static void __init fwnmi_init(void)
{
unsigned long system_reset_addr, machine_check_addr;
powerpc/pseries: Defer the logging of rtas error to irq work queue. rtas_log_buf is a buffer to hold RTAS event data that are communicated to kernel by hypervisor. This buffer is then used to pass RTAS event data to user through proc fs. This buffer is allocated from vmalloc (non-linear mapping) area. On Machine check interrupt, register r3 points to RTAS extended event log passed by hypervisor that contains the MCE event. The pseries machine check handler then logs this error into rtas_log_buf. The rtas_log_buf is a vmalloc-ed (non-linear) buffer we end up taking up a page fault (vector 0x300) while accessing it. Since machine check interrupt handler runs in NMI context we can not afford to take any page fault. Page faults are not honored in NMI context and causes kernel panic. Apart from that, as Nick pointed out, pSeries_log_error() also takes a spin_lock while logging error which is not safe in NMI context. It may endup in deadlock if we get another MCE before releasing the lock. Fix this by deferring the logging of rtas error to irq work queue. Current implementation uses two different buffers to hold rtas error log depending on whether extended log is provided or not. This makes bit difficult to identify which buffer has valid data that needs to logged later in irq work. Simplify this using single buffer, one per paca, and copy rtas log to it irrespective of whether extended log is provided or not. Allocate this buffer below RMA region so that it can be accessed in real mode mce handler. Fixes: b96672dd840f ("powerpc: Machine check interrupt is a non-maskable interrupt") Cc: stable@vger.kernel.org # v4.14+ Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-07-04 11:57:21 -06:00
u8 *mce_data_buf;
unsigned int i;
int nr_cpus = num_possible_cpus();
powerpc/pseries: Dump the SLB contents on SLB MCE errors. If we get a machine check exceptions due to SLB errors then dump the current SLB contents which will be very much helpful in debugging the root cause of SLB errors. Introduce an exclusive buffer per cpu to hold faulty SLB entries. In real mode mce handler saves the old SLB contents into this buffer accessible through paca and print it out later in virtual mode. With this patch the console will log SLB contents like below on SLB MCE errors: [ 507.297236] SLB contents of cpu 0x1 [ 507.297237] Last SLB entry inserted at slot 16 [ 507.297238] 00 c000000008000000 400ea1b217000500 [ 507.297239] 1T ESID= c00000 VSID= ea1b217 LLP:100 [ 507.297240] 01 d000000008000000 400d43642f000510 [ 507.297242] 1T ESID= d00000 VSID= d43642f LLP:110 [ 507.297243] 11 f000000008000000 400a86c85f000500 [ 507.297244] 1T ESID= f00000 VSID= a86c85f LLP:100 [ 507.297245] 12 00007f0008000000 4008119624000d90 [ 507.297246] 1T ESID= 7f VSID= 8119624 LLP:110 [ 507.297247] 13 0000000018000000 00092885f5150d90 [ 507.297247] 256M ESID= 1 VSID= 92885f5150 LLP:110 [ 507.297248] 14 0000010008000000 4009e7cb50000d90 [ 507.297249] 1T ESID= 1 VSID= 9e7cb50 LLP:110 [ 507.297250] 15 d000000008000000 400d43642f000510 [ 507.297251] 1T ESID= d00000 VSID= d43642f LLP:110 [ 507.297252] 16 d000000008000000 400d43642f000510 [ 507.297253] 1T ESID= d00000 VSID= d43642f LLP:110 [ 507.297253] ---------------------------------- [ 507.297254] SLB cache ptr value = 3 [ 507.297254] Valid SLB cache entries: [ 507.297255] 00 EA[0-35]= 7f000 [ 507.297256] 01 EA[0-35]= 1 [ 507.297257] 02 EA[0-35]= 1000 [ 507.297257] Rest of SLB cache entries: [ 507.297258] 03 EA[0-35]= 7f000 [ 507.297258] 04 EA[0-35]= 1 [ 507.297259] 05 EA[0-35]= 1000 [ 507.297260] 06 EA[0-35]= 12 [ 507.297260] 07 EA[0-35]= 7f000 Suggested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Suggested-by: Michael Ellerman <mpe@ellerman.id.au> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-09-11 08:27:15 -06:00
#ifdef CONFIG_PPC_BOOK3S_64
struct slb_entry *slb_ptr;
size_t size;
#endif
int ibm_nmi_register = rtas_token("ibm,nmi-register");
if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE)
return;
/* If the kernel's not linked at zero we point the firmware at low
* addresses anyway, and use a trampoline to get to the real code. */
system_reset_addr = __pa(system_reset_fwnmi) - PHYSICAL_START;
machine_check_addr = __pa(machine_check_fwnmi) - PHYSICAL_START;
if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr,
machine_check_addr))
fwnmi_active = 1;
powerpc/pseries: Defer the logging of rtas error to irq work queue. rtas_log_buf is a buffer to hold RTAS event data that are communicated to kernel by hypervisor. This buffer is then used to pass RTAS event data to user through proc fs. This buffer is allocated from vmalloc (non-linear mapping) area. On Machine check interrupt, register r3 points to RTAS extended event log passed by hypervisor that contains the MCE event. The pseries machine check handler then logs this error into rtas_log_buf. The rtas_log_buf is a vmalloc-ed (non-linear) buffer we end up taking up a page fault (vector 0x300) while accessing it. Since machine check interrupt handler runs in NMI context we can not afford to take any page fault. Page faults are not honored in NMI context and causes kernel panic. Apart from that, as Nick pointed out, pSeries_log_error() also takes a spin_lock while logging error which is not safe in NMI context. It may endup in deadlock if we get another MCE before releasing the lock. Fix this by deferring the logging of rtas error to irq work queue. Current implementation uses two different buffers to hold rtas error log depending on whether extended log is provided or not. This makes bit difficult to identify which buffer has valid data that needs to logged later in irq work. Simplify this using single buffer, one per paca, and copy rtas log to it irrespective of whether extended log is provided or not. Allocate this buffer below RMA region so that it can be accessed in real mode mce handler. Fixes: b96672dd840f ("powerpc: Machine check interrupt is a non-maskable interrupt") Cc: stable@vger.kernel.org # v4.14+ Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-07-04 11:57:21 -06:00
/*
* Allocate a chunk for per cpu buffer to hold rtas errorlog.
* It will be used in real mode mce handler, hence it needs to be
* below RMA.
*/
powerpc: prefer memblock APIs returning virtual address Patch series "memblock: simplify several early memory allocation", v4. These patches simplify some of the early memory allocations by replacing usage of older memblock APIs with newer and shinier ones. Quite a few places in the arch/ code allocated memory using a memblock API that returns a physical address of the allocated area, then converted this physical address to a virtual one and then used memset(0) to clear the allocated range. More recent memblock APIs do all the three steps in one call and their usage simplifies the code. It's important to note that regardless of API used, the core allocation is nearly identical for any set of memblock allocators: first it tries to find a free memory with all the constraints specified by the caller and then falls back to the allocation with some or all constraints disabled. The first three patches perform the conversion of call sites that have exact requirements for the node and the possible memory range. The fourth patch is a bit one-off as it simplifies openrisc's implementation of pte_alloc_one_kernel(), and not only the memblock usage. The fifth patch takes care of simpler cases when the allocation can be satisfied with a simple call to memblock_alloc(). The sixth patch removes one-liner wrappers for memblock_alloc on arm and unicore32, as suggested by Christoph. This patch (of 6): There are a several places that allocate memory using memblock APIs that return a physical address, convert the returned address to the virtual address and frequently also memset(0) the allocated range. Update these places to use memblock allocators already returning a virtual address. Use memblock functions that clear the allocated memory instead of calling memset(0) where appropriate. The calls to memblock_alloc_base() that were not followed by memset(0) are replaced with memblock_alloc_try_nid_raw(). Since the latter does not panic() when the allocation fails, the appropriate panic() calls are added to the call sites. Link: http://lkml.kernel.org/r/1546248566-14910-2-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport <rppt@linux.ibm.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: "David S. Miller" <davem@davemloft.net> Cc: Guan Xuetao <gxt@pku.edu.cn> Cc: Greentime Hu <green.hu@gmail.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Jonas Bonn <jonas@southpole.se> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Michal Hocko <mhocko@suse.com> Cc: Michal Simek <monstr@monstr.eu> Cc: Mark Salter <msalter@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Rich Felker <dalias@libc.org> Cc: Russell King <linux@armlinux.org.uk> Cc: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi> Cc: Stafford Horne <shorne@gmail.com> Cc: Vincent Chen <deanbo422@gmail.com> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Cc: Christoph Hellwig <hch@infradead.org> Cc: Michal Simek <michal.simek@xilinx.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-03-07 17:30:48 -07:00
mce_data_buf = memblock_alloc_try_nid_raw(RTAS_ERROR_LOG_MAX * nr_cpus,
RTAS_ERROR_LOG_MAX, MEMBLOCK_LOW_LIMIT,
ppc64_rma_size, NUMA_NO_NODE);
if (!mce_data_buf)
panic("Failed to allocate %d bytes below %pa for MCE buffer\n",
RTAS_ERROR_LOG_MAX * nr_cpus, &ppc64_rma_size);
powerpc/pseries: Defer the logging of rtas error to irq work queue. rtas_log_buf is a buffer to hold RTAS event data that are communicated to kernel by hypervisor. This buffer is then used to pass RTAS event data to user through proc fs. This buffer is allocated from vmalloc (non-linear mapping) area. On Machine check interrupt, register r3 points to RTAS extended event log passed by hypervisor that contains the MCE event. The pseries machine check handler then logs this error into rtas_log_buf. The rtas_log_buf is a vmalloc-ed (non-linear) buffer we end up taking up a page fault (vector 0x300) while accessing it. Since machine check interrupt handler runs in NMI context we can not afford to take any page fault. Page faults are not honored in NMI context and causes kernel panic. Apart from that, as Nick pointed out, pSeries_log_error() also takes a spin_lock while logging error which is not safe in NMI context. It may endup in deadlock if we get another MCE before releasing the lock. Fix this by deferring the logging of rtas error to irq work queue. Current implementation uses two different buffers to hold rtas error log depending on whether extended log is provided or not. This makes bit difficult to identify which buffer has valid data that needs to logged later in irq work. Simplify this using single buffer, one per paca, and copy rtas log to it irrespective of whether extended log is provided or not. Allocate this buffer below RMA region so that it can be accessed in real mode mce handler. Fixes: b96672dd840f ("powerpc: Machine check interrupt is a non-maskable interrupt") Cc: stable@vger.kernel.org # v4.14+ Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-07-04 11:57:21 -06:00
for_each_possible_cpu(i) {
paca_ptrs[i]->mce_data_buf = mce_data_buf +
(RTAS_ERROR_LOG_MAX * i);
}
powerpc/pseries: Dump the SLB contents on SLB MCE errors. If we get a machine check exceptions due to SLB errors then dump the current SLB contents which will be very much helpful in debugging the root cause of SLB errors. Introduce an exclusive buffer per cpu to hold faulty SLB entries. In real mode mce handler saves the old SLB contents into this buffer accessible through paca and print it out later in virtual mode. With this patch the console will log SLB contents like below on SLB MCE errors: [ 507.297236] SLB contents of cpu 0x1 [ 507.297237] Last SLB entry inserted at slot 16 [ 507.297238] 00 c000000008000000 400ea1b217000500 [ 507.297239] 1T ESID= c00000 VSID= ea1b217 LLP:100 [ 507.297240] 01 d000000008000000 400d43642f000510 [ 507.297242] 1T ESID= d00000 VSID= d43642f LLP:110 [ 507.297243] 11 f000000008000000 400a86c85f000500 [ 507.297244] 1T ESID= f00000 VSID= a86c85f LLP:100 [ 507.297245] 12 00007f0008000000 4008119624000d90 [ 507.297246] 1T ESID= 7f VSID= 8119624 LLP:110 [ 507.297247] 13 0000000018000000 00092885f5150d90 [ 507.297247] 256M ESID= 1 VSID= 92885f5150 LLP:110 [ 507.297248] 14 0000010008000000 4009e7cb50000d90 [ 507.297249] 1T ESID= 1 VSID= 9e7cb50 LLP:110 [ 507.297250] 15 d000000008000000 400d43642f000510 [ 507.297251] 1T ESID= d00000 VSID= d43642f LLP:110 [ 507.297252] 16 d000000008000000 400d43642f000510 [ 507.297253] 1T ESID= d00000 VSID= d43642f LLP:110 [ 507.297253] ---------------------------------- [ 507.297254] SLB cache ptr value = 3 [ 507.297254] Valid SLB cache entries: [ 507.297255] 00 EA[0-35]= 7f000 [ 507.297256] 01 EA[0-35]= 1 [ 507.297257] 02 EA[0-35]= 1000 [ 507.297257] Rest of SLB cache entries: [ 507.297258] 03 EA[0-35]= 7f000 [ 507.297258] 04 EA[0-35]= 1 [ 507.297259] 05 EA[0-35]= 1000 [ 507.297260] 06 EA[0-35]= 12 [ 507.297260] 07 EA[0-35]= 7f000 Suggested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Suggested-by: Michael Ellerman <mpe@ellerman.id.au> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-09-11 08:27:15 -06:00
#ifdef CONFIG_PPC_BOOK3S_64
if (!radix_enabled()) {
/* Allocate per cpu area to save old slb contents during MCE */
size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus;
slb_ptr = memblock_alloc_try_nid_raw(size,
sizeof(struct slb_entry), MEMBLOCK_LOW_LIMIT,
ppc64_rma_size, NUMA_NO_NODE);
if (!slb_ptr)
panic("Failed to allocate %zu bytes below %pa for slb area\n",
size, &ppc64_rma_size);
for_each_possible_cpu(i)
paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i);
}
powerpc/pseries: Dump the SLB contents on SLB MCE errors. If we get a machine check exceptions due to SLB errors then dump the current SLB contents which will be very much helpful in debugging the root cause of SLB errors. Introduce an exclusive buffer per cpu to hold faulty SLB entries. In real mode mce handler saves the old SLB contents into this buffer accessible through paca and print it out later in virtual mode. With this patch the console will log SLB contents like below on SLB MCE errors: [ 507.297236] SLB contents of cpu 0x1 [ 507.297237] Last SLB entry inserted at slot 16 [ 507.297238] 00 c000000008000000 400ea1b217000500 [ 507.297239] 1T ESID= c00000 VSID= ea1b217 LLP:100 [ 507.297240] 01 d000000008000000 400d43642f000510 [ 507.297242] 1T ESID= d00000 VSID= d43642f LLP:110 [ 507.297243] 11 f000000008000000 400a86c85f000500 [ 507.297244] 1T ESID= f00000 VSID= a86c85f LLP:100 [ 507.297245] 12 00007f0008000000 4008119624000d90 [ 507.297246] 1T ESID= 7f VSID= 8119624 LLP:110 [ 507.297247] 13 0000000018000000 00092885f5150d90 [ 507.297247] 256M ESID= 1 VSID= 92885f5150 LLP:110 [ 507.297248] 14 0000010008000000 4009e7cb50000d90 [ 507.297249] 1T ESID= 1 VSID= 9e7cb50 LLP:110 [ 507.297250] 15 d000000008000000 400d43642f000510 [ 507.297251] 1T ESID= d00000 VSID= d43642f LLP:110 [ 507.297252] 16 d000000008000000 400d43642f000510 [ 507.297253] 1T ESID= d00000 VSID= d43642f LLP:110 [ 507.297253] ---------------------------------- [ 507.297254] SLB cache ptr value = 3 [ 507.297254] Valid SLB cache entries: [ 507.297255] 00 EA[0-35]= 7f000 [ 507.297256] 01 EA[0-35]= 1 [ 507.297257] 02 EA[0-35]= 1000 [ 507.297257] Rest of SLB cache entries: [ 507.297258] 03 EA[0-35]= 7f000 [ 507.297258] 04 EA[0-35]= 1 [ 507.297259] 05 EA[0-35]= 1000 [ 507.297260] 06 EA[0-35]= 12 [ 507.297260] 07 EA[0-35]= 7f000 Suggested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Suggested-by: Michael Ellerman <mpe@ellerman.id.au> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-09-11 08:27:15 -06:00
#endif
}
static void pseries_8259_cascade(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
unsigned int cascade_irq = i8259_irq();
if (cascade_irq)
IRQ: Maintain regs pointer globally rather than passing to IRQ handlers Maintain a per-CPU global "struct pt_regs *" variable which can be used instead of passing regs around manually through all ~1800 interrupt handlers in the Linux kernel. The regs pointer is used in few places, but it potentially costs both stack space and code to pass it around. On the FRV arch, removing the regs parameter from all the genirq function results in a 20% speed up of the IRQ exit path (ie: from leaving timer_interrupt() to leaving do_IRQ()). Where appropriate, an arch may override the generic storage facility and do something different with the variable. On FRV, for instance, the address is maintained in GR28 at all times inside the kernel as part of general exception handling. Having looked over the code, it appears that the parameter may be handed down through up to twenty or so layers of functions. Consider a USB character device attached to a USB hub, attached to a USB controller that posts its interrupts through a cascaded auxiliary interrupt controller. A character device driver may want to pass regs to the sysrq handler through the input layer which adds another few layers of parameter passing. I've build this code with allyesconfig for x86_64 and i386. I've runtested the main part of the code on FRV and i386, though I can't test most of the drivers. I've also done partial conversion for powerpc and MIPS - these at least compile with minimal configurations. This will affect all archs. Mostly the changes should be relatively easy. Take do_IRQ(), store the regs pointer at the beginning, saving the old one: struct pt_regs *old_regs = set_irq_regs(regs); And put the old one back at the end: set_irq_regs(old_regs); Don't pass regs through to generic_handle_irq() or __do_IRQ(). In timer_interrupt(), this sort of change will be necessary: - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING, regs); + update_process_times(user_mode(get_irq_regs())); + profile_tick(CPU_PROFILING); I'd like to move update_process_times()'s use of get_irq_regs() into itself, except that i386, alone of the archs, uses something other than user_mode(). Some notes on the interrupt handling in the drivers: (*) input_dev() is now gone entirely. The regs pointer is no longer stored in the input_dev struct. (*) finish_unlinks() in drivers/usb/host/ohci-q.c needs checking. It does something different depending on whether it's been supplied with a regs pointer or not. (*) Various IRQ handler function pointers have been moved to type irq_handler_t. Signed-Off-By: David Howells <dhowells@redhat.com> (cherry picked from 1b16e7ac850969f38b375e511e3fa2f474a33867 commit)
2006-10-05 07:55:46 -06:00
generic_handle_irq(cascade_irq);
chip->irq_eoi(&desc->irq_data);
}
static void __init pseries_setup_i8259_cascade(void)
{
struct device_node *np, *old, *found = NULL;
unsigned int cascade;
const u32 *addrp;
unsigned long intack = 0;
int naddr;
for_each_node_by_type(np, "interrupt-controller") {
if (of_device_is_compatible(np, "chrp,iic")) {
found = np;
break;
}
}
if (found == NULL) {
printk(KERN_DEBUG "pic: no ISA interrupt controller\n");
return;
}
cascade = irq_of_parse_and_map(found, 0);
if (!cascade) {
printk(KERN_ERR "pic: failed to map cascade interrupt");
return;
}
pr_debug("pic: cascade mapped to irq %d\n", cascade);
for (old = of_node_get(found); old != NULL ; old = np) {
np = of_get_parent(old);
of_node_put(old);
if (np == NULL)
break;
if (!of_node_name_eq(np, "pci"))
continue;
addrp = of_get_property(np, "8259-interrupt-acknowledge", NULL);
if (addrp == NULL)
continue;
naddr = of_n_addr_cells(np);
intack = addrp[naddr-1];
if (naddr > 1)
intack |= ((unsigned long)addrp[naddr-2]) << 32;
}
if (intack)
printk(KERN_DEBUG "pic: PCI 8259 intack at 0x%016lx\n", intack);
i8259_init(found, intack);
of_node_put(found);
irq_set_chained_handler(cascade, pseries_8259_cascade);
}
static void __init pseries_init_irq(void)
{
powerpc/xive: guest exploitation of the XIVE interrupt controller This is the framework for using XIVE in a PowerVM guest. The support is very similar to the native one in a much simpler form. Each source is associated with an Event State Buffer (ESB). This is a two bit state machine which is used to trigger events. The bits are named "P" (pending) and "Q" (queued) and can be controlled by MMIO. The Guest OS registers event (or notifications) queues on which the HW will post event data for a target to notify. Instead of OPAL calls, a set of Hypervisors call are used to configure the interrupt sources and the event/notification queues of the guest: - H_INT_GET_SOURCE_INFO used to obtain the address of the MMIO page of the Event State Buffer (PQ bits) entry associated with the source. - H_INT_SET_SOURCE_CONFIG assigns a source to a "target". - H_INT_GET_SOURCE_CONFIG determines to which "target" and "priority" is assigned to a source - H_INT_GET_QUEUE_INFO returns the address of the notification management page associated with the specified "target" and "priority". - H_INT_SET_QUEUE_CONFIG sets or resets the event queue for a given "target" and "priority". It is also used to set the notification config associated with the queue, only unconditional notification for the moment. Reset is performed with a queue size of 0 and queueing is disabled in that case. - H_INT_GET_QUEUE_CONFIG returns the queue settings for a given "target" and "priority". - H_INT_RESET resets all of the partition's interrupt exploitation structures to their initial state, losing all configuration set via the hcalls H_INT_SET_SOURCE_CONFIG and H_INT_SET_QUEUE_CONFIG. - H_INT_SYNC issue a synchronisation on a source to make sure sure all notifications have reached their queue. As for XICS, the XIVE interface for the guest is described in the device tree under the "interrupt-controller" node. A couple of new properties are specific to XIVE : - "reg" contains the base address and size of the thread interrupt managnement areas (TIMA), also called rings, for the User level and for the Guest OS level. Only the Guest OS level is taken into account today. - "ibm,xive-eq-sizes" the size of the event queues. One cell per size supported, contains log2 of size, in ascending order. - "ibm,xive-lisn-ranges" the interrupt numbers ranges assigned to the guest. These are allocated using a simple bitmap. and also : - "/ibm,plat-res-int-priorities" contains a list of priorities that the hypervisor has reserved for its own use. Tested with a QEMU XIVE model for pseries and with the Power hypervisor. Signed-off-by: Cédric Le Goater <clg@kaod.org> Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-08-30 13:46:11 -06:00
/* Try using a XIVE if available, otherwise use a XICS */
if (!xive_spapr_init()) {
xics_init();
pseries_setup_i8259_cascade();
}
}
static void pseries_lpar_enable_pmcs(void)
{
unsigned long set, reset;
set = 1UL << 63;
reset = 0;
plpar_hcall_norets(H_PERFMON, set, reset);
}
static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *data)
{
struct of_reconfig_data *rd = data;
struct device_node *parent, *np = rd->dn;
struct pci_dn *pdn;
int err = NOTIFY_OK;
switch (action) {
case OF_RECONFIG_ATTACH_NODE:
parent = of_get_parent(np);
pdn = parent ? PCI_DN(parent) : NULL;
if (pdn)
pci_add_device_node_info(pdn->phb, np);
of_node_put(parent);
break;
case OF_RECONFIG_DETACH_NODE:
pdn = PCI_DN(np);
if (pdn)
list_del(&pdn->list);
break;
default:
err = NOTIFY_DONE;
break;
}
return err;
}
static struct notifier_block pci_dn_reconfig_nb = {
.notifier_call = pci_dn_reconfig_notifier,
};
struct kmem_cache *dtl_cache;
cputime: Generic on-demand virtual cputime accounting If we want to stop the tick further idle, we need to be able to account the cputime without using the tick. Virtual based cputime accounting solves that problem by hooking into kernel/user boundaries. However implementing CONFIG_VIRT_CPU_ACCOUNTING require low level hooks and involves more overhead. But we already have a generic context tracking subsystem that is required for RCU needs by archs which plan to shut down the tick outside idle. This patch implements a generic virtual based cputime accounting that relies on these generic kernel/user hooks. There are some upsides of doing this: - This requires no arch code to implement CONFIG_VIRT_CPU_ACCOUNTING if context tracking is already built (already necessary for RCU in full tickless mode). - We can rely on the generic context tracking subsystem to dynamically (de)activate the hooks, so that we can switch anytime between virtual and tick based accounting. This way we don't have the overhead of the virtual accounting when the tick is running periodically. And one downside: - There is probably more overhead than a native virtual based cputime accounting. But this relies on hooks that are already set anyway. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Li Zhong <zhong@linux.vnet.ibm.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Gleixner <tglx@linutronix.de>
2012-07-24 23:56:04 -06:00
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
powerpc: Account time using timebase rather than PURR Currently, when CONFIG_VIRT_CPU_ACCOUNTING is enabled, we use the PURR register for measuring the user and system time used by processes, as well as other related times such as hardirq and softirq times. This turns out to be quite confusing for users because it means that a program will often be measured as taking less time when run on a multi-threaded processor (SMT2 or SMT4 mode) than it does when run on a single-threaded processor (ST mode), even though the program takes longer to finish. The discrepancy is accounted for as stolen time, which is also confusing, particularly when there are no other partitions running. This changes the accounting to use the timebase instead, meaning that the reported user and system times are the actual number of real-time seconds that the program was executing on the processor thread, regardless of which SMT mode the processor is in. Thus a program will generally show greater user and system times when run on a multi-threaded processor than on a single-threaded processor. On pSeries systems on POWER5 or later processors, we measure the stolen time (time when this partition wasn't running) using the hypervisor dispatch trace log. We check for new entries in the log on every entry from user mode and on every transition from kernel process context to soft or hard IRQ context (i.e. when account_system_vtime() gets called). So that we can correctly distinguish time stolen from user time and time stolen from system time, without having to check the log on every exit to user mode, we store separate timestamps for exit to user mode and entry from user mode. On systems that have a SPURR (POWER6 and POWER7), we read the SPURR in account_system_vtime() (as before), and then apportion the SPURR ticks since the last time we read it between scaled user time and scaled system time according to the relative proportions of user time and system time over the same interval. This avoids having to read the SPURR on every kernel entry and exit. On systems that have PURR but not SPURR (i.e., POWER5), we do the same using the PURR rather than the SPURR. This disables the DTL user interface in /sys/debug/kernel/powerpc/dtl for now since it conflicts with the use of the dispatch trace log by the time accounting code. Signed-off-by: Paul Mackerras <paulus@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2010-08-26 13:56:43 -06:00
/*
* Allocate space for the dispatch trace log for all possible cpus
* and register the buffers with the hypervisor. This is used for
* computing time stolen by the hypervisor.
*/
static int alloc_dispatch_logs(void)
{
if (!firmware_has_feature(FW_FEATURE_SPLPAR))
return 0;
if (!dtl_cache)
return 0;
alloc_dtl_buffers(0);
powerpc: Account time using timebase rather than PURR Currently, when CONFIG_VIRT_CPU_ACCOUNTING is enabled, we use the PURR register for measuring the user and system time used by processes, as well as other related times such as hardirq and softirq times. This turns out to be quite confusing for users because it means that a program will often be measured as taking less time when run on a multi-threaded processor (SMT2 or SMT4 mode) than it does when run on a single-threaded processor (ST mode), even though the program takes longer to finish. The discrepancy is accounted for as stolen time, which is also confusing, particularly when there are no other partitions running. This changes the accounting to use the timebase instead, meaning that the reported user and system times are the actual number of real-time seconds that the program was executing on the processor thread, regardless of which SMT mode the processor is in. Thus a program will generally show greater user and system times when run on a multi-threaded processor than on a single-threaded processor. On pSeries systems on POWER5 or later processors, we measure the stolen time (time when this partition wasn't running) using the hypervisor dispatch trace log. We check for new entries in the log on every entry from user mode and on every transition from kernel process context to soft or hard IRQ context (i.e. when account_system_vtime() gets called). So that we can correctly distinguish time stolen from user time and time stolen from system time, without having to check the log on every exit to user mode, we store separate timestamps for exit to user mode and entry from user mode. On systems that have a SPURR (POWER6 and POWER7), we read the SPURR in account_system_vtime() (as before), and then apportion the SPURR ticks since the last time we read it between scaled user time and scaled system time according to the relative proportions of user time and system time over the same interval. This avoids having to read the SPURR on every kernel entry and exit. On systems that have PURR but not SPURR (i.e., POWER5), we do the same using the PURR rather than the SPURR. This disables the DTL user interface in /sys/debug/kernel/powerpc/dtl for now since it conflicts with the use of the dispatch trace log by the time accounting code. Signed-off-by: Paul Mackerras <paulus@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2010-08-26 13:56:43 -06:00
/* Register the DTL for the current (boot) cpu */
register_dtl_buffer(smp_processor_id());
powerpc: Account time using timebase rather than PURR Currently, when CONFIG_VIRT_CPU_ACCOUNTING is enabled, we use the PURR register for measuring the user and system time used by processes, as well as other related times such as hardirq and softirq times. This turns out to be quite confusing for users because it means that a program will often be measured as taking less time when run on a multi-threaded processor (SMT2 or SMT4 mode) than it does when run on a single-threaded processor (ST mode), even though the program takes longer to finish. The discrepancy is accounted for as stolen time, which is also confusing, particularly when there are no other partitions running. This changes the accounting to use the timebase instead, meaning that the reported user and system times are the actual number of real-time seconds that the program was executing on the processor thread, regardless of which SMT mode the processor is in. Thus a program will generally show greater user and system times when run on a multi-threaded processor than on a single-threaded processor. On pSeries systems on POWER5 or later processors, we measure the stolen time (time when this partition wasn't running) using the hypervisor dispatch trace log. We check for new entries in the log on every entry from user mode and on every transition from kernel process context to soft or hard IRQ context (i.e. when account_system_vtime() gets called). So that we can correctly distinguish time stolen from user time and time stolen from system time, without having to check the log on every exit to user mode, we store separate timestamps for exit to user mode and entry from user mode. On systems that have a SPURR (POWER6 and POWER7), we read the SPURR in account_system_vtime() (as before), and then apportion the SPURR ticks since the last time we read it between scaled user time and scaled system time according to the relative proportions of user time and system time over the same interval. This avoids having to read the SPURR on every kernel entry and exit. On systems that have PURR but not SPURR (i.e., POWER5), we do the same using the PURR rather than the SPURR. This disables the DTL user interface in /sys/debug/kernel/powerpc/dtl for now since it conflicts with the use of the dispatch trace log by the time accounting code. Signed-off-by: Paul Mackerras <paulus@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2010-08-26 13:56:43 -06:00
return 0;
}
cputime: Generic on-demand virtual cputime accounting If we want to stop the tick further idle, we need to be able to account the cputime without using the tick. Virtual based cputime accounting solves that problem by hooking into kernel/user boundaries. However implementing CONFIG_VIRT_CPU_ACCOUNTING require low level hooks and involves more overhead. But we already have a generic context tracking subsystem that is required for RCU needs by archs which plan to shut down the tick outside idle. This patch implements a generic virtual based cputime accounting that relies on these generic kernel/user hooks. There are some upsides of doing this: - This requires no arch code to implement CONFIG_VIRT_CPU_ACCOUNTING if context tracking is already built (already necessary for RCU in full tickless mode). - We can rely on the generic context tracking subsystem to dynamically (de)activate the hooks, so that we can switch anytime between virtual and tick based accounting. This way we don't have the overhead of the virtual accounting when the tick is running periodically. And one downside: - There is probably more overhead than a native virtual based cputime accounting. But this relies on hooks that are already set anyway. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Li Zhong <zhong@linux.vnet.ibm.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Gleixner <tglx@linutronix.de>
2012-07-24 23:56:04 -06:00
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
static inline int alloc_dispatch_logs(void)
{
return 0;
}
cputime: Generic on-demand virtual cputime accounting If we want to stop the tick further idle, we need to be able to account the cputime without using the tick. Virtual based cputime accounting solves that problem by hooking into kernel/user boundaries. However implementing CONFIG_VIRT_CPU_ACCOUNTING require low level hooks and involves more overhead. But we already have a generic context tracking subsystem that is required for RCU needs by archs which plan to shut down the tick outside idle. This patch implements a generic virtual based cputime accounting that relies on these generic kernel/user hooks. There are some upsides of doing this: - This requires no arch code to implement CONFIG_VIRT_CPU_ACCOUNTING if context tracking is already built (already necessary for RCU in full tickless mode). - We can rely on the generic context tracking subsystem to dynamically (de)activate the hooks, so that we can switch anytime between virtual and tick based accounting. This way we don't have the overhead of the virtual accounting when the tick is running periodically. And one downside: - There is probably more overhead than a native virtual based cputime accounting. But this relies on hooks that are already set anyway. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Li Zhong <zhong@linux.vnet.ibm.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Gleixner <tglx@linutronix.de>
2012-07-24 23:56:04 -06:00
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
powerpc: Account time using timebase rather than PURR Currently, when CONFIG_VIRT_CPU_ACCOUNTING is enabled, we use the PURR register for measuring the user and system time used by processes, as well as other related times such as hardirq and softirq times. This turns out to be quite confusing for users because it means that a program will often be measured as taking less time when run on a multi-threaded processor (SMT2 or SMT4 mode) than it does when run on a single-threaded processor (ST mode), even though the program takes longer to finish. The discrepancy is accounted for as stolen time, which is also confusing, particularly when there are no other partitions running. This changes the accounting to use the timebase instead, meaning that the reported user and system times are the actual number of real-time seconds that the program was executing on the processor thread, regardless of which SMT mode the processor is in. Thus a program will generally show greater user and system times when run on a multi-threaded processor than on a single-threaded processor. On pSeries systems on POWER5 or later processors, we measure the stolen time (time when this partition wasn't running) using the hypervisor dispatch trace log. We check for new entries in the log on every entry from user mode and on every transition from kernel process context to soft or hard IRQ context (i.e. when account_system_vtime() gets called). So that we can correctly distinguish time stolen from user time and time stolen from system time, without having to check the log on every exit to user mode, we store separate timestamps for exit to user mode and entry from user mode. On systems that have a SPURR (POWER6 and POWER7), we read the SPURR in account_system_vtime() (as before), and then apportion the SPURR ticks since the last time we read it between scaled user time and scaled system time according to the relative proportions of user time and system time over the same interval. This avoids having to read the SPURR on every kernel entry and exit. On systems that have PURR but not SPURR (i.e., POWER5), we do the same using the PURR rather than the SPURR. This disables the DTL user interface in /sys/debug/kernel/powerpc/dtl for now since it conflicts with the use of the dispatch trace log by the time accounting code. Signed-off-by: Paul Mackerras <paulus@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2010-08-26 13:56:43 -06:00
static int alloc_dispatch_log_kmem_cache(void)
{
void (*ctor)(void *) = get_dtl_cache_ctor();
dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES,
DISPATCH_LOG_BYTES, 0, ctor);
if (!dtl_cache) {
pr_warn("Failed to create dispatch trace log buffer cache\n");
pr_warn("Stolen time statistics will be unreliable\n");
return 0;
}
return alloc_dispatch_logs();
}
machine_early_initcall(pseries, alloc_dispatch_log_kmem_cache);
static void pseries_lpar_idle(void)
{
sched/idle, PPC: Remove redundant cpuidle_idle_call() The core idle loop now takes care of it. However a few things need checking: - Invocation of cpuidle_idle_call() in pseries_lpar_idle() happened through arch_cpu_idle() and was therefore always preceded by a call to ppc64_runlatch_off(). To preserve this property now that cpuidle_idle_call() is invoked directly from core code, a call to ppc64_runlatch_off() has been added to idle_loop_prolog() in platforms/pseries/processor_idle.c. - Similarly, cpuidle_idle_call() was followed by ppc64_runlatch_off() so a call to the later has been added to idle_loop_epilog(). - And since arch_cpu_idle() always made sure to re-enable IRQs if they were not enabled, this is now done in idle_loop_epilog() as well. The above was made in order to keep the execution flow close to the original. I don't know if that was strictly necessary. Someone well aquainted with the platform details might find some room for possible optimizations. Signed-off-by: Nicolas Pitre <nico@linaro.org> Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> Cc: Daniel Lezcano <daniel.lezcano@linaro.org> Cc: linux-arm-kernel@lists.infradead.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-sh@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: Russell King <linux@arm.linux.org.uk> Cc: linaro-kernel@lists.linaro.org Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/n/tip-47o4m03citrfg9y1vxic5asb@git.kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-01-29 10:45:10 -07:00
/*
* Default handler to go into low thread priority and possibly
* low power mode by ceding processor to hypervisor
*/
sched/idle, PPC: Remove redundant cpuidle_idle_call() The core idle loop now takes care of it. However a few things need checking: - Invocation of cpuidle_idle_call() in pseries_lpar_idle() happened through arch_cpu_idle() and was therefore always preceded by a call to ppc64_runlatch_off(). To preserve this property now that cpuidle_idle_call() is invoked directly from core code, a call to ppc64_runlatch_off() has been added to idle_loop_prolog() in platforms/pseries/processor_idle.c. - Similarly, cpuidle_idle_call() was followed by ppc64_runlatch_off() so a call to the later has been added to idle_loop_epilog(). - And since arch_cpu_idle() always made sure to re-enable IRQs if they were not enabled, this is now done in idle_loop_epilog() as well. The above was made in order to keep the execution flow close to the original. I don't know if that was strictly necessary. Someone well aquainted with the platform details might find some room for possible optimizations. Signed-off-by: Nicolas Pitre <nico@linaro.org> Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> Cc: Daniel Lezcano <daniel.lezcano@linaro.org> Cc: linux-arm-kernel@lists.infradead.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-sh@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: Russell King <linux@arm.linux.org.uk> Cc: linaro-kernel@lists.linaro.org Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/n/tip-47o4m03citrfg9y1vxic5asb@git.kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-01-29 10:45:10 -07:00
if (!prep_irq_for_idle())
return;
sched/idle, PPC: Remove redundant cpuidle_idle_call() The core idle loop now takes care of it. However a few things need checking: - Invocation of cpuidle_idle_call() in pseries_lpar_idle() happened through arch_cpu_idle() and was therefore always preceded by a call to ppc64_runlatch_off(). To preserve this property now that cpuidle_idle_call() is invoked directly from core code, a call to ppc64_runlatch_off() has been added to idle_loop_prolog() in platforms/pseries/processor_idle.c. - Similarly, cpuidle_idle_call() was followed by ppc64_runlatch_off() so a call to the later has been added to idle_loop_epilog(). - And since arch_cpu_idle() always made sure to re-enable IRQs if they were not enabled, this is now done in idle_loop_epilog() as well. The above was made in order to keep the execution flow close to the original. I don't know if that was strictly necessary. Someone well aquainted with the platform details might find some room for possible optimizations. Signed-off-by: Nicolas Pitre <nico@linaro.org> Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> Cc: Daniel Lezcano <daniel.lezcano@linaro.org> Cc: linux-arm-kernel@lists.infradead.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-sh@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: Russell King <linux@arm.linux.org.uk> Cc: linaro-kernel@lists.linaro.org Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/n/tip-47o4m03citrfg9y1vxic5asb@git.kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-01-29 10:45:10 -07:00
/* Indicate to hypervisor that we are idle. */
get_lppaca()->idle = 1;
/*
* Yield the processor to the hypervisor. We return if
* an external interrupt occurs (which are driven prior
* to returning here) or if a prod occurs from another
* processor. When returning here, external interrupts
* are enabled.
*/
cede_processor();
get_lppaca()->idle = 0;
}
/*
* Enable relocation on during exceptions. This has partition wide scope and
* may take a while to complete, if it takes longer than one second we will
* just give up rather than wasting any more time on this - if that turns out
* to ever be a problem in practice we can move this into a kernel thread to
* finish off the process later in boot.
*/
void pseries_enable_reloc_on_exc(void)
{
long rc;
unsigned int delay, total_delay = 0;
while (1) {
rc = enable_reloc_on_exceptions();
if (!H_IS_LONG_BUSY(rc)) {
if (rc == H_P2) {
pr_info("Relocation on exceptions not"
" supported\n");
} else if (rc != H_SUCCESS) {
pr_warn("Unable to enable relocation"
" on exceptions: %ld\n", rc);
}
break;
}
delay = get_longbusy_msecs(rc);
total_delay += delay;
if (total_delay > 1000) {
pr_warn("Warning: Giving up waiting to enable "
"relocation on exceptions (%u msec)!\n",
total_delay);
return;
}
mdelay(delay);
}
}
EXPORT_SYMBOL(pseries_enable_reloc_on_exc);
void pseries_disable_reloc_on_exc(void)
{
long rc;
while (1) {
rc = disable_reloc_on_exceptions();
if (!H_IS_LONG_BUSY(rc))
break;
mdelay(get_longbusy_msecs(rc));
}
if (rc != H_SUCCESS)
pr_warn("Warning: Failed to disable relocation on exceptions: %ld\n",
rc);
}
EXPORT_SYMBOL(pseries_disable_reloc_on_exc);
#ifdef CONFIG_KEXEC_CORE
static void pSeries_machine_kexec(struct kimage *image)
{
if (firmware_has_feature(FW_FEATURE_SET_MODE))
pseries_disable_reloc_on_exc();
default_machine_kexec(image);
}
#endif
#ifdef __LITTLE_ENDIAN__
void pseries_big_endian_exceptions(void)
{
long rc;
while (1) {
rc = enable_big_endian_exceptions();
if (!H_IS_LONG_BUSY(rc))
break;
mdelay(get_longbusy_msecs(rc));
}
/*
* At this point it is unlikely panic() will get anything
* out to the user, since this is called very late in kexec
* but at least this will stop us from continuing on further
* and creating an even more difficult to debug situation.
*
* There is a known problem when kdump'ing, if cpus are offline
* the above call will fail. Rather than panicking again, keep
* going and hope the kdump kernel is also little endian, which
* it usually is.
*/
if (rc && !kdump_in_progress())
panic("Could not enable big endian exceptions");
}
void pseries_little_endian_exceptions(void)
{
long rc;
while (1) {
rc = enable_little_endian_exceptions();
if (!H_IS_LONG_BUSY(rc))
break;
mdelay(get_longbusy_msecs(rc));
}
if (rc) {
ppc_md.progress("H_SET_MODE LE exception fail", 0);
panic("Could not enable little endian exceptions");
}
}
#endif
static void __init find_and_init_phbs(void)
{
struct device_node *node;
struct pci_controller *phb;
struct device_node *root = of_find_node_by_path("/");
for_each_child_of_node(root, node) {
if (!of_node_is_type(node, "pci") &&
!of_node_is_type(node, "pciex"))
continue;
phb = pcibios_alloc_controller(node);
if (!phb)
continue;
rtas_setup_phb(phb);
pci_process_bridge_OF_ranges(phb, node, 0);
isa_bridge_find_early(phb);
phb->controller_ops = pseries_pci_controller_ops;
}
of_node_put(root);
/*
* PCI_PROBE_ONLY and PCI_REASSIGN_ALL_BUS can be set via properties
* in chosen.
*/
of_pci_check_probe_only();
}
static void init_cpu_char_feature_flags(struct h_cpu_char_result *result)
{
/*
* The features below are disabled by default, so we instead look to see
* if firmware has *enabled* them, and set them if so.
*/
if (result->character & H_CPU_CHAR_SPEC_BAR_ORI31)
security_ftr_set(SEC_FTR_SPEC_BAR_ORI31);
if (result->character & H_CPU_CHAR_BCCTRL_SERIALISED)
security_ftr_set(SEC_FTR_BCCTRL_SERIALISED);
if (result->character & H_CPU_CHAR_L1D_FLUSH_ORI30)
security_ftr_set(SEC_FTR_L1D_FLUSH_ORI30);
if (result->character & H_CPU_CHAR_L1D_FLUSH_TRIG2)
security_ftr_set(SEC_FTR_L1D_FLUSH_TRIG2);
if (result->character & H_CPU_CHAR_L1D_THREAD_PRIV)
security_ftr_set(SEC_FTR_L1D_THREAD_PRIV);
if (result->character & H_CPU_CHAR_COUNT_CACHE_DISABLED)
security_ftr_set(SEC_FTR_COUNT_CACHE_DISABLED);
if (result->character & H_CPU_CHAR_BCCTR_FLUSH_ASSIST)
security_ftr_set(SEC_FTR_BCCTR_FLUSH_ASSIST);
if (result->behaviour & H_CPU_BEHAV_FLUSH_COUNT_CACHE)
security_ftr_set(SEC_FTR_FLUSH_COUNT_CACHE);
/*
* The features below are enabled by default, so we instead look to see
* if firmware has *disabled* them, and clear them if so.
*/
if (!(result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY))
security_ftr_clear(SEC_FTR_FAVOUR_SECURITY);
if (!(result->behaviour & H_CPU_BEHAV_L1D_FLUSH_PR))
security_ftr_clear(SEC_FTR_L1D_FLUSH_PR);
if (!(result->behaviour & H_CPU_BEHAV_BNDS_CHK_SPEC_BAR))
security_ftr_clear(SEC_FTR_BNDS_CHK_SPEC_BAR);
}
void pseries_setup_rfi_flush(void)
{
struct h_cpu_char_result result;
enum l1d_flush_type types;
bool enable;
long rc;
/*
* Set features to the defaults assumed by init_cpu_char_feature_flags()
* so it can set/clear again any features that might have changed after
* migration, and in case the hypercall fails and it is not even called.
*/
powerpc_security_features = SEC_FTR_DEFAULT;
rc = plpar_get_cpu_characteristics(&result);
if (rc == H_SUCCESS)
init_cpu_char_feature_flags(&result);
/*
* We're the guest so this doesn't apply to us, clear it to simplify
* handling of it elsewhere.
*/
security_ftr_clear(SEC_FTR_L1D_FLUSH_HV);
types = L1D_FLUSH_FALLBACK;
if (security_ftr_enabled(SEC_FTR_L1D_FLUSH_TRIG2))
types |= L1D_FLUSH_MTTRIG;
if (security_ftr_enabled(SEC_FTR_L1D_FLUSH_ORI30))
types |= L1D_FLUSH_ORI;
enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && \
security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR);
setup_rfi_flush(types, enable);
setup_count_cache_flush();
2020-11-19 16:35:13 -07:00
enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
security_ftr_enabled(SEC_FTR_L1D_FLUSH_ENTRY);
setup_entry_flush(enable);
enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
security_ftr_enabled(SEC_FTR_L1D_FLUSH_UACCESS);
setup_uaccess_flush(enable);
}
#ifdef CONFIG_PCI_IOV
enum rtas_iov_fw_value_map {
NUM_RES_PROPERTY = 0, /* Number of Resources */
LOW_INT = 1, /* Lowest 32 bits of Address */
START_OF_ENTRIES = 2, /* Always start of entry */
APERTURE_PROPERTY = 2, /* Start of entry+ to Aperture Size */
WDW_SIZE_PROPERTY = 4, /* Start of entry+ to Window Size */
NEXT_ENTRY = 7 /* Go to next entry on array */
};
enum get_iov_fw_value_index {
BAR_ADDRS = 1, /* Get Bar Address */
APERTURE_SIZE = 2, /* Get Aperture Size */
WDW_SIZE = 3 /* Get Window Size */
};
resource_size_t pseries_get_iov_fw_value(struct pci_dev *dev, int resno,
enum get_iov_fw_value_index value)
{
const int *indexes;
struct device_node *dn = pci_device_to_OF_node(dev);
int i, num_res, ret = 0;
indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
if (!indexes)
return 0;
/*
* First element in the array is the number of Bars
* returned. Search through the list to find the matching
* bar
*/
num_res = of_read_number(&indexes[NUM_RES_PROPERTY], 1);
if (resno >= num_res)
return 0; /* or an errror */
i = START_OF_ENTRIES + NEXT_ENTRY * resno;
switch (value) {
case BAR_ADDRS:
ret = of_read_number(&indexes[i], 2);
break;
case APERTURE_SIZE:
ret = of_read_number(&indexes[i + APERTURE_PROPERTY], 2);
break;
case WDW_SIZE:
ret = of_read_number(&indexes[i + WDW_SIZE_PROPERTY], 2);
break;
}
return ret;
}
void of_pci_set_vf_bar_size(struct pci_dev *dev, const int *indexes)
{
struct resource *res;
resource_size_t base, size;
int i, r, num_res;
num_res = of_read_number(&indexes[NUM_RES_PROPERTY], 1);
num_res = min_t(int, num_res, PCI_SRIOV_NUM_BARS);
for (i = START_OF_ENTRIES, r = 0; r < num_res && r < PCI_SRIOV_NUM_BARS;
i += NEXT_ENTRY, r++) {
res = &dev->resource[r + PCI_IOV_RESOURCES];
base = of_read_number(&indexes[i], 2);
size = of_read_number(&indexes[i + APERTURE_PROPERTY], 2);
res->flags = pci_parse_of_flags(of_read_number
(&indexes[i + LOW_INT], 1), 0);
res->flags |= (IORESOURCE_MEM_64 | IORESOURCE_PCI_FIXED);
res->name = pci_name(dev);
res->start = base;
res->end = base + size - 1;
}
}
void of_pci_parse_iov_addrs(struct pci_dev *dev, const int *indexes)
{
struct resource *res, *root, *conflict;
resource_size_t base, size;
int i, r, num_res;
/*
* First element in the array is the number of Bars
* returned. Search through the list to find the matching
* bars assign them from firmware into resources structure.
*/
num_res = of_read_number(&indexes[NUM_RES_PROPERTY], 1);
for (i = START_OF_ENTRIES, r = 0; r < num_res && r < PCI_SRIOV_NUM_BARS;
i += NEXT_ENTRY, r++) {
res = &dev->resource[r + PCI_IOV_RESOURCES];
base = of_read_number(&indexes[i], 2);
size = of_read_number(&indexes[i + WDW_SIZE_PROPERTY], 2);
res->name = pci_name(dev);
res->start = base;
res->end = base + size - 1;
root = &iomem_resource;
dev_dbg(&dev->dev,
"pSeries IOV BAR %d: trying firmware assignment %pR\n",
r + PCI_IOV_RESOURCES, res);
conflict = request_resource_conflict(root, res);
if (conflict) {
dev_info(&dev->dev,
"BAR %d: %pR conflicts with %s %pR\n",
r + PCI_IOV_RESOURCES, res,
conflict->name, conflict);
res->flags |= IORESOURCE_UNSET;
}
}
}
static void pseries_disable_sriov_resources(struct pci_dev *pdev)
{
int i;
pci_warn(pdev, "No hypervisor support for SR-IOV on this device, IOV BARs disabled.\n");
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
pdev->resource[i + PCI_IOV_RESOURCES].flags = 0;
}
static void pseries_pci_fixup_resources(struct pci_dev *pdev)
{
const int *indexes;
struct device_node *dn = pci_device_to_OF_node(pdev);
/*Firmware must support open sriov otherwise dont configure*/
indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
if (indexes)
of_pci_set_vf_bar_size(pdev, indexes);
else
pseries_disable_sriov_resources(pdev);
}
static void pseries_pci_fixup_iov_resources(struct pci_dev *pdev)
{
const int *indexes;
struct device_node *dn = pci_device_to_OF_node(pdev);
if (!pdev->is_physfn || pci_dev_is_added(pdev))
return;
/*Firmware must support open sriov otherwise dont configure*/
indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
if (indexes)
of_pci_parse_iov_addrs(pdev, indexes);
else
pseries_disable_sriov_resources(pdev);
}
static resource_size_t pseries_pci_iov_resource_alignment(struct pci_dev *pdev,
int resno)
{
const __be32 *reg;
struct device_node *dn = pci_device_to_OF_node(pdev);
/*Firmware must support open sriov otherwise report regular alignment*/
reg = of_get_property(dn, "ibm,is-open-sriov-pf", NULL);
if (!reg)
return pci_iov_resource_size(pdev, resno);
if (!pdev->is_physfn)
return 0;
return pseries_get_iov_fw_value(pdev,
resno - PCI_IOV_RESOURCES,
APERTURE_SIZE);
}
#endif
2006-07-03 05:36:01 -06:00
static void __init pSeries_setup_arch(void)
{
set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
2006-07-03 05:36:01 -06:00
/* Discover PIC type and setup ppc_md accordingly */
smp_init_pseries();
2006-07-03 05:36:01 -06:00
/* openpic global configuration register (64-bit format). */
/* openpic Interrupt Source Unit pointer (64-bit format). */
/* python0 facility area (mmio) (64-bit format) REAL address. */
/* init to some ~sane value until calibrate_delay() runs */
loops_per_jiffy = 50000000;
fwnmi_init();
pseries_setup_rfi_flush();
setup_stf_barrier();
pseries_lpar_read_hblkrm_characteristics();
/* By default, only probe PCI (can be overridden by rtas_pci) */
pci_add_flags(PCI_PROBE_ONLY);
/* Find and initialize PCI host bridges */
init_pci_config_tokens();
find_and_init_phbs();
of_reconfig_notifier_register(&pci_dn_reconfig_nb);
pSeries_nvram_init();
if (firmware_has_feature(FW_FEATURE_LPAR)) {
vpa_init(boot_cpuid);
powerpc/vcpu: Assume dedicated processors as non-preempt commit 14c73bd344da60abaf7da3ea2e7733ddda35bbac upstream. With commit 247f2f6f3c70 ("sched/core: Don't schedule threads on pre-empted vCPUs"), the scheduler avoids preempted vCPUs to schedule tasks on wakeup. This leads to wrong choice of CPU, which in-turn leads to larger wakeup latencies. Eventually, it leads to performance regression in latency sensitive benchmarks like soltp, schbench etc. On Powerpc, vcpu_is_preempted() only looks at yield_count. If the yield_count is odd, the vCPU is assumed to be preempted. However yield_count is increased whenever the LPAR enters CEDE state (idle). So any CPU that has entered CEDE state is assumed to be preempted. Even if vCPU of dedicated LPAR is preempted/donated, it should have right of first-use since they are supposed to own the vCPU. On a Power9 System with 32 cores: # lscpu Architecture: ppc64le Byte Order: Little Endian CPU(s): 128 On-line CPU(s) list: 0-127 Thread(s) per core: 8 Core(s) per socket: 1 Socket(s): 16 NUMA node(s): 2 Model: 2.2 (pvr 004e 0202) Model name: POWER9 (architected), altivec supported Hypervisor vendor: pHyp Virtualization type: para L1d cache: 32K L1i cache: 32K L2 cache: 512K L3 cache: 10240K NUMA node0 CPU(s): 0-63 NUMA node1 CPU(s): 64-127 # perf stat -a -r 5 ./schbench v5.4 v5.4 + patch Latency percentiles (usec) Latency percentiles (usec) 50.0000th: 45 50.0th: 45 75.0000th: 62 75.0th: 63 90.0000th: 71 90.0th: 74 95.0000th: 77 95.0th: 78 *99.0000th: 91 *99.0th: 82 99.5000th: 707 99.5th: 83 99.9000th: 6920 99.9th: 86 min=0, max=10048 min=0, max=96 Latency percentiles (usec) Latency percentiles (usec) 50.0000th: 45 50.0th: 46 75.0000th: 61 75.0th: 64 90.0000th: 72 90.0th: 75 95.0000th: 79 95.0th: 79 *99.0000th: 691 *99.0th: 83 99.5000th: 3972 99.5th: 85 99.9000th: 8368 99.9th: 91 min=0, max=16606 min=0, max=117 Latency percentiles (usec) Latency percentiles (usec) 50.0000th: 45 50.0th: 46 75.0000th: 61 75.0th: 64 90.0000th: 71 90.0th: 75 95.0000th: 77 95.0th: 79 *99.0000th: 106 *99.0th: 83 99.5000th: 2364 99.5th: 84 99.9000th: 7480 99.9th: 90 min=0, max=10001 min=0, max=95 Latency percentiles (usec) Latency percentiles (usec) 50.0000th: 45 50.0th: 47 75.0000th: 62 75.0th: 65 90.0000th: 72 90.0th: 75 95.0000th: 78 95.0th: 79 *99.0000th: 93 *99.0th: 84 99.5000th: 108 99.5th: 85 99.9000th: 6792 99.9th: 90 min=0, max=17681 min=0, max=117 Latency percentiles (usec) Latency percentiles (usec) 50.0000th: 46 50.0th: 45 75.0000th: 62 75.0th: 64 90.0000th: 73 90.0th: 75 95.0000th: 79 95.0th: 79 *99.0000th: 113 *99.0th: 82 99.5000th: 2724 99.5th: 83 99.9000th: 6184 99.9th: 93 min=0, max=9887 min=0, max=111 Performance counter stats for 'system wide' (5 runs): context-switches 43,373 ( +- 0.40% ) 44,597 ( +- 0.55% ) cpu-migrations 1,211 ( +- 5.04% ) 220 ( +- 6.23% ) page-faults 15,983 ( +- 5.21% ) 15,360 ( +- 3.38% ) Waiman Long suggested using static_keys. Fixes: 247f2f6f3c70 ("sched/core: Don't schedule threads on pre-empted vCPUs") Cc: stable@vger.kernel.org # v4.18+ Reported-by: Parth Shah <parth@linux.ibm.com> Reported-by: Ihor Pasichnyk <Ihor.Pasichnyk@ibm.com> Tested-by: Juri Lelli <juri.lelli@redhat.com> Acked-by: Waiman Long <longman@redhat.com> Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Acked-by: Phil Auld <pauld@redhat.com> Reviewed-by: Vaidyanathan Srinivasan <svaidy@linux.ibm.com> Tested-by: Parth Shah <parth@linux.ibm.com> [mpe: Move the key and setting of the key to pseries/setup.c] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20191213035036.6913-1-mpe@ellerman.id.au Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-12-05 01:32:17 -07:00
if (lppaca_shared_proc(get_lppaca()))
static_branch_enable(&shared_processor);
ppc_md.power_save = pseries_lpar_idle;
ppc_md.enable_pmcs = pseries_lpar_enable_pmcs;
#ifdef CONFIG_PCI_IOV
ppc_md.pcibios_fixup_resources =
pseries_pci_fixup_resources;
ppc_md.pcibios_fixup_sriov =
pseries_pci_fixup_iov_resources;
ppc_md.pcibios_iov_resource_alignment =
pseries_pci_iov_resource_alignment;
#endif
} else {
/* No special idle routine */
ppc_md.enable_pmcs = power4_enable_pmcs;
}
ppc_md.pcibios_root_bridge_prepare = pseries_root_bridge_prepare;
if (swiotlb_force == SWIOTLB_FORCE)
ppc_swiotlb_enable = 1;
}
static void pseries_panic(char *str)
{
panic_flush_kmsg_end();
rtas_os_term(str);
}
static int __init pSeries_init_panel(void)
{
/* Manually leave the kernel version on the panel. */
#ifdef __BIG_ENDIAN__
ppc_md.progress("Linux ppc64\n", 0);
#else
ppc_md.progress("Linux ppc64le\n", 0);
#endif
ppc_md.progress(init_utsname()->version, 0);
return 0;
}
machine_arch_initcall(pseries, pSeries_init_panel);
static int pseries_set_dabr(unsigned long dabr, unsigned long dabrx)
{
return plpar_hcall_norets(H_SET_DABR, dabr);
}
static int pseries_set_xdabr(unsigned long dabr, unsigned long dabrx)
{
/* Have to set at least one bit in the DABRX according to PAPR */
if (dabrx == 0 && dabr == 0)
dabrx = DABRX_USER;
/* PAPR says we can only set kernel and user bits */
dabrx &= DABRX_KERNEL | DABRX_USER;
return plpar_hcall_norets(H_SET_XDABR, dabr, dabrx);
}
static int pseries_set_dawr(unsigned long dawr, unsigned long dawrx)
{
/* PAPR says we can't set HYP */
dawrx &= ~DAWRX_HYP;
return plpar_set_watchpoint0(dawr, dawrx);
}
#define CMO_CHARACTERISTICS_TOKEN 44
#define CMO_MAXLENGTH 1026
void pSeries_coalesce_init(void)
{
struct hvcall_mpp_x_data mpp_x_data;
if (firmware_has_feature(FW_FEATURE_CMO) && !h_get_mpp_x(&mpp_x_data))
powerpc_firmware_features |= FW_FEATURE_XCMO;
else
powerpc_firmware_features &= ~FW_FEATURE_XCMO;
}
/**
* fw_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions,
* handle that here. (Stolen from parse_system_parameter_string)
*/
static void pSeries_cmo_feature_init(void)
{
char *ptr, *key, *value, *end;
int call_status;
int page_order = IOMMU_PAGE_SHIFT_4K;
pr_debug(" -> fw_cmo_feature_init()\n");
spin_lock(&rtas_data_buf_lock);
memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
NULL,
CMO_CHARACTERISTICS_TOKEN,
__pa(rtas_data_buf),
RTAS_DATA_BUF_SIZE);
if (call_status != 0) {
spin_unlock(&rtas_data_buf_lock);
pr_debug("CMO not available\n");
pr_debug(" <- fw_cmo_feature_init()\n");
return;
}
end = rtas_data_buf + CMO_MAXLENGTH - 2;
ptr = rtas_data_buf + 2; /* step over strlen value */
key = value = ptr;
while (*ptr && (ptr <= end)) {
/* Separate the key and value by replacing '=' with '\0' and
* point the value at the string after the '='
*/
if (ptr[0] == '=') {
ptr[0] = '\0';
value = ptr + 1;
} else if (ptr[0] == '\0' || ptr[0] == ',') {
/* Terminate the string containing the key/value pair */
ptr[0] = '\0';
if (key == value) {
pr_debug("Malformed key/value pair\n");
/* Never found a '=', end processing */
break;
}
if (0 == strcmp(key, "CMOPageSize"))
page_order = simple_strtol(value, NULL, 10);
else if (0 == strcmp(key, "PrPSP"))
CMO_PrPSP = simple_strtol(value, NULL, 10);
else if (0 == strcmp(key, "SecPSP"))
CMO_SecPSP = simple_strtol(value, NULL, 10);
value = key = ptr + 1;
}
ptr++;
}
/* Page size is returned as the power of 2 of the page size,
* convert to the page size in bytes before returning
*/
CMO_PageSize = 1 << page_order;
pr_debug("CMO_PageSize = %lu\n", CMO_PageSize);
if (CMO_PrPSP != -1 || CMO_SecPSP != -1) {
pr_info("CMO enabled\n");
pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP,
CMO_SecPSP);
powerpc_firmware_features |= FW_FEATURE_CMO;
pSeries_coalesce_init();
} else
pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP,
CMO_SecPSP);
spin_unlock(&rtas_data_buf_lock);
pr_debug(" <- fw_cmo_feature_init()\n");
}
/*
* Early initialization. Relocation is on but do not reference unbolted pages
*/
static void __init pseries_init(void)
{
pr_debug(" -> pseries_init()\n");
powerpc/pseries: Re-implement HVSI as part of hvc_vio On pseries machines, consoles are provided by the hypervisor using a low level get_chars/put_chars type interface. However, this is really just a transport to the service processor which implements them either as "raw" console (networked consoles, HMC, ...) or as "hvsi" serial ports. The later is a simple packet protocol on top of the raw character interface that is supposed to convey additional "serial port" style semantics. In practice however, all it does is provide a way to read the CD line and set/clear our DTR line, that's it. We currently implement the "raw" protocol as an hvc console backend (/dev/hvcN) and the "hvsi" protocol using a separate tty driver (/dev/hvsi0). However this is quite impractical. The arbitrary difference between the two type of devices has been a major source of user (and distro) confusion. Additionally, there's an additional mini -hvsi implementation in the pseries platform code for our low level debug console and early boot kernel messages, which means code duplication, though that low level variant is impractical as it's incapable of doing the initial protocol negociation to establish the link to the FSP. This essentially replaces the dedicated hvsi driver and the platform udbg code completely by extending the existing hvc_vio backend used in "raw" mode so that: - It now supports HVSI as well - We add support for hvc backend providing tiocm{get,set} - It also provides a udbg interface for early debug and boot console This is overall less code, though this will only be obvious once we remove the old "hvsi" driver, which is still available for now. When the old driver is enabled, the new code still kicks in for the low level udbg console, replacing the old mini implementation in the platform code, it just doesn't provide the higher level "hvc" interface. In addition to producing generally simler code, this has several benefits over our current situation: - The user/distro only has to deal with /dev/hvcN for the hypervisor console, avoiding all sort of confusion that has plagued us in the past - The tty, kernel and low level debug console all use the same code base which supports the full protocol establishment process, thus the console is now available much earlier than it used to be with the old HVSI driver. The kernel console works much earlier and udbg is available much earlier too. Hackers can enable a hard coded very-early debug console as well that works with HVSI (previously that was only supported for the "raw" mode). I've tried to keep the same semantics as hvsi relative to how I react to things like CD changes, with some subtle differences though: - I clear DTR on close if HUPCL is set - Current hvsi triggers a hangup if it detects a up->down transition on CD (you can still open a console with CD down). My new implementation triggers a hangup if the link to the FSP is severed, and severs it upon detecting a up->down transition on CD. Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2011-05-11 21:46:38 -06:00
#ifdef CONFIG_HVC_CONSOLE
if (firmware_has_feature(FW_FEATURE_LPAR))
powerpc/pseries: Re-implement HVSI as part of hvc_vio On pseries machines, consoles are provided by the hypervisor using a low level get_chars/put_chars type interface. However, this is really just a transport to the service processor which implements them either as "raw" console (networked consoles, HMC, ...) or as "hvsi" serial ports. The later is a simple packet protocol on top of the raw character interface that is supposed to convey additional "serial port" style semantics. In practice however, all it does is provide a way to read the CD line and set/clear our DTR line, that's it. We currently implement the "raw" protocol as an hvc console backend (/dev/hvcN) and the "hvsi" protocol using a separate tty driver (/dev/hvsi0). However this is quite impractical. The arbitrary difference between the two type of devices has been a major source of user (and distro) confusion. Additionally, there's an additional mini -hvsi implementation in the pseries platform code for our low level debug console and early boot kernel messages, which means code duplication, though that low level variant is impractical as it's incapable of doing the initial protocol negociation to establish the link to the FSP. This essentially replaces the dedicated hvsi driver and the platform udbg code completely by extending the existing hvc_vio backend used in "raw" mode so that: - It now supports HVSI as well - We add support for hvc backend providing tiocm{get,set} - It also provides a udbg interface for early debug and boot console This is overall less code, though this will only be obvious once we remove the old "hvsi" driver, which is still available for now. When the old driver is enabled, the new code still kicks in for the low level udbg console, replacing the old mini implementation in the platform code, it just doesn't provide the higher level "hvc" interface. In addition to producing generally simler code, this has several benefits over our current situation: - The user/distro only has to deal with /dev/hvcN for the hypervisor console, avoiding all sort of confusion that has plagued us in the past - The tty, kernel and low level debug console all use the same code base which supports the full protocol establishment process, thus the console is now available much earlier than it used to be with the old HVSI driver. The kernel console works much earlier and udbg is available much earlier too. Hackers can enable a hard coded very-early debug console as well that works with HVSI (previously that was only supported for the "raw" mode). I've tried to keep the same semantics as hvsi relative to how I react to things like CD changes, with some subtle differences though: - I clear DTR on close if HUPCL is set - Current hvsi triggers a hangup if it detects a up->down transition on CD (you can still open a console with CD down). My new implementation triggers a hangup if the link to the FSP is severed, and severs it upon detecting a up->down transition on CD. Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2011-05-11 21:46:38 -06:00
hvc_vio_init_early();
#endif
if (firmware_has_feature(FW_FEATURE_XDABR))
ppc_md.set_dabr = pseries_set_xdabr;
else if (firmware_has_feature(FW_FEATURE_DABR))
ppc_md.set_dabr = pseries_set_dabr;
if (firmware_has_feature(FW_FEATURE_SET_MODE))
ppc_md.set_dawr = pseries_set_dawr;
pSeries_cmo_feature_init();
iommu_init_early_pSeries();
pr_debug(" <- pseries_init()\n");
}
/**
* pseries_power_off - tell firmware about how to power off the system.
*
* This function calls either the power-off rtas token in normal cases
* or the ibm,power-off-ups token (if present & requested) in case of
* a power failure. If power-off token is used, power on will only be
* possible with power button press. If ibm,power-off-ups token is used
* it will allow auto poweron after power is restored.
*/
static void pseries_power_off(void)
{
int rc;
int rtas_poweroff_ups_token = rtas_token("ibm,power-off-ups");
if (rtas_flash_term_hook)
rtas_flash_term_hook(SYS_POWER_OFF);
if (rtas_poweron_auto == 0 ||
rtas_poweroff_ups_token == RTAS_UNKNOWN_SERVICE) {
rc = rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1);
printk(KERN_INFO "RTAS power-off returned %d\n", rc);
} else {
rc = rtas_call(rtas_poweroff_ups_token, 0, 1, NULL);
printk(KERN_INFO "RTAS ibm,power-off-ups returned %d\n", rc);
}
for (;;);
}
static int __init pSeries_probe(void)
{
if (!of_node_is_type(of_root, "chrp"))
return 0;
/* Cell blades firmware claims to be chrp while it's not. Until this
* is fixed, we need to avoid those here.
*/
if (of_machine_is_compatible("IBM,CPBW-1.0") ||
of_machine_is_compatible("IBM,CBEA"))
return 0;
pm_power_off = pseries_power_off;
pr_debug("Machine is%s LPAR !\n",
(powerpc_firmware_features & FW_FEATURE_LPAR) ? "" : " not");
pseries_init();
return 1;
}
ppc64: Set up PCI tree from Open Firmware device tree This adds code which gives us the option on ppc64 of instantiating the PCI tree (the tree of pci_bus and pci_dev structs) from the Open Firmware device tree rather than by probing PCI configuration space. The OF device tree has a node for each PCI device and bridge in the system, with properties that tell us what addresses the firmware has configured for them and other details. There are a couple of reasons why this is needed. First, on systems with a hypervisor, there is a PCI-PCI bridge per slot under the PCI host bridges. These PCI-PCI bridges have special isolation features for virtualization. We can't write to their config space, and we are not supposed to be reading their config space either. The firmware tells us about the address ranges that they pass in the OF device tree. Secondly, on powermacs, the interrupt controller is in a PCI device that may be behind a PCI-PCI bridge. If we happened to take an interrupt just at the point when the device or a bridge on the path to it was disabled for probing, we would crash when we try to access the interrupt controller. I have implemented a platform-specific function which is called for each PCI bridge (host or PCI-PCI) to say whether the code should look in the device tree or use normal PCI probing for the devices under that bridge. On pSeries machines we use the device tree if we're running under a hypervisor, otherwise we use normal probing. On powermacs we use normal probing for the AGP bridge, since the device for the AGP bridge itself isn't shown in the device tree (at least on my G5), and the device tree for everything else. This has been tested on a dual G5 powermac, a partition on a POWER5 machine (running under the hypervisor), and a legacy iSeries partition. Signed-off-by: Paul Mackerras <paulus@samba.org>
2005-09-12 01:17:36 -06:00
static int pSeries_pci_probe_mode(struct pci_bus *bus)
{
if (firmware_has_feature(FW_FEATURE_LPAR))
ppc64: Set up PCI tree from Open Firmware device tree This adds code which gives us the option on ppc64 of instantiating the PCI tree (the tree of pci_bus and pci_dev structs) from the Open Firmware device tree rather than by probing PCI configuration space. The OF device tree has a node for each PCI device and bridge in the system, with properties that tell us what addresses the firmware has configured for them and other details. There are a couple of reasons why this is needed. First, on systems with a hypervisor, there is a PCI-PCI bridge per slot under the PCI host bridges. These PCI-PCI bridges have special isolation features for virtualization. We can't write to their config space, and we are not supposed to be reading their config space either. The firmware tells us about the address ranges that they pass in the OF device tree. Secondly, on powermacs, the interrupt controller is in a PCI device that may be behind a PCI-PCI bridge. If we happened to take an interrupt just at the point when the device or a bridge on the path to it was disabled for probing, we would crash when we try to access the interrupt controller. I have implemented a platform-specific function which is called for each PCI bridge (host or PCI-PCI) to say whether the code should look in the device tree or use normal PCI probing for the devices under that bridge. On pSeries machines we use the device tree if we're running under a hypervisor, otherwise we use normal probing. On powermacs we use normal probing for the AGP bridge, since the device for the AGP bridge itself isn't shown in the device tree (at least on my G5), and the device tree for everything else. This has been tested on a dual G5 powermac, a partition on a POWER5 machine (running under the hypervisor), and a legacy iSeries partition. Signed-off-by: Paul Mackerras <paulus@samba.org>
2005-09-12 01:17:36 -06:00
return PCI_PROBE_DEVTREE;
return PCI_PROBE_NORMAL;
}
struct pci_controller_ops pseries_pci_controller_ops = {
.probe_mode = pSeries_pci_probe_mode,
};
define_machine(pseries) {
.name = "pSeries",
.probe = pSeries_probe,
.setup_arch = pSeries_setup_arch,
.init_IRQ = pseries_init_irq,
.show_cpuinfo = pSeries_show_cpuinfo,
.log_error = pSeries_log_error,
.pcibios_fixup = pSeries_final_fixup,
.restart = rtas_restart,
.halt = rtas_halt,
.panic = pseries_panic,
.get_boot_time = rtas_get_boot_time,
.get_rtc_time = rtas_get_rtc_time,
.set_rtc_time = rtas_set_rtc_time,
.calibrate_decr = generic_calibrate_decr,
.progress = rtas_progress,
.system_reset_exception = pSeries_system_reset_exception,
.machine_check_early = pseries_machine_check_realmode,
.machine_check_exception = pSeries_machine_check_exception,
#ifdef CONFIG_KEXEC_CORE
.machine_kexec = pSeries_machine_kexec,
.kexec_cpu_down = pseries_kexec_cpu_down,
#endif
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
.memory_block_size = pseries_memory_block_size,
#endif
};